about summary refs log tree commit diff stats
path: root/arch
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-05-05 17:55:20 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-05-05 17:55:20 -0400
commit ea62ccd00fd0b6720b033adfc9984f31130ce195 (patch)
tree 9837b797b2466fffcb0af96c388b06eae9c3df18 /arch
parent 886a0768affe9a32f18c45f8e1393bca9ece5392 (diff)
parent 35060b6a9a4e1c89bc6fbea61090e302dbc61847 (diff)
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (231 commits)
  [PATCH] i386: Don't delete cpu_devs data to identify different x86 types in late_initcall
  [PATCH] i386: type may be unused
  [PATCH] i386: Some additional chipset register values validation.
  [PATCH] i386: Add missing !X86_PAE dependincy to the 2G/2G split.
  [PATCH] x86-64: Don't exclude asm-offsets.c in Documentation/dontdiff
  [PATCH] i386: avoid redundant preempt_disable in __unlazy_fpu
  [PATCH] i386: white space fixes in i387.h
  [PATCH] i386: Drop noisy e820 debugging printks
  [PATCH] x86-64: Fix allnoconfig error in genapic_flat.c
  [PATCH] x86-64: Shut up warnings for vfat compat ioctls on other file systems
  [PATCH] x86-64: Share identical video.S between i386 and x86-64
  [PATCH] x86-64: Remove CONFIG_REORDER
  [PATCH] x86-64: Print type and size correctly for unknown compat ioctls
  [PATCH] i386: Remove copy_*_user BUG_ONs for (size < 0)
  [PATCH] i386: Little cleanups in smpboot.c
  [PATCH] x86-64: Don't enable NUMA for a single node in K8 NUMA scanning
  [PATCH] x86: Use RDTSCP for synchronous get_cycles if possible
  [PATCH] i386: Add X86_FEATURE_RDTSCP
  [PATCH] i386: Implement X86_FEATURE_SYNC_RDTSC on i386
  [PATCH] i386: Implement alternative_io for i386
  ...

Fix up trivial conflict in include/linux/highmem.h manually.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/alpha/boot/misc.c2
-rw-r--r--arch/alpha/kernel/vmlinux.lds.S2
-rw-r--r--arch/arm/boot/compressed/misc.c2
-rw-r--r--arch/arm/kernel/vmlinux.lds.S2
-rw-r--r--arch/arm26/boot/compressed/misc.c2
-rw-r--r--arch/cris/arch-v32/vmlinux.lds.S1
-rw-r--r--arch/frv/kernel/vmlinux.lds.S1
-rw-r--r--arch/i386/Kconfig38
-rw-r--r--arch/i386/Kconfig.cpu35
-rw-r--r--arch/i386/Kconfig.debug10
-rw-r--r--arch/i386/Makefile2
-rw-r--r--arch/i386/Makefile.cpu9
-rw-r--r--arch/i386/boot/Makefile4
-rw-r--r--arch/i386/boot/compressed/misc.c2
-rw-r--r--arch/i386/boot/setup.S24
-rw-r--r--arch/i386/defconfig73
-rw-r--r--arch/i386/kernel/Makefile4
-rw-r--r--arch/i386/kernel/acpi/boot.c2
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c21
-rw-r--r--arch/i386/kernel/alternative.c102
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c10
-rw-r--r--arch/i386/kernel/asm-offsets.c18
-rw-r--r--arch/i386/kernel/cpu/Makefile4
-rw-r--r--arch/i386/kernel/cpu/amd.c15
-rw-r--r--arch/i386/kernel/cpu/bugs.c191
-rw-r--r--arch/i386/kernel/cpu/centaur.c10
-rw-r--r--arch/i386/kernel/cpu/common.c217
-rw-r--r--arch/i386/kernel/cpu/cyrix.c21
-rw-r--r--arch/i386/kernel/cpu/intel.c4
-rw-r--r--arch/i386/kernel/cpu/mcheck/k7.c13
-rw-r--r--arch/i386/kernel/cpu/mcheck/mce.c3
-rw-r--r--arch/i386/kernel/cpu/mcheck/p4.c16
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c101
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c11
-rw-r--r--arch/i386/kernel/cpu/nexgen.c10
-rw-r--r--arch/i386/kernel/cpu/perfctr-watchdog.c658
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpu/rise.c9
-rw-r--r--arch/i386/kernel/cpu/transmeta.c10
-rw-r--r--arch/i386/kernel/cpu/umc.c10
-rw-r--r--arch/i386/kernel/doublefault.c29
-rw-r--r--arch/i386/kernel/e820.c64
-rw-r--r--arch/i386/kernel/efi.c16
-rw-r--r--arch/i386/kernel/entry.S23
-rw-r--r--arch/i386/kernel/head.S118
-rw-r--r--arch/i386/kernel/i386_ksyms.c2
-rw-r--r--arch/i386/kernel/io_apic.c38
-rw-r--r--arch/i386/kernel/ioport.c3
-rw-r--r--arch/i386/kernel/irq.c3
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/nmi.c832
-rw-r--r--arch/i386/kernel/paravirt.c522
-rw-r--r--arch/i386/kernel/process.c37
-rw-r--r--arch/i386/kernel/quirks.c69
-rw-r--r--arch/i386/kernel/reboot.c55
-rw-r--r--arch/i386/kernel/reboot_fixups.c2
-rw-r--r--arch/i386/kernel/smp.c304
-rw-r--r--arch/i386/kernel/smpboot.c146
-rw-r--r--arch/i386/kernel/sysenter.c269
-rw-r--r--arch/i386/kernel/time.c2
-rw-r--r--arch/i386/kernel/trampoline.S12
-rw-r--r--arch/i386/kernel/traps.c32
-rw-r--r--arch/i386/kernel/tsc.c13
-rw-r--r--arch/i386/kernel/verify_cpu.S65
-rw-r--r--arch/i386/kernel/vmi.c131
-rw-r--r--arch/i386/kernel/vmiclock.c318
-rw-r--r--arch/i386/kernel/vmitime.c482
-rw-r--r--arch/i386/kernel/vmlinux.lds.S24
-rw-r--r--arch/i386/kernel/vsyscall.lds.S4
-rw-r--r--arch/i386/lib/bitops.c4
-rw-r--r--arch/i386/lib/checksum.S69
-rw-r--r--arch/i386/lib/getuser.S26
-rw-r--r--arch/i386/lib/putuser.S39
-rw-r--r--arch/i386/lib/usercopy.c7
-rw-r--r--arch/i386/mach-generic/bigsmp.c2
-rw-r--r--arch/i386/mach-generic/es7000.c41
-rw-r--r--arch/i386/mach-voyager/voyager_smp.c16
-rw-r--r--arch/i386/mm/fault.c60
-rw-r--r--arch/i386/mm/highmem.c10
-rw-r--r--arch/i386/mm/init.c196
-rw-r--r--arch/i386/mm/pageattr.c6
-rw-r--r--arch/i386/mm/pgtable.c94
-rw-r--r--arch/i386/oprofile/nmi_int.c4
-rw-r--r--arch/i386/pci/init.c2
-rw-r--r--arch/i386/pci/mmconfig-shared.c25
-rw-r--r--arch/i386/power/cpu.c1
-rw-r--r--arch/i386/power/suspend.c14
-rw-r--r--arch/m32r/kernel/vmlinux.lds.S2
-rw-r--r--arch/mips/kernel/vmlinux.lds.S2
-rw-r--r--arch/parisc/kernel/vmlinux.lds.S2
-rw-r--r--arch/powerpc/kernel/Makefile1
-rw-r--r--arch/powerpc/kernel/setup_64.c4
-rw-r--r--arch/powerpc/kernel/suspend.c24
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S6
-rw-r--r--arch/ppc/kernel/vmlinux.lds.S2
-rw-r--r--arch/s390/kernel/vmlinux.lds.S2
-rw-r--r--arch/sh/kernel/vmlinux.lds.S2
-rw-r--r--arch/sh64/kernel/vmlinux.lds.S2
-rw-r--r--arch/sparc/kernel/vmlinux.lds.S2
-rw-r--r--arch/sparc64/kernel/smp.c6
-rw-r--r--arch/um/defconfig1
-rw-r--r--arch/x86_64/Kconfig61
-rw-r--r--arch/x86_64/Makefile4
-rw-r--r--arch/x86_64/boot/Makefile2
-rw-r--r--arch/x86_64/boot/compressed/Makefile12
-rw-r--r--arch/x86_64/boot/compressed/head.S339
-rw-r--r--arch/x86_64/boot/compressed/misc.c247
-rw-r--r--arch/x86_64/boot/compressed/vmlinux.lds44
-rw-r--r--arch/x86_64/boot/compressed/vmlinux.scr9
-rw-r--r--arch/x86_64/boot/setup.S85
-rw-r--r--arch/x86_64/boot/video.S2043
-rw-r--r--arch/x86_64/defconfig183
-rw-r--r--arch/x86_64/ia32/ia32_binfmt.c10
-rw-r--r--arch/x86_64/ia32/ia32entry.S4
-rw-r--r--arch/x86_64/ia32/syscall32.c1
-rw-r--r--arch/x86_64/kernel/Makefile7
-rw-r--r--arch/x86_64/kernel/acpi/sleep.c24
-rw-r--r--arch/x86_64/kernel/acpi/wakeup.S286
-rw-r--r--arch/x86_64/kernel/aperture.c5
-rw-r--r--arch/x86_64/kernel/apic.c35
-rw-r--r--arch/x86_64/kernel/asm-offsets.c10
-rw-r--r--arch/x86_64/kernel/bugs.c21
-rw-r--r--arch/x86_64/kernel/e820.c5
-rw-r--r--arch/x86_64/kernel/early-quirks.c13
-rw-r--r--arch/x86_64/kernel/early_printk.c5
-rw-r--r--arch/x86_64/kernel/entry.S5
-rw-r--r--arch/x86_64/kernel/functionlist1284
-rw-r--r--arch/x86_64/kernel/genapic.c104
-rw-r--r--arch/x86_64/kernel/genapic_cluster.c137
-rw-r--r--arch/x86_64/kernel/genapic_flat.c25
-rw-r--r--arch/x86_64/kernel/head.S340
-rw-r--r--arch/x86_64/kernel/head64.c41
-rw-r--r--arch/x86_64/kernel/io_apic.c31
-rw-r--r--arch/x86_64/kernel/ioport.c1
-rw-r--r--arch/x86_64/kernel/machine_kexec.c14
-rw-r--r--arch/x86_64/kernel/mce.c32
-rw-r--r--arch/x86_64/kernel/mpparse.c2
-rw-r--r--arch/x86_64/kernel/nmi.c678
-rw-r--r--arch/x86_64/kernel/pci-calgary.c2
-rw-r--r--arch/x86_64/kernel/pci-gart.c2
-rw-r--r--arch/x86_64/kernel/pci-nommu.c2
-rw-r--r--arch/x86_64/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86_64/kernel/process.c12
-rw-r--r--arch/x86_64/kernel/setup.c35
-rw-r--r--arch/x86_64/kernel/setup64.c5
-rw-r--r--arch/x86_64/kernel/signal.c6
-rw-r--r--arch/x86_64/kernel/smp.c30
-rw-r--r--arch/x86_64/kernel/smpboot.c47
-rw-r--r--arch/x86_64/kernel/suspend.c19
-rw-r--r--arch/x86_64/kernel/suspend_asm.S7
-rw-r--r--arch/x86_64/kernel/syscall.c1
-rw-r--r--arch/x86_64/kernel/time.c71
-rw-r--r--arch/x86_64/kernel/trampoline.S123
-rw-r--r--arch/x86_64/kernel/traps.c34
-rw-r--r--arch/x86_64/kernel/tsc.c17
-rw-r--r--arch/x86_64/kernel/tsc_sync.c4
-rw-r--r--arch/x86_64/kernel/verify_cpu.S119
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S20
-rw-r--r--arch/x86_64/kernel/vsyscall.c68
-rw-r--r--arch/x86_64/mm/fault.c5
-rw-r--r--arch/x86_64/mm/init.c172
-rw-r--r--arch/x86_64/mm/k8topology.c9
-rw-r--r--arch/x86_64/mm/numa.c306
-rw-r--r--arch/x86_64/mm/pageattr.c32
-rw-r--r--arch/x86_64/mm/srat.c8
-rw-r--r--arch/xtensa/kernel/vmlinux.lds.S2
167 files changed, 4803 insertions, 8491 deletions
diff --git a/arch/alpha/boot/misc.c b/arch/alpha/boot/misc.c
index 1d65adf5691..c00646b25f6 100644
--- a/arch/alpha/boot/misc.c
+++ b/arch/alpha/boot/misc.c
@@ -98,7 +98,7 @@ extern int end;
98static ulg free_mem_ptr; 98static ulg free_mem_ptr;
99static ulg free_mem_ptr_end; 99static ulg free_mem_ptr_end;
100 100
101#define HEAP_SIZE 0x2000 101#define HEAP_SIZE 0x3000
102 102
103#include "../../../lib/inflate.c" 103#include "../../../lib/inflate.c"
104 104
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 4cc44bd33d3..cf1e6fc6c68 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ SECTIONS
69 . = ALIGN(8); 69 . = ALIGN(8);
70 SECURITY_INIT 70 SECURITY_INIT
71 71
72 . = ALIGN(64); 72 . = ALIGN(8192);
73 __per_cpu_start = .; 73 __per_cpu_start = .;
74 .data.percpu : { *(.data.percpu) } 74 .data.percpu : { *(.data.percpu) }
75 __per_cpu_end = .; 75 __per_cpu_end = .;
diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c
index 283891c736c..9b444022cb9 100644
--- a/arch/arm/boot/compressed/misc.c
+++ b/arch/arm/boot/compressed/misc.c
@@ -239,7 +239,7 @@ extern int end;
239static ulg free_mem_ptr; 239static ulg free_mem_ptr;
240static ulg free_mem_ptr_end; 240static ulg free_mem_ptr_end;
241 241
242#define HEAP_SIZE 0x2000 242#define HEAP_SIZE 0x3000
243 243
244#include "../../../../lib/inflate.c" 244#include "../../../../lib/inflate.c"
245 245
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index ddbdad48f5b..d1a6a597ed9 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -59,7 +59,7 @@ SECTIONS
59 usr/built-in.o(.init.ramfs) 59 usr/built-in.o(.init.ramfs)
60 __initramfs_end = .; 60 __initramfs_end = .;
61#endif 61#endif
62 . = ALIGN(64); 62 . = ALIGN(4096);
63 __per_cpu_start = .; 63 __per_cpu_start = .;
64 *(.data.percpu) 64 *(.data.percpu)
65 __per_cpu_end = .; 65 __per_cpu_end = .;
diff --git a/arch/arm26/boot/compressed/misc.c b/arch/arm26/boot/compressed/misc.c
index f17f50e5516..0714d19c577 100644
--- a/arch/arm26/boot/compressed/misc.c
+++ b/arch/arm26/boot/compressed/misc.c
@@ -182,7 +182,7 @@ extern int end;
182static ulg free_mem_ptr; 182static ulg free_mem_ptr;
183static ulg free_mem_ptr_end; 183static ulg free_mem_ptr_end;
184 184
185#define HEAP_SIZE 0x2000 185#define HEAP_SIZE 0x3000
186 186
187#include "../../../../lib/inflate.c" 187#include "../../../../lib/inflate.c"
188 188
diff --git a/arch/cris/arch-v32/vmlinux.lds.S b/arch/cris/arch-v32/vmlinux.lds.S
index e124fcd766d..dfa25e1542b 100644
--- a/arch/cris/arch-v32/vmlinux.lds.S
+++ b/arch/cris/arch-v32/vmlinux.lds.S
@@ -91,6 +91,7 @@ SECTIONS
91 } 91 }
92 SECURITY_INIT 92 SECURITY_INIT
93 93
94 . = ALIGN (8192);
94 __per_cpu_start = .; 95 __per_cpu_start = .;
95 .data.percpu : { *(.data.percpu) } 96 .data.percpu : { *(.data.percpu) }
96 __per_cpu_end = .; 97 __per_cpu_end = .;
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index 97910e01682..28eae9735ad 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -57,6 +57,7 @@ SECTIONS
57 __alt_instructions_end = .; 57 __alt_instructions_end = .;
58 .altinstr_replacement : { *(.altinstr_replacement) } 58 .altinstr_replacement : { *(.altinstr_replacement) }
59 59
60 . = ALIGN(4096);
60 __per_cpu_start = .; 61 __per_cpu_start = .;
61 .data.percpu : { *(.data.percpu) } 62 .data.percpu : { *(.data.percpu) }
62 __per_cpu_end = .; 63 __per_cpu_end = .;
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index bcf2fc408a1..a9af760c7e5 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -220,7 +220,7 @@ config PARAVIRT
220 220
221config VMI 221config VMI
222 bool "VMI Paravirt-ops support" 222 bool "VMI Paravirt-ops support"
223 depends on PARAVIRT && !COMPAT_VDSO 223 depends on PARAVIRT
224 help 224 help
225 VMI provides a paravirtualized interface to the VMware ESX server 225 VMI provides a paravirtualized interface to the VMware ESX server
226 (it could be used by other hypervisors in theory too, but is not 226 (it could be used by other hypervisors in theory too, but is not
@@ -571,6 +571,9 @@ choice
571 bool "3G/1G user/kernel split (for full 1G low memory)" 571 bool "3G/1G user/kernel split (for full 1G low memory)"
572 config VMSPLIT_2G 572 config VMSPLIT_2G
573 bool "2G/2G user/kernel split" 573 bool "2G/2G user/kernel split"
574 config VMSPLIT_2G_OPT
575 depends on !HIGHMEM
576 bool "2G/2G user/kernel split (for full 2G low memory)"
574 config VMSPLIT_1G 577 config VMSPLIT_1G
575 bool "1G/3G user/kernel split" 578 bool "1G/3G user/kernel split"
576endchoice 579endchoice
@@ -578,7 +581,8 @@ endchoice
578config PAGE_OFFSET 581config PAGE_OFFSET
579 hex 582 hex
580 default 0xB0000000 if VMSPLIT_3G_OPT 583 default 0xB0000000 if VMSPLIT_3G_OPT
581 default 0x78000000 if VMSPLIT_2G 584 default 0x80000000 if VMSPLIT_2G
585 default 0x78000000 if VMSPLIT_2G_OPT
582 default 0x40000000 if VMSPLIT_1G 586 default 0x40000000 if VMSPLIT_1G
583 default 0xC0000000 587 default 0xC0000000
584 588
@@ -915,12 +919,9 @@ source kernel/power/Kconfig
915 919
916source "drivers/acpi/Kconfig" 920source "drivers/acpi/Kconfig"
917 921
918menu "APM (Advanced Power Management) BIOS Support" 922menuconfig APM
919depends on PM && !X86_VISWS
920
921config APM
922 tristate "APM (Advanced Power Management) BIOS support" 923 tristate "APM (Advanced Power Management) BIOS support"
923 depends on PM 924 depends on PM && !X86_VISWS
924 ---help--- 925 ---help---
925 APM is a BIOS specification for saving power using several different 926 APM is a BIOS specification for saving power using several different
926 techniques. This is mostly useful for battery powered laptops with 927 techniques. This is mostly useful for battery powered laptops with
@@ -977,9 +978,10 @@ config APM
977 To compile this driver as a module, choose M here: the 978 To compile this driver as a module, choose M here: the
978 module will be called apm. 979 module will be called apm.
979 980
981if APM
982
980config APM_IGNORE_USER_SUSPEND 983config APM_IGNORE_USER_SUSPEND
981 bool "Ignore USER SUSPEND" 984 bool "Ignore USER SUSPEND"
982 depends on APM
983 help 985 help
984 This option will ignore USER SUSPEND requests. On machines with a 986 This option will ignore USER SUSPEND requests. On machines with a
985 compliant APM BIOS, you want to say N. However, on the NEC Versa M 987 compliant APM BIOS, you want to say N. However, on the NEC Versa M
@@ -987,7 +989,6 @@ config APM_IGNORE_USER_SUSPEND
987 989
988config APM_DO_ENABLE 990config APM_DO_ENABLE
989 bool "Enable PM at boot time" 991 bool "Enable PM at boot time"
990 depends on APM
991 ---help--- 992 ---help---
992 Enable APM features at boot time. From page 36 of the APM BIOS 993 Enable APM features at boot time. From page 36 of the APM BIOS
993 specification: "When disabled, the APM BIOS does not automatically 994 specification: "When disabled, the APM BIOS does not automatically
@@ -1005,7 +1006,6 @@ config APM_DO_ENABLE
1005 1006
1006config APM_CPU_IDLE 1007config APM_CPU_IDLE
1007 bool "Make CPU Idle calls when idle" 1008 bool "Make CPU Idle calls when idle"
1008 depends on APM
1009 help 1009 help
1010 Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. 1010 Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
1011 On some machines, this can activate improved power savings, such as 1011 On some machines, this can activate improved power savings, such as
@@ -1017,7 +1017,6 @@ config APM_CPU_IDLE
1017 1017
1018config APM_DISPLAY_BLANK 1018config APM_DISPLAY_BLANK
1019 bool "Enable console blanking using APM" 1019 bool "Enable console blanking using APM"
1020 depends on APM
1021 help 1020 help
1022 Enable console blanking using the APM. Some laptops can use this to 1021 Enable console blanking using the APM. Some laptops can use this to
1023 turn off the LCD backlight when the screen blanker of the Linux 1022 turn off the LCD backlight when the screen blanker of the Linux
@@ -1029,22 +1028,8 @@ config APM_DISPLAY_BLANK
1029 backlight at all, or it might print a lot of errors to the console, 1028 backlight at all, or it might print a lot of errors to the console,
1030 especially if you are using gpm. 1029 especially if you are using gpm.
1031 1030
1032config APM_RTC_IS_GMT
1033 bool "RTC stores time in GMT"
1034 depends on APM
1035 help
1036 Say Y here if your RTC (Real Time Clock a.k.a. hardware clock)
1037 stores the time in GMT (Greenwich Mean Time). Say N if your RTC
1038 stores localtime.
1039
1040 It is in fact recommended to store GMT in your RTC, because then you
1041 don't have to worry about daylight savings time changes. The only
1042 reason not to use GMT in your RTC is if you also run a broken OS
1043 that doesn't understand GMT.
1044
1045config APM_ALLOW_INTS 1031config APM_ALLOW_INTS
1046 bool "Allow interrupts during APM BIOS calls" 1032 bool "Allow interrupts during APM BIOS calls"
1047 depends on APM
1048 help 1033 help
1049 Normally we disable external interrupts while we are making calls to 1034 Normally we disable external interrupts while we are making calls to
1050 the APM BIOS as a measure to lessen the effects of a badly behaving 1035 the APM BIOS as a measure to lessen the effects of a badly behaving
@@ -1055,13 +1040,12 @@ config APM_ALLOW_INTS
1055 1040
1056config APM_REAL_MODE_POWER_OFF 1041config APM_REAL_MODE_POWER_OFF
1057 bool "Use real mode APM BIOS call to power off" 1042 bool "Use real mode APM BIOS call to power off"
1058 depends on APM
1059 help 1043 help
1060 Use real mode APM BIOS calls to switch off the computer. This is 1044 Use real mode APM BIOS calls to switch off the computer. This is
1061 a work-around for a number of buggy BIOSes. Switch this option on if 1045 a work-around for a number of buggy BIOSes. Switch this option on if
1062 your computer crashes instead of powering off properly. 1046 your computer crashes instead of powering off properly.
1063 1047
1064endmenu 1048endif # APM
1065 1049
1066source "arch/i386/kernel/cpu/cpufreq/Kconfig" 1050source "arch/i386/kernel/cpu/cpufreq/Kconfig"
1067 1051
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index b99c0e2a4e6..dce6124cb84 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -43,6 +43,7 @@ config M386
43 - "Geode GX/LX" For AMD Geode GX and LX processors. 43 - "Geode GX/LX" For AMD Geode GX and LX processors.
44 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. 44 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
45 - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). 45 - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above).
46 - "VIA C7" for VIA C7.
46 47
47 If you don't know what to do, choose "386". 48 If you don't know what to do, choose "386".
48 49
@@ -203,6 +204,12 @@ config MVIAC3_2
203 of SSE and tells gcc to treat the CPU as a 686. 204 of SSE and tells gcc to treat the CPU as a 686.
204 Note, this kernel will not boot on older (pre model 9) C3s. 205 Note, this kernel will not boot on older (pre model 9) C3s.
205 206
207config MVIAC7
208 bool "VIA C7"
209 help
210 Select this for a VIA C7. Selecting this uses the correct cache
211 shift and tells gcc to treat the CPU as a 686.
212
206endchoice 213endchoice
207 214
208config X86_GENERIC 215config X86_GENERIC
@@ -231,16 +238,21 @@ config X86_L1_CACHE_SHIFT
231 default "7" if MPENTIUM4 || X86_GENERIC 238 default "7" if MPENTIUM4 || X86_GENERIC
232 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 239 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
233 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 240 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
234 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 241 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
242
243config X86_XADD
244 bool
245 depends on !M386
246 default y
235 247
236config RWSEM_GENERIC_SPINLOCK 248config RWSEM_GENERIC_SPINLOCK
237 bool 249 bool
238 depends on M386 250 depends on !X86_XADD
239 default y 251 default y
240 252
241config RWSEM_XCHGADD_ALGORITHM 253config RWSEM_XCHGADD_ALGORITHM
242 bool 254 bool
243 depends on !M386 255 depends on X86_XADD
244 default y 256 default y
245 257
246config ARCH_HAS_ILOG2_U32 258config ARCH_HAS_ILOG2_U32
@@ -297,7 +309,7 @@ config X86_ALIGNMENT_16
297 309
298config X86_GOOD_APIC 310config X86_GOOD_APIC
299 bool 311 bool
300 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 312 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7
301 default y 313 default y
302 314
303config X86_INTEL_USERCOPY 315config X86_INTEL_USERCOPY
@@ -322,5 +334,18 @@ config X86_OOSTORE
322 334
323config X86_TSC 335config X86_TSC
324 bool 336 bool
325 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ 337 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ
326 default y 338 default y
339
340# this should be set for all -march=.. options where the compiler
341# generates cmov.
342config X86_CMOV
343 bool
344 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
345 default y
346
347config X86_MINIMUM_CPU_MODEL
348 int
349 default "4" if X86_XADD || X86_CMPXCHG || X86_BSWAP
350 default "0"
351
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index 458bc161193..b31c0802e1c 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -85,14 +85,4 @@ config DOUBLEFAULT
85 option saves about 4k and might cause you much additional grey 85 option saves about 4k and might cause you much additional grey
86 hair. 86 hair.
87 87
88config DEBUG_PARAVIRT
89 bool "Enable some paravirtualization debugging"
90 default n
91 depends on PARAVIRT && DEBUG_KERNEL
92 help
93 Currently deliberately clobbers regs which are allowed to be
94 clobbered in inlined paravirt hooks, even in native mode.
95 If turning this off solves a problem, then DISABLE_INTERRUPTS() or
96 ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
97
98endmenu 88endmenu
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index bd28f9f9b4b..6dc5e5d90fe 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -34,7 +34,7 @@ CHECKFLAGS += -D__i386__
34CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return 34CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return
35 35
36# prevent gcc from keeping the stack 16 byte aligned 36# prevent gcc from keeping the stack 16 byte aligned
37CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) 37CFLAGS += -mpreferred-stack-boundary=4
38 38
39# CPU-specific tuning. Anything which can be shared with UML should go here. 39# CPU-specific tuning. Anything which can be shared with UML should go here.
40include $(srctree)/arch/i386/Makefile.cpu 40include $(srctree)/arch/i386/Makefile.cpu
diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu
index a32c031c90d..e372b584e91 100644
--- a/arch/i386/Makefile.cpu
+++ b/arch/i386/Makefile.cpu
@@ -4,9 +4,9 @@
4#-mtune exists since gcc 3.4 4#-mtune exists since gcc 3.4
5HAS_MTUNE := $(call cc-option-yn, -mtune=i386) 5HAS_MTUNE := $(call cc-option-yn, -mtune=i386)
6ifeq ($(HAS_MTUNE),y) 6ifeq ($(HAS_MTUNE),y)
7tune = $(call cc-option,-mtune=$(1),) 7tune = $(call cc-option,-mtune=$(1),$(2))
8else 8else
9tune = $(call cc-option,-mcpu=$(1),) 9tune = $(call cc-option,-mcpu=$(1),$(2))
10endif 10endif
11 11
12align := $(cc-option-align) 12align := $(cc-option-align)
@@ -32,7 +32,8 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
35cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686)) 35cflags-$(CONFIG_MVIAC7) += -march=i686
36cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
36 37
37# AMD Elan support 38# AMD Elan support
38cflags-$(CONFIG_X86_ELAN) += -march=i486 39cflags-$(CONFIG_X86_ELAN) += -march=i486
@@ -42,5 +43,5 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
42 43
43# add at the end to overwrite eventual tuning options from earlier 44# add at the end to overwrite eventual tuning options from earlier
44# cpu entries 45# cpu entries
45cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic) 46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
46 47
diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile
index e9794662606..bfbc32098a4 100644
--- a/arch/i386/boot/Makefile
+++ b/arch/i386/boot/Makefile
@@ -36,9 +36,9 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE)
36# --------------------------------------------------------------------------- 36# ---------------------------------------------------------------------------
37 37
38$(obj)/zImage: IMAGE_OFFSET := 0x1000 38$(obj)/zImage: IMAGE_OFFSET := 0x1000
39$(obj)/zImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) 39$(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK)
40$(obj)/bzImage: IMAGE_OFFSET := 0x100000 40$(obj)/bzImage: IMAGE_OFFSET := 0x100000
41$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 41$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
42$(obj)/bzImage: BUILDFLAGS := -b 42$(obj)/bzImage: BUILDFLAGS := -b
43 43
44quiet_cmd_image = BUILD $@ 44quiet_cmd_image = BUILD $@
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index 1ce7017fd62..b28505c544c 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -189,7 +189,7 @@ static void putstr(const char *);
189static unsigned long free_mem_ptr; 189static unsigned long free_mem_ptr;
190static unsigned long free_mem_end_ptr; 190static unsigned long free_mem_end_ptr;
191 191
192#define HEAP_SIZE 0x3000 192#define HEAP_SIZE 0x4000
193 193
194static char *vidmem = (char *)0xb8000; 194static char *vidmem = (char *)0xb8000;
195static int vidport; 195static int vidport;
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
index 06edf1c6624..f8b3b9cda2b 100644
--- a/arch/i386/boot/setup.S
+++ b/arch/i386/boot/setup.S
@@ -52,6 +52,7 @@
52#include <asm/boot.h> 52#include <asm/boot.h>
53#include <asm/e820.h> 53#include <asm/e820.h>
54#include <asm/page.h> 54#include <asm/page.h>
55#include <asm/setup.h>
55 56
56/* Signature words to ensure LILO loaded us right */ 57/* Signature words to ensure LILO loaded us right */
57#define SIG1 0xAA55 58#define SIG1 0xAA55
@@ -81,7 +82,7 @@ start:
81# This is the setup header, and it must start at %cs:2 (old 0x9020:2) 82# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
82 83
83 .ascii "HdrS" # header signature 84 .ascii "HdrS" # header signature
84 .word 0x0205 # header version number (>= 0x0105) 85 .word 0x0206 # header version number (>= 0x0105)
85 # or else old loadlin-1.5 will fail) 86 # or else old loadlin-1.5 will fail)
86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 87realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
87start_sys_seg: .word SYSSEG 88start_sys_seg: .word SYSSEG
@@ -171,6 +172,10 @@ relocatable_kernel: .byte 0
171pad2: .byte 0 172pad2: .byte 0
172pad3: .word 0 173pad3: .word 0
173 174
175cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
176 #added with boot protocol
177 #version 2.06
178
174trampoline: call start_of_setup 179trampoline: call start_of_setup
175 .align 16 180 .align 16
176 # The offset at this point is 0x240 181 # The offset at this point is 0x240
@@ -297,7 +302,24 @@ good_sig:
297 302
298loader_panic_mess: .string "Wrong loader, giving up..." 303loader_panic_mess: .string "Wrong loader, giving up..."
299 304
305# check minimum cpuid
306# we do this here because it is the last place we can actually
307# show a user visible error message. Later the video modus
308# might be already messed up.
300loader_ok: 309loader_ok:
310 call verify_cpu
311 testl %eax,%eax
312 jz cpu_ok
313 lea cpu_panic_mess,%si
314 call prtstr
3151: jmp 1b
316
317cpu_panic_mess:
318 .asciz "PANIC: CPU too old for this kernel."
319
320#include "../kernel/verify_cpu.S"
321
322cpu_ok:
301# Get memory size (extended mem, kB) 323# Get memory size (extended mem, kB)
302 324
303 xorl %eax, %eax 325 xorl %eax, %eax
diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index c96911c37ae..9da84412a83 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.21-rc3 3# Linux kernel version: 2.6.21-git3
4# Wed Mar 7 15:29:47 2007 4# Tue May 1 07:30:51 2007
5# 5#
6CONFIG_X86_32=y 6CONFIG_X86_32=y
7CONFIG_GENERIC_TIME=y 7CONFIG_GENERIC_TIME=y
@@ -108,9 +108,9 @@ CONFIG_DEFAULT_IOSCHED="anticipatory"
108# 108#
109# Processor type and features 109# Processor type and features
110# 110#
111# CONFIG_TICK_ONESHOT is not set 111CONFIG_TICK_ONESHOT=y
112# CONFIG_NO_HZ is not set 112CONFIG_NO_HZ=y
113# CONFIG_HIGH_RES_TIMERS is not set 113CONFIG_HIGH_RES_TIMERS=y
114CONFIG_SMP=y 114CONFIG_SMP=y
115# CONFIG_X86_PC is not set 115# CONFIG_X86_PC is not set
116# CONFIG_X86_ELAN is not set 116# CONFIG_X86_ELAN is not set
@@ -146,9 +146,11 @@ CONFIG_MPENTIUMIII=y
146# CONFIG_MGEODE_LX is not set 146# CONFIG_MGEODE_LX is not set
147# CONFIG_MCYRIXIII is not set 147# CONFIG_MCYRIXIII is not set
148# CONFIG_MVIAC3_2 is not set 148# CONFIG_MVIAC3_2 is not set
149# CONFIG_MVIAC7 is not set
149CONFIG_X86_GENERIC=y 150CONFIG_X86_GENERIC=y
150CONFIG_X86_CMPXCHG=y 151CONFIG_X86_CMPXCHG=y
151CONFIG_X86_L1_CACHE_SHIFT=7 152CONFIG_X86_L1_CACHE_SHIFT=7
153CONFIG_X86_XADD=y
152CONFIG_RWSEM_XCHGADD_ALGORITHM=y 154CONFIG_RWSEM_XCHGADD_ALGORITHM=y
153# CONFIG_ARCH_HAS_ILOG2_U32 is not set 155# CONFIG_ARCH_HAS_ILOG2_U32 is not set
154# CONFIG_ARCH_HAS_ILOG2_U64 is not set 156# CONFIG_ARCH_HAS_ILOG2_U64 is not set
@@ -162,6 +164,8 @@ CONFIG_X86_GOOD_APIC=y
162CONFIG_X86_INTEL_USERCOPY=y 164CONFIG_X86_INTEL_USERCOPY=y
163CONFIG_X86_USE_PPRO_CHECKSUM=y 165CONFIG_X86_USE_PPRO_CHECKSUM=y
164CONFIG_X86_TSC=y 166CONFIG_X86_TSC=y
167CONFIG_X86_CMOV=y
168CONFIG_X86_MINIMUM_CPU_MODEL=4
165CONFIG_HPET_TIMER=y 169CONFIG_HPET_TIMER=y
166CONFIG_HPET_EMULATE_RTC=y 170CONFIG_HPET_EMULATE_RTC=y
167CONFIG_NR_CPUS=32 171CONFIG_NR_CPUS=32
@@ -248,7 +252,6 @@ CONFIG_ACPI_FAN=y
248CONFIG_ACPI_PROCESSOR=y 252CONFIG_ACPI_PROCESSOR=y
249CONFIG_ACPI_THERMAL=y 253CONFIG_ACPI_THERMAL=y
250# CONFIG_ACPI_ASUS is not set 254# CONFIG_ACPI_ASUS is not set
251# CONFIG_ACPI_IBM is not set
252# CONFIG_ACPI_TOSHIBA is not set 255# CONFIG_ACPI_TOSHIBA is not set
253CONFIG_ACPI_BLACKLIST_YEAR=2001 256CONFIG_ACPI_BLACKLIST_YEAR=2001
254CONFIG_ACPI_DEBUG=y 257CONFIG_ACPI_DEBUG=y
@@ -257,10 +260,7 @@ CONFIG_ACPI_POWER=y
257CONFIG_ACPI_SYSTEM=y 260CONFIG_ACPI_SYSTEM=y
258CONFIG_X86_PM_TIMER=y 261CONFIG_X86_PM_TIMER=y
259# CONFIG_ACPI_CONTAINER is not set 262# CONFIG_ACPI_CONTAINER is not set
260 263# CONFIG_ACPI_SBS is not set
261#
262# APM (Advanced Power Management) BIOS Support
263#
264# CONFIG_APM is not set 264# CONFIG_APM is not set
265 265
266# 266#
@@ -277,7 +277,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
277# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set 277# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
278CONFIG_CPU_FREQ_GOV_USERSPACE=y 278CONFIG_CPU_FREQ_GOV_USERSPACE=y
279CONFIG_CPU_FREQ_GOV_ONDEMAND=y 279CONFIG_CPU_FREQ_GOV_ONDEMAND=y
280# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set 280CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
281 281
282# 282#
283# CPUFreq processor drivers 283# CPUFreq processor drivers
@@ -349,7 +349,6 @@ CONFIG_NET=y
349# 349#
350# Networking options 350# Networking options
351# 351#
352# CONFIG_NETDEBUG is not set
353CONFIG_PACKET=y 352CONFIG_PACKET=y
354# CONFIG_PACKET_MMAP is not set 353# CONFIG_PACKET_MMAP is not set
355CONFIG_UNIX=y 354CONFIG_UNIX=y
@@ -388,6 +387,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
388CONFIG_IPV6=y 387CONFIG_IPV6=y
389# CONFIG_IPV6_PRIVACY is not set 388# CONFIG_IPV6_PRIVACY is not set
390# CONFIG_IPV6_ROUTER_PREF is not set 389# CONFIG_IPV6_ROUTER_PREF is not set
390# CONFIG_IPV6_OPTIMISTIC_DAD is not set
391# CONFIG_INET6_AH is not set 391# CONFIG_INET6_AH is not set
392# CONFIG_INET6_ESP is not set 392# CONFIG_INET6_ESP is not set
393# CONFIG_INET6_IPCOMP is not set 393# CONFIG_INET6_IPCOMP is not set
@@ -443,6 +443,13 @@ CONFIG_IPV6_SIT=y
443# CONFIG_HAMRADIO is not set 443# CONFIG_HAMRADIO is not set
444# CONFIG_IRDA is not set 444# CONFIG_IRDA is not set
445# CONFIG_BT is not set 445# CONFIG_BT is not set
446# CONFIG_AF_RXRPC is not set
447
448#
449# Wireless
450#
451# CONFIG_CFG80211 is not set
452# CONFIG_WIRELESS_EXT is not set
446# CONFIG_IEEE80211 is not set 453# CONFIG_IEEE80211 is not set
447 454
448# 455#
@@ -463,10 +470,6 @@ CONFIG_FW_LOADER=y
463# Connector - unified userspace <-> kernelspace linker 470# Connector - unified userspace <-> kernelspace linker
464# 471#
465# CONFIG_CONNECTOR is not set 472# CONFIG_CONNECTOR is not set
466
467#
468# Memory Technology Devices (MTD)
469#
470# CONFIG_MTD is not set 473# CONFIG_MTD is not set
471 474
472# 475#
@@ -513,6 +516,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
513# CONFIG_SGI_IOC4 is not set 516# CONFIG_SGI_IOC4 is not set
514# CONFIG_TIFM_CORE is not set 517# CONFIG_TIFM_CORE is not set
515# CONFIG_SONY_LAPTOP is not set 518# CONFIG_SONY_LAPTOP is not set
519# CONFIG_THINKPAD_ACPI is not set
516 520
517# 521#
518# ATA/ATAPI/MFM/RLL support 522# ATA/ATAPI/MFM/RLL support
@@ -548,7 +552,6 @@ CONFIG_BLK_DEV_IDEPCI=y
548# CONFIG_BLK_DEV_RZ1000 is not set 552# CONFIG_BLK_DEV_RZ1000 is not set
549CONFIG_BLK_DEV_IDEDMA_PCI=y 553CONFIG_BLK_DEV_IDEDMA_PCI=y
550# CONFIG_BLK_DEV_IDEDMA_FORCED is not set 554# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
551CONFIG_IDEDMA_PCI_AUTO=y
552# CONFIG_IDEDMA_ONLYDISK is not set 555# CONFIG_IDEDMA_ONLYDISK is not set
553# CONFIG_BLK_DEV_AEC62XX is not set 556# CONFIG_BLK_DEV_AEC62XX is not set
554# CONFIG_BLK_DEV_ALI15X3 is not set 557# CONFIG_BLK_DEV_ALI15X3 is not set
@@ -580,7 +583,6 @@ CONFIG_BLK_DEV_PIIX=y
580# CONFIG_IDE_ARM is not set 583# CONFIG_IDE_ARM is not set
581CONFIG_BLK_DEV_IDEDMA=y 584CONFIG_BLK_DEV_IDEDMA=y
582# CONFIG_IDEDMA_IVB is not set 585# CONFIG_IDEDMA_IVB is not set
583CONFIG_IDEDMA_AUTO=y
584# CONFIG_BLK_DEV_HD is not set 586# CONFIG_BLK_DEV_HD is not set
585 587
586# 588#
@@ -669,6 +671,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0
669# CONFIG_SCSI_DC390T is not set 671# CONFIG_SCSI_DC390T is not set
670# CONFIG_SCSI_NSP32 is not set 672# CONFIG_SCSI_NSP32 is not set
671# CONFIG_SCSI_DEBUG is not set 673# CONFIG_SCSI_DEBUG is not set
674# CONFIG_SCSI_ESP_CORE is not set
672# CONFIG_SCSI_SRP is not set 675# CONFIG_SCSI_SRP is not set
673 676
674# 677#
@@ -697,6 +700,7 @@ CONFIG_SATA_ACPI=y
697# CONFIG_PATA_AMD is not set 700# CONFIG_PATA_AMD is not set
698# CONFIG_PATA_ARTOP is not set 701# CONFIG_PATA_ARTOP is not set
699# CONFIG_PATA_ATIIXP is not set 702# CONFIG_PATA_ATIIXP is not set
703# CONFIG_PATA_CMD640_PCI is not set
700# CONFIG_PATA_CMD64X is not set 704# CONFIG_PATA_CMD64X is not set
701# CONFIG_PATA_CS5520 is not set 705# CONFIG_PATA_CS5520 is not set
702# CONFIG_PATA_CS5530 is not set 706# CONFIG_PATA_CS5530 is not set
@@ -762,10 +766,9 @@ CONFIG_IEEE1394=y
762# Subsystem Options 766# Subsystem Options
763# 767#
764# CONFIG_IEEE1394_VERBOSEDEBUG is not set 768# CONFIG_IEEE1394_VERBOSEDEBUG is not set
765# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
766 769
767# 770#
768# Device Drivers 771# Controllers
769# 772#
770 773
771# 774#
@@ -774,10 +777,11 @@ CONFIG_IEEE1394=y
774CONFIG_IEEE1394_OHCI1394=y 777CONFIG_IEEE1394_OHCI1394=y
775 778
776# 779#
777# Protocol Drivers 780# Protocols
778# 781#
779# CONFIG_IEEE1394_VIDEO1394 is not set 782# CONFIG_IEEE1394_VIDEO1394 is not set
780# CONFIG_IEEE1394_SBP2 is not set 783# CONFIG_IEEE1394_SBP2 is not set
784# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set
781# CONFIG_IEEE1394_ETH1394 is not set 785# CONFIG_IEEE1394_ETH1394 is not set
782# CONFIG_IEEE1394_DV1394 is not set 786# CONFIG_IEEE1394_DV1394 is not set
783CONFIG_IEEE1394_RAWIO=y 787CONFIG_IEEE1394_RAWIO=y
@@ -820,7 +824,9 @@ CONFIG_MII=y
820# CONFIG_HAPPYMEAL is not set 824# CONFIG_HAPPYMEAL is not set
821# CONFIG_SUNGEM is not set 825# CONFIG_SUNGEM is not set
822# CONFIG_CASSINI is not set 826# CONFIG_CASSINI is not set
823# CONFIG_NET_VENDOR_3COM is not set 827CONFIG_NET_VENDOR_3COM=y
828CONFIG_VORTEX=y
829# CONFIG_TYPHOON is not set
824 830
825# 831#
826# Tulip family network device support 832# Tulip family network device support
@@ -901,9 +907,10 @@ CONFIG_BNX2=y
901# CONFIG_TR is not set 907# CONFIG_TR is not set
902 908
903# 909#
904# Wireless LAN (non-hamradio) 910# Wireless LAN
905# 911#
906# CONFIG_NET_RADIO is not set 912# CONFIG_WLAN_PRE80211 is not set
913# CONFIG_WLAN_80211 is not set
907 914
908# 915#
909# Wan interfaces 916# Wan interfaces
@@ -917,7 +924,6 @@ CONFIG_BNX2=y
917# CONFIG_SHAPER is not set 924# CONFIG_SHAPER is not set
918CONFIG_NETCONSOLE=y 925CONFIG_NETCONSOLE=y
919CONFIG_NETPOLL=y 926CONFIG_NETPOLL=y
920# CONFIG_NETPOLL_RX is not set
921# CONFIG_NETPOLL_TRAP is not set 927# CONFIG_NETPOLL_TRAP is not set
922CONFIG_NET_POLL_CONTROLLER=y 928CONFIG_NET_POLL_CONTROLLER=y
923 929
@@ -1050,7 +1056,7 @@ CONFIG_MAX_RAW_DEVS=256
1050CONFIG_HPET=y 1056CONFIG_HPET=y
1051# CONFIG_HPET_RTC_IRQ is not set 1057# CONFIG_HPET_RTC_IRQ is not set
1052CONFIG_HPET_MMAP=y 1058CONFIG_HPET_MMAP=y
1053CONFIG_HANGCHECK_TIMER=y 1059# CONFIG_HANGCHECK_TIMER is not set
1054 1060
1055# 1061#
1056# TPM devices 1062# TPM devices
@@ -1142,6 +1148,14 @@ CONFIG_HID=y
1142# CONFIG_HID_DEBUG is not set 1148# CONFIG_HID_DEBUG is not set
1143 1149
1144# 1150#
1151# USB Input Devices
1152#
1153CONFIG_USB_HID=y
1154# CONFIG_USB_HIDINPUT_POWERBOOK is not set
1155# CONFIG_HID_FF is not set
1156# CONFIG_USB_HIDDEV is not set
1157
1158#
1145# USB support 1159# USB support
1146# 1160#
1147CONFIG_USB_ARCH_HAS_HCD=y 1161CONFIG_USB_ARCH_HAS_HCD=y
@@ -1154,6 +1168,7 @@ CONFIG_USB=y
1154# Miscellaneous USB options 1168# Miscellaneous USB options
1155# 1169#
1156CONFIG_USB_DEVICEFS=y 1170CONFIG_USB_DEVICEFS=y
1171# CONFIG_USB_DEVICE_CLASS is not set
1157# CONFIG_USB_DYNAMIC_MINORS is not set 1172# CONFIG_USB_DYNAMIC_MINORS is not set
1158# CONFIG_USB_SUSPEND is not set 1173# CONFIG_USB_SUSPEND is not set
1159# CONFIG_USB_OTG is not set 1174# CONFIG_USB_OTG is not set
@@ -1204,10 +1219,6 @@ CONFIG_USB_STORAGE=y
1204# 1219#
1205# USB Input Devices 1220# USB Input Devices
1206# 1221#
1207CONFIG_USB_HID=y
1208# CONFIG_USB_HIDINPUT_POWERBOOK is not set
1209# CONFIG_HID_FF is not set
1210# CONFIG_USB_HIDDEV is not set
1211# CONFIG_USB_AIPTEK is not set 1222# CONFIG_USB_AIPTEK is not set
1212# CONFIG_USB_WACOM is not set 1223# CONFIG_USB_WACOM is not set
1213# CONFIG_USB_ACECAD is not set 1224# CONFIG_USB_ACECAD is not set
@@ -1528,7 +1539,7 @@ CONFIG_DEBUG_KERNEL=y
1528CONFIG_LOG_BUF_SHIFT=18 1539CONFIG_LOG_BUF_SHIFT=18
1529CONFIG_DETECT_SOFTLOCKUP=y 1540CONFIG_DETECT_SOFTLOCKUP=y
1530# CONFIG_SCHEDSTATS is not set 1541# CONFIG_SCHEDSTATS is not set
1531# CONFIG_TIMER_STATS is not set 1542CONFIG_TIMER_STATS=y
1532# CONFIG_DEBUG_SLAB is not set 1543# CONFIG_DEBUG_SLAB is not set
1533# CONFIG_DEBUG_RT_MUTEXES is not set 1544# CONFIG_DEBUG_RT_MUTEXES is not set
1534# CONFIG_RT_MUTEX_TESTER is not set 1545# CONFIG_RT_MUTEX_TESTER is not set
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 4ae3dcf1d2f..4f98516b9f9 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,12 +39,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
39obj-$(CONFIG_HPET_TIMER) += hpet.o 39obj-$(CONFIG_HPET_TIMER) += hpet.o
40obj-$(CONFIG_K8_NB) += k8.o 40obj-$(CONFIG_K8_NB) += k8.o
41 41
42obj-$(CONFIG_VMI) += vmi.o vmitime.o 42obj-$(CONFIG_VMI) += vmi.o vmiclock.o
43obj-$(CONFIG_PARAVIRT) += paravirt.o 43obj-$(CONFIG_PARAVIRT) += paravirt.o
44obj-y += pcspeaker.o 44obj-y += pcspeaker.o
45 45
46EXTRA_AFLAGS := -traditional
47
48obj-$(CONFIG_SCx200) += scx200.o 46obj-$(CONFIG_SCx200) += scx200.o
49 47
50# vsyscall.o contains the vsyscall DSO images as __initdata. 48# vsyscall.o contains the vsyscall DSO images as __initdata.
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 9ea5b8ecc7e..280898b045b 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -874,7 +874,7 @@ static void __init acpi_process_madt(void)
874 acpi_ioapic = 1; 874 acpi_ioapic = 1;
875 875
876 smp_found_config = 1; 876 smp_found_config = 1;
877 clustered_apic_check(); 877 setup_apic_routing();
878 } 878 }
879 } 879 }
880 if (error == -EINVAL) { 880 if (error == -EINVAL) {
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 8f7efd38254..23f78efc577 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,7 +10,6 @@
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/acpi.h> 11#include <asm/acpi.h>
12#include <asm/apic.h> 12#include <asm/apic.h>
13#include <asm/irq.h>
14 13
15#ifdef CONFIG_ACPI 14#ifdef CONFIG_ACPI
16 15
@@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device)
48 return 0; 47 return 0;
49} 48}
50 49
51static void check_intel(void)
52{
53 u16 vendor, device;
54
55 vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
56
57 if (vendor != PCI_VENDOR_ID_INTEL)
58 return;
59
60 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
61#ifdef CONFIG_SMP
62 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
63 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
64 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
65 quirk_intel_irqbalance();
66#endif
67}
68
69void __init check_acpi_pci(void) 50void __init check_acpi_pci(void)
70{ 51{
71 int num, slot, func; 52 int num, slot, func;
@@ -77,8 +58,6 @@ void __init check_acpi_pci(void)
77 if (!early_pci_allowed()) 58 if (!early_pci_allowed())
78 return; 59 return;
79 60
80 check_intel();
81
82 /* Poor man's PCI discovery */ 61 /* Poor man's PCI discovery */
83 for (num = 0; num < 32; num++) { 62 for (num = 0; num < 32; num++) {
84 for (slot = 0; slot < 32; slot++) { 63 for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 426f59b0106..e5cec6685cc 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -5,6 +5,7 @@
5#include <asm/alternative.h> 5#include <asm/alternative.h>
6#include <asm/sections.h> 6#include <asm/sections.h>
7 7
8static int noreplace_smp = 0;
8static int smp_alt_once = 0; 9static int smp_alt_once = 0;
9static int debug_alternative = 0; 10static int debug_alternative = 0;
10 11
@@ -13,15 +14,33 @@ static int __init bootonly(char *str)
13 smp_alt_once = 1; 14 smp_alt_once = 1;
14 return 1; 15 return 1;
15} 16}
17__setup("smp-alt-boot", bootonly);
18
16static int __init debug_alt(char *str) 19static int __init debug_alt(char *str)
17{ 20{
18 debug_alternative = 1; 21 debug_alternative = 1;
19 return 1; 22 return 1;
20} 23}
21
22__setup("smp-alt-boot", bootonly);
23__setup("debug-alternative", debug_alt); 24__setup("debug-alternative", debug_alt);
24 25
26static int __init setup_noreplace_smp(char *str)
27{
28 noreplace_smp = 1;
29 return 1;
30}
31__setup("noreplace-smp", setup_noreplace_smp);
32
33#ifdef CONFIG_PARAVIRT
34static int noreplace_paravirt = 0;
35
36static int __init setup_noreplace_paravirt(char *str)
37{
38 noreplace_paravirt = 1;
39 return 1;
40}
41__setup("noreplace-paravirt", setup_noreplace_paravirt);
42#endif
43
25#define DPRINTK(fmt, args...) if (debug_alternative) \ 44#define DPRINTK(fmt, args...) if (debug_alternative) \
26 printk(KERN_DEBUG fmt, args) 45 printk(KERN_DEBUG fmt, args)
27 46
@@ -132,11 +151,8 @@ static void nop_out(void *insns, unsigned int len)
132} 151}
133 152
134extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 153extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
135extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
136extern u8 *__smp_locks[], *__smp_locks_end[]; 154extern u8 *__smp_locks[], *__smp_locks_end[];
137 155
138extern u8 __smp_alt_begin[], __smp_alt_end[];
139
140/* Replace instructions with better alternatives for this CPU type. 156/* Replace instructions with better alternatives for this CPU type.
141 This runs before SMP is initialized to avoid SMP problems with 157 This runs before SMP is initialized to avoid SMP problems with
142 self modifying code. This implies that assymetric systems where 158 self modifying code. This implies that assymetric systems where
@@ -171,29 +187,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
171 187
172#ifdef CONFIG_SMP 188#ifdef CONFIG_SMP
173 189
174static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end)
175{
176 struct alt_instr *a;
177
178 DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end);
179 for (a = start; a < end; a++) {
180 memcpy(a->replacement + a->replacementlen,
181 a->instr,
182 a->instrlen);
183 }
184}
185
186static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end)
187{
188 struct alt_instr *a;
189
190 for (a = start; a < end; a++) {
191 memcpy(a->instr,
192 a->replacement + a->replacementlen,
193 a->instrlen);
194 }
195}
196
197static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) 190static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
198{ 191{
199 u8 **ptr; 192 u8 **ptr;
@@ -211,6 +204,9 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
211{ 204{
212 u8 **ptr; 205 u8 **ptr;
213 206
207 if (noreplace_smp)
208 return;
209
214 for (ptr = start; ptr < end; ptr++) { 210 for (ptr = start; ptr < end; ptr++) {
215 if (*ptr < text) 211 if (*ptr < text)
216 continue; 212 continue;
@@ -245,6 +241,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,
245 struct smp_alt_module *smp; 241 struct smp_alt_module *smp;
246 unsigned long flags; 242 unsigned long flags;
247 243
244 if (noreplace_smp)
245 return;
246
248 if (smp_alt_once) { 247 if (smp_alt_once) {
249 if (boot_cpu_has(X86_FEATURE_UP)) 248 if (boot_cpu_has(X86_FEATURE_UP))
250 alternatives_smp_unlock(locks, locks_end, 249 alternatives_smp_unlock(locks, locks_end,
@@ -279,7 +278,7 @@ void alternatives_smp_module_del(struct module *mod)
279 struct smp_alt_module *item; 278 struct smp_alt_module *item;
280 unsigned long flags; 279 unsigned long flags;
281 280
282 if (smp_alt_once) 281 if (smp_alt_once || noreplace_smp)
283 return; 282 return;
284 283
285 spin_lock_irqsave(&smp_alt, flags); 284 spin_lock_irqsave(&smp_alt, flags);
@@ -310,7 +309,7 @@ void alternatives_smp_switch(int smp)
310 return; 309 return;
311#endif 310#endif
312 311
313 if (smp_alt_once) 312 if (noreplace_smp || smp_alt_once)
314 return; 313 return;
315 BUG_ON(!smp && (num_online_cpus() > 1)); 314 BUG_ON(!smp && (num_online_cpus() > 1));
316 315
@@ -319,8 +318,6 @@ void alternatives_smp_switch(int smp)
319 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 318 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
320 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 319 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
321 clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); 320 clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
322 alternatives_smp_apply(__smp_alt_instructions,
323 __smp_alt_instructions_end);
324 list_for_each_entry(mod, &smp_alt_modules, next) 321 list_for_each_entry(mod, &smp_alt_modules, next)
325 alternatives_smp_lock(mod->locks, mod->locks_end, 322 alternatives_smp_lock(mod->locks, mod->locks_end,
326 mod->text, mod->text_end); 323 mod->text, mod->text_end);
@@ -328,8 +325,6 @@ void alternatives_smp_switch(int smp)
328 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 325 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
329 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 326 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
330 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); 327 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
331 apply_alternatives(__smp_alt_instructions,
332 __smp_alt_instructions_end);
333 list_for_each_entry(mod, &smp_alt_modules, next) 328 list_for_each_entry(mod, &smp_alt_modules, next)
334 alternatives_smp_unlock(mod->locks, mod->locks_end, 329 alternatives_smp_unlock(mod->locks, mod->locks_end,
335 mod->text, mod->text_end); 330 mod->text, mod->text_end);
@@ -340,36 +335,31 @@ void alternatives_smp_switch(int smp)
340#endif 335#endif
341 336
342#ifdef CONFIG_PARAVIRT 337#ifdef CONFIG_PARAVIRT
343void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) 338void apply_paravirt(struct paravirt_patch_site *start,
339 struct paravirt_patch_site *end)
344{ 340{
345 struct paravirt_patch *p; 341 struct paravirt_patch_site *p;
342
343 if (noreplace_paravirt)
344 return;
346 345
347 for (p = start; p < end; p++) { 346 for (p = start; p < end; p++) {
348 unsigned int used; 347 unsigned int used;
349 348
350 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, 349 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
351 p->len); 350 p->len);
352#ifdef CONFIG_DEBUG_PARAVIRT 351
353 { 352 BUG_ON(used > p->len);
354 int i; 353
355 /* Deliberately clobber regs using "not %reg" to find bugs. */
356 for (i = 0; i < 3; i++) {
357 if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
358 memcpy(p->instr + used, "\xf7\xd0", 2);
359 p->instr[used+1] |= i;
360 used += 2;
361 }
362 }
363 }
364#endif
365 /* Pad the rest with nops */ 354 /* Pad the rest with nops */
366 nop_out(p->instr + used, p->len - used); 355 nop_out(p->instr + used, p->len - used);
367 } 356 }
368 357
369 /* Sync to be conservative, in case we patched following instructions */ 358 /* Sync to be conservative, in case we patched following
359 * instructions */
370 sync_core(); 360 sync_core();
371} 361}
372extern struct paravirt_patch __start_parainstructions[], 362extern struct paravirt_patch_site __start_parainstructions[],
373 __stop_parainstructions[]; 363 __stop_parainstructions[];
374#endif /* CONFIG_PARAVIRT */ 364#endif /* CONFIG_PARAVIRT */
375 365
@@ -396,23 +386,19 @@ void __init alternative_instructions(void)
396 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 386 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
397 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 387 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
398 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); 388 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
399 apply_alternatives(__smp_alt_instructions,
400 __smp_alt_instructions_end);
401 alternatives_smp_unlock(__smp_locks, __smp_locks_end, 389 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
402 _text, _etext); 390 _text, _etext);
403 } 391 }
404 free_init_pages("SMP alternatives", 392 free_init_pages("SMP alternatives",
405 (unsigned long)__smp_alt_begin, 393 __pa_symbol(&__smp_locks),
406 (unsigned long)__smp_alt_end); 394 __pa_symbol(&__smp_locks_end));
407 } else { 395 } else {
408 alternatives_smp_save(__smp_alt_instructions,
409 __smp_alt_instructions_end);
410 alternatives_smp_module_add(NULL, "core kernel", 396 alternatives_smp_module_add(NULL, "core kernel",
411 __smp_locks, __smp_locks_end, 397 __smp_locks, __smp_locks_end,
412 _text, _etext); 398 _text, _etext);
413 alternatives_smp_switch(0); 399 alternatives_smp_switch(0);
414 } 400 }
415#endif 401#endif
416 apply_paravirt(__start_parainstructions, __stop_parainstructions); 402 apply_paravirt(__parainstructions, __parainstructions_end);
417 local_irq_restore(flags); 403 local_irq_restore(flags);
418} 404}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 93aa911646a..aca054cc055 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -129,6 +129,28 @@ static int modern_apic(void)
129 return lapic_get_version() >= 0x14; 129 return lapic_get_version() >= 0x14;
130} 130}
131 131
132void apic_wait_icr_idle(void)
133{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
135 cpu_relax();
136}
137
138unsigned long safe_apic_wait_icr_idle(void)
139{
140 unsigned long send_status;
141 int timeout;
142
143 timeout = 0;
144 do {
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150
151 return send_status;
152}
153
132/** 154/**
133 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
134 */ 156 */
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 064bbf2861f..367ff1d930c 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -233,11 +233,10 @@
233#include <asm/desc.h> 233#include <asm/desc.h>
234#include <asm/i8253.h> 234#include <asm/i8253.h>
235#include <asm/paravirt.h> 235#include <asm/paravirt.h>
236#include <asm/reboot.h>
236 237
237#include "io_ports.h" 238#include "io_ports.h"
238 239
239extern void machine_real_restart(unsigned char *, int);
240
241#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) 240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
242extern int (*console_blank_hook)(int); 241extern int (*console_blank_hook)(int);
243#endif 242#endif
@@ -384,13 +383,6 @@ static int ignore_sys_suspend;
384static int ignore_normal_resume; 383static int ignore_normal_resume;
385static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
386 385
387#ifdef CONFIG_APM_RTC_IS_GMT
388# define clock_cmos_diff 0
389# define got_clock_diff 1
390#else
391static long clock_cmos_diff;
392static int got_clock_diff;
393#endif
394static int debug __read_mostly; 386static int debug __read_mostly;
395static int smp __read_mostly; 387static int smp __read_mostly;
396static int apm_disabled = -1; 388static int apm_disabled = -1;
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c37535163bf..27a776c9044 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -11,11 +11,11 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <asm/ucontext.h> 12#include <asm/ucontext.h>
13#include "sigframe.h" 13#include "sigframe.h"
14#include <asm/pgtable.h>
14#include <asm/fixmap.h> 15#include <asm/fixmap.h>
15#include <asm/processor.h> 16#include <asm/processor.h>
16#include <asm/thread_info.h> 17#include <asm/thread_info.h>
17#include <asm/elf.h> 18#include <asm/elf.h>
18#include <asm/pda.h>
19 19
20#define DEFINE(sym, val) \ 20#define DEFINE(sym, val) \
21 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 21 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -25,6 +25,9 @@
25#define OFFSET(sym, str, mem) \ 25#define OFFSET(sym, str, mem) \
26 DEFINE(sym, offsetof(struct str, mem)); 26 DEFINE(sym, offsetof(struct str, mem));
27 27
28/* workaround for a warning with -Wmissing-prototypes */
29void foo(void);
30
28void foo(void) 31void foo(void)
29{ 32{
30 OFFSET(SIGCONTEXT_eax, sigcontext, eax); 33 OFFSET(SIGCONTEXT_eax, sigcontext, eax);
@@ -90,17 +93,18 @@ void foo(void)
90 OFFSET(pbe_next, pbe, next); 93 OFFSET(pbe_next, pbe, next);
91 94
92 /* Offset from the sysenter stack to tss.esp0 */ 95 /* Offset from the sysenter stack to tss.esp0 */
93 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - 96 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
94 sizeof(struct tss_struct)); 97 sizeof(struct tss_struct));
95 98
96 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 99 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
97 DEFINE(VDSO_PRELINK, VDSO_PRELINK); 100 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
101 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
102 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
103 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
98 104
99 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 105 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
100 106
101 BLANK(); 107 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
102 OFFSET(PDA_cpu, i386_pda, cpu_number);
103 OFFSET(PDA_pcurrent, i386_pda, pcurrent);
104 108
105#ifdef CONFIG_PARAVIRT 109#ifdef CONFIG_PARAVIRT
106 BLANK(); 110 BLANK();
diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile
index 010aecfffbc..74f27a463db 100644
--- a/arch/i386/kernel/cpu/Makefile
+++ b/arch/i386/kernel/cpu/Makefile
@@ -2,7 +2,7 @@
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details and quirks
3# 3#
4 4
5obj-y := common.o proc.o 5obj-y := common.o proc.o bugs.o
6 6
7obj-y += amd.o 7obj-y += amd.o
8obj-y += cyrix.o 8obj-y += cyrix.o
@@ -17,3 +17,5 @@ obj-$(CONFIG_X86_MCE) += mcheck/
17 17
18obj-$(CONFIG_MTRR) += mtrr/ 18obj-$(CONFIG_MTRR) += mtrr/
19obj-$(CONFIG_CPU_FREQ) += cpufreq/ 19obj-$(CONFIG_CPU_FREQ) += cpufreq/
20
21obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 2d47db48297..4fec702afd7 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void)
53 return 0; 53 return 0;
54} 54}
55 55
56int force_mwait __cpuinitdata;
57
56static void __cpuinit init_amd(struct cpuinfo_x86 *c) 58static void __cpuinit init_amd(struct cpuinfo_x86 *c)
57{ 59{
58 u32 l, h; 60 u32 l, h;
@@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
275 277
276 if (amd_apic_timer_broken()) 278 if (amd_apic_timer_broken())
277 set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); 279 set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability);
280
281 if (c->x86 == 0x10 && !force_mwait)
282 clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
278} 283}
279 284
280static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) 285static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -314,13 +319,3 @@ int __init amd_init_cpu(void)
314 cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; 319 cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
315 return 0; 320 return 0;
316} 321}
317
318//early_arch_initcall(amd_init_cpu);
319
320static int __init amd_exit_cpu(void)
321{
322 cpu_devs[X86_VENDOR_AMD] = NULL;
323 return 0;
324}
325
326late_initcall(amd_exit_cpu);
diff --git a/arch/i386/kernel/cpu/bugs.c b/arch/i386/kernel/cpu/bugs.c
new file mode 100644
index 00000000000..54428a2500f
--- /dev/null
+++ b/arch/i386/kernel/cpu/bugs.c
@@ -0,0 +1,191 @@
1/*
2 * arch/i386/cpu/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Cyrix stuff, June 1998 by:
7 * - Rafael R. Reilova (moved everything from head.S),
8 * <rreilova@ececs.uc.edu>
9 * - Channing Corn (tests & fixes),
10 * - Andrew D. Balsa (code cleanup).
11 */
12#include <linux/init.h>
13#include <linux/utsname.h>
14#include <asm/processor.h>
15#include <asm/i387.h>
16#include <asm/msr.h>
17#include <asm/paravirt.h>
18#include <asm/alternative.h>
19
20static int __init no_halt(char *s)
21{
22 boot_cpu_data.hlt_works_ok = 0;
23 return 1;
24}
25
26__setup("no-hlt", no_halt);
27
28static int __init mca_pentium(char *s)
29{
30 mca_pentium_flag = 1;
31 return 1;
32}
33
34__setup("mca-pentium", mca_pentium);
35
36static int __init no_387(char *s)
37{
38 boot_cpu_data.hard_math = 0;
39 write_cr0(0xE | read_cr0());
40 return 1;
41}
42
43__setup("no387", no_387);
44
45static double __initdata x = 4195835.0;
46static double __initdata y = 3145727.0;
47
48/*
49 * This used to check for exceptions..
50 * However, it turns out that to support that,
51 * the XMM trap handlers basically had to
52 * be buggy. So let's have a correct XMM trap
53 * handler, and forget about printing out
54 * some status at boot.
55 *
56 * We should really only care about bugs here
57 * anyway. Not features.
58 */
59static void __init check_fpu(void)
60{
61 if (!boot_cpu_data.hard_math) {
62#ifndef CONFIG_MATH_EMULATION
63 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
64 printk(KERN_EMERG "Giving up.\n");
65 for (;;) ;
66#endif
67 return;
68 }
69
70/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
71 /* Test for the divl bug.. */
72 __asm__("fninit\n\t"
73 "fldl %1\n\t"
74 "fdivl %2\n\t"
75 "fmull %2\n\t"
76 "fldl %1\n\t"
77 "fsubp %%st,%%st(1)\n\t"
78 "fistpl %0\n\t"
79 "fwait\n\t"
80 "fninit"
81 : "=m" (*&boot_cpu_data.fdiv_bug)
82 : "m" (*&x), "m" (*&y));
83 if (boot_cpu_data.fdiv_bug)
84 printk("Hmm, FPU with FDIV bug.\n");
85}
86
87static void __init check_hlt(void)
88{
89 if (paravirt_enabled())
90 return;
91
92 printk(KERN_INFO "Checking 'hlt' instruction... ");
93 if (!boot_cpu_data.hlt_works_ok) {
94 printk("disabled\n");
95 return;
96 }
97 halt();
98 halt();
99 halt();
100 halt();
101 printk("OK.\n");
102}
103
104/*
105 * Most 386 processors have a bug where a POPAD can lock the
106 * machine even from user space.
107 */
108
109static void __init check_popad(void)
110{
111#ifndef CONFIG_X86_POPAD_OK
112 int res, inp = (int) &res;
113
114 printk(KERN_INFO "Checking for popad bug... ");
115 __asm__ __volatile__(
116 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
117 : "=&a" (res)
118 : "d" (inp)
119 : "ecx", "edi" );
120 /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
121 if (res != 12345678) printk( "Buggy.\n" );
122 else printk( "OK.\n" );
123#endif
124}
125
126/*
127 * Check whether we are able to run this kernel safely on SMP.
128 *
129 * - In order to run on a i386, we need to be compiled for i386
130 * (for due to lack of "invlpg" and working WP on a i386)
131 * - In order to run on anything without a TSC, we need to be
132 * compiled for a i486.
133 * - In order to support the local APIC on a buggy Pentium machine,
134 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
135 * which happens implicitly if compiled for a Pentium or lower
136 * (unless an advanced selection of CPU features is used) as an
137 * otherwise config implies a properly working local APIC without
138 * the need to do extra reads from the APIC.
139*/
140
141static void __init check_config(void)
142{
143/*
144 * We'd better not be a i386 if we're configured to use some
145 * i486+ only features! (WP works in supervisor mode and the
146 * new "invlpg" and "bswap" instructions)
147 */
148#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
149 if (boot_cpu_data.x86 == 3)
150 panic("Kernel requires i486+ for 'invlpg' and other features");
151#endif
152
153/*
154 * If we configured ourselves for a TSC, we'd better have one!
155 */
156#ifdef CONFIG_X86_TSC
157 if (!cpu_has_tsc && !tsc_disable)
158 panic("Kernel compiled for Pentium+, requires TSC feature!");
159#endif
160
161/*
162 * If we were told we had a good local APIC, check for buggy Pentia,
163 * i.e. all B steppings and the C2 stepping of P54C when using their
164 * integrated APIC (see 11AP erratum in "Pentium Processor
165 * Specification Update").
166 */
167#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
168 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
169 && cpu_has_apic
170 && boot_cpu_data.x86 == 5
171 && boot_cpu_data.x86_model == 2
172 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
173 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
174#endif
175}
176
177
178void __init check_bugs(void)
179{
180 identify_boot_cpu();
181#ifndef CONFIG_SMP
182 printk("CPU: ");
183 print_cpu_info(&boot_cpu_data);
184#endif
185 check_config();
186 check_fpu();
187 check_hlt();
188 check_popad();
189 init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
190 alternative_instructions();
191}
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
index 8c25047975c..473eac883c7 100644
--- a/arch/i386/kernel/cpu/centaur.c
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -469,13 +469,3 @@ int __init centaur_init_cpu(void)
469 cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev; 469 cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
470 return 0; 470 return 0;
471} 471}
472
473//early_arch_initcall(centaur_init_cpu);
474
475static int __init centaur_exit_cpu(void)
476{
477 cpu_devs[X86_VENDOR_CENTAUR] = NULL;
478 return 0;
479}
480
481late_initcall(centaur_exit_cpu);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index dcbbd0a8bfc..794d593c47e 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,15 +18,37 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <mach_apic.h> 19#include <mach_apic.h>
20#endif 20#endif
21#include <asm/pda.h>
22 21
23#include "cpu.h" 22#include "cpu.h"
24 23
25DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); 24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
26EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); 25 [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
26 [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
28 [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
29 /*
30 * Segments used for calling PnP BIOS have byte granularity.
31 * They code segments and data segments have fixed 64k limits,
32 * the transfer segment sizes are set at run time.
33 */
34 [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
35 [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
36 [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
37 [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
38 [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
39 /*
40 * The APM segments have byte granularity and their bases
41 * are set at run time. All have 64k limits.
42 */
43 [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
44 /* 16-bit code */
45 [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
46 [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
27 47
28struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; 48 [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
29EXPORT_SYMBOL(_cpu_pda); 49 [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
50} };
51EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
30 52
31static int cachesize_override __cpuinitdata = -1; 53static int cachesize_override __cpuinitdata = -1;
32static int disable_x86_fxsr __cpuinitdata; 54static int disable_x86_fxsr __cpuinitdata;
@@ -368,7 +390,7 @@ __setup("serialnumber", x86_serial_nr_setup);
368/* 390/*
369 * This does the hard work of actually picking apart the CPU stuff... 391 * This does the hard work of actually picking apart the CPU stuff...
370 */ 392 */
371void __cpuinit identify_cpu(struct cpuinfo_x86 *c) 393static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
372{ 394{
373 int i; 395 int i;
374 396
@@ -479,15 +501,22 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
479 501
480 /* Init Machine Check Exception if available. */ 502 /* Init Machine Check Exception if available. */
481 mcheck_init(c); 503 mcheck_init(c);
504}
482 505
483 if (c == &boot_cpu_data) 506void __init identify_boot_cpu(void)
484 sysenter_setup(); 507{
508 identify_cpu(&boot_cpu_data);
509 sysenter_setup();
485 enable_sep_cpu(); 510 enable_sep_cpu();
511 mtrr_bp_init();
512}
486 513
487 if (c == &boot_cpu_data) 514void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
488 mtrr_bp_init(); 515{
489 else 516 BUG_ON(c == &boot_cpu_data);
490 mtrr_ap_init(); 517 identify_cpu(c);
518 enable_sep_cpu();
519 mtrr_ap_init();
491} 520}
492 521
493#ifdef CONFIG_X86_HT 522#ifdef CONFIG_X86_HT
@@ -601,129 +630,36 @@ void __init early_cpu_init(void)
601#endif 630#endif
602} 631}
603 632
604/* Make sure %gs is initialized properly in idle threads */ 633/* Make sure %fs is initialized properly in idle threads */
605struct pt_regs * __devinit idle_regs(struct pt_regs *regs) 634struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
606{ 635{
607 memset(regs, 0, sizeof(struct pt_regs)); 636 memset(regs, 0, sizeof(struct pt_regs));
608 regs->xfs = __KERNEL_PDA; 637 regs->xfs = __KERNEL_PERCPU;
609 return regs; 638 return regs;
610} 639}
611 640
612static __cpuinit int alloc_gdt(int cpu) 641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
613{ 644{
614 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); 645 struct Xgt_desc_struct gdt_descr;
615 struct desc_struct *gdt;
616 struct i386_pda *pda;
617
618 gdt = (struct desc_struct *)cpu_gdt_descr->address;
619 pda = cpu_pda(cpu);
620
621 /*
622 * This is a horrible hack to allocate the GDT. The problem
623 * is that cpu_init() is called really early for the boot CPU
624 * (and hence needs bootmem) but much later for the secondary
625 * CPUs, when bootmem will have gone away
626 */
627 if (NODE_DATA(0)->bdata->node_bootmem_map) {
628 BUG_ON(gdt != NULL || pda != NULL);
629
630 gdt = alloc_bootmem_pages(PAGE_SIZE);
631 pda = alloc_bootmem(sizeof(*pda));
632 /* alloc_bootmem(_pages) panics on failure, so no check */
633
634 memset(gdt, 0, PAGE_SIZE);
635 memset(pda, 0, sizeof(*pda));
636 } else {
637 /* GDT and PDA might already have been allocated if
638 this is a CPU hotplug re-insertion. */
639 if (gdt == NULL)
640 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
641
642 if (pda == NULL)
643 pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
644
645 if (unlikely(!gdt || !pda)) {
646 free_pages((unsigned long)gdt, 0);
647 kfree(pda);
648 return 0;
649 }
650 }
651
652 cpu_gdt_descr->address = (unsigned long)gdt;
653 cpu_pda(cpu) = pda;
654
655 return 1;
656}
657 646
658/* Initial PDA used by boot CPU */ 647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
659struct i386_pda boot_pda = { 648 gdt_descr.size = GDT_SIZE - 1;
660 ._pda = &boot_pda, 649 load_gdt(&gdt_descr);
661 .cpu_number = 0, 650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
662 .pcurrent = &init_task,
663};
664
665static inline void set_kernel_fs(void)
666{
667 /* Set %fs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */
670 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
671} 651}
672 652
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for 653/*
674 itself, but secondaries find this done for them. */ 654 * cpu_init() initializes state that is per-CPU. Some data is already
675__cpuinit int init_gdt(int cpu, struct task_struct *idle) 655 * initialized (naturally) in the bootstrap process, such as the GDT
676{ 656 * and IDT. We reload them nevertheless, this function acts as a
677 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); 657 * 'CPU state barrier', nothing should get across.
678 struct desc_struct *gdt; 658 */
679 struct i386_pda *pda; 659void __cpuinit cpu_init(void)
680
681 /* For non-boot CPUs, the GDT and PDA should already have been
682 allocated. */
683 if (!alloc_gdt(cpu)) {
684 printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
685 return 0;
686 }
687
688 gdt = (struct desc_struct *)cpu_gdt_descr->address;
689 pda = cpu_pda(cpu);
690
691 BUG_ON(gdt == NULL || pda == NULL);
692
693 /*
694 * Initialize the per-CPU GDT with the boot GDT,
695 * and set up the GDT descriptor:
696 */
697 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
698 cpu_gdt_descr->size = GDT_SIZE - 1;
699
700 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
701 (u32 *)&gdt[GDT_ENTRY_PDA].b,
702 (unsigned long)pda, sizeof(*pda) - 1,
703 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
704
705 memset(pda, 0, sizeof(*pda));
706 pda->_pda = pda;
707 pda->cpu_number = cpu;
708 pda->pcurrent = idle;
709
710 return 1;
711}
712
713void __cpuinit cpu_set_gdt(int cpu)
714{
715 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
716
717 /* Reinit these anyway, even if they've already been done (on
718 the boot CPU, this will transition from the boot gdt+pda to
719 the real ones). */
720 load_gdt(cpu_gdt_descr);
721 set_kernel_fs();
722}
723
724/* Common CPU init for both boot and secondary CPUs */
725static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
726{ 660{
661 int cpu = smp_processor_id();
662 struct task_struct *curr = current;
727 struct tss_struct * t = &per_cpu(init_tss, cpu); 663 struct tss_struct * t = &per_cpu(init_tss, cpu);
728 struct thread_struct *thread = &curr->thread; 664 struct thread_struct *thread = &curr->thread;
729 665
@@ -744,6 +680,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
744 } 680 }
745 681
746 load_idt(&idt_descr); 682 load_idt(&idt_descr);
683 switch_to_new_gdt();
747 684
748 /* 685 /*
749 * Set up and load the per-CPU TSS and LDT 686 * Set up and load the per-CPU TSS and LDT
@@ -783,38 +720,6 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
783 mxcsr_feature_mask_init(); 720 mxcsr_feature_mask_init();
784} 721}
785 722
786/* Entrypoint to initialize secondary CPU */
787void __cpuinit secondary_cpu_init(void)
788{
789 int cpu = smp_processor_id();
790 struct task_struct *curr = current;
791
792 _cpu_init(cpu, curr);
793}
794
795/*
796 * cpu_init() initializes state that is per-CPU. Some data is already
797 * initialized (naturally) in the bootstrap process, such as the GDT
798 * and IDT. We reload them nevertheless, this function acts as a
799 * 'CPU state barrier', nothing should get across.
800 */
801void __cpuinit cpu_init(void)
802{
803 int cpu = smp_processor_id();
804 struct task_struct *curr = current;
805
806 /* Set up the real GDT and PDA, so we can transition from the
807 boot versions. */
808 if (!init_gdt(cpu, curr)) {
809 /* failed to allocate something; not much we can do... */
810 for (;;)
811 local_irq_enable();
812 }
813
814 cpu_set_gdt(cpu);
815 _cpu_init(cpu, curr);
816}
817
818#ifdef CONFIG_HOTPLUG_CPU 723#ifdef CONFIG_HOTPLUG_CPU
819void __cpuinit cpu_uninit(void) 724void __cpuinit cpu_uninit(void)
820{ 725{
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index de27bd07bc9..0b8411a864f 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
279 */ 279 */
280 if (vendor == PCI_VENDOR_ID_CYRIX && 280 if (vendor == PCI_VENDOR_ID_CYRIX &&
281 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) 281 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
282 pit_latch_buggy = 1; 282 mark_tsc_unstable("cyrix 5510/5520 detected");
283 } 283 }
284#endif 284#endif
285 c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ 285 c->x86_cache_size=16; /* Yep 16K integrated cache thats it */
@@ -448,16 +448,6 @@ int __init cyrix_init_cpu(void)
448 return 0; 448 return 0;
449} 449}
450 450
451//early_arch_initcall(cyrix_init_cpu);
452
453static int __init cyrix_exit_cpu(void)
454{
455 cpu_devs[X86_VENDOR_CYRIX] = NULL;
456 return 0;
457}
458
459late_initcall(cyrix_exit_cpu);
460
461static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 451static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
462 .c_vendor = "NSC", 452 .c_vendor = "NSC",
463 .c_ident = { "Geode by NSC" }, 453 .c_ident = { "Geode by NSC" },
@@ -470,12 +460,3 @@ int __init nsc_init_cpu(void)
470 return 0; 460 return 0;
471} 461}
472 462
473//early_arch_initcall(nsc_init_cpu);
474
475static int __init nsc_exit_cpu(void)
476{
477 cpu_devs[X86_VENDOR_NSC] = NULL;
478 return 0;
479}
480
481late_initcall(nsc_exit_cpu);
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 56fe2658495..dc4e08147b1 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -188,8 +188,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
188 } 188 }
189#endif 189#endif
190 190
191 if (c->x86 == 15) 191 if (c->x86 == 15) {
192 set_bit(X86_FEATURE_P4, c->x86_capability); 192 set_bit(X86_FEATURE_P4, c->x86_capability);
193 set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
194 }
193 if (c->x86 == 6) 195 if (c->x86 == 6)
194 set_bit(X86_FEATURE_P3, c->x86_capability); 196 set_bit(X86_FEATURE_P3, c->x86_capability);
195 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 197 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
index b0862af595a..f9fa4142551 100644
--- a/arch/i386/kernel/cpu/mcheck/k7.c
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
75 machine_check_vector = k7_machine_check; 75 machine_check_vector = k7_machine_check;
76 wmb(); 76 wmb();
77 77
78 if (!cpu_has(c, X86_FEATURE_MCE))
79 return;
80
78 printk (KERN_INFO "Intel machine check architecture supported.\n"); 81 printk (KERN_INFO "Intel machine check architecture supported.\n");
79 rdmsr (MSR_IA32_MCG_CAP, l, h); 82 rdmsr (MSR_IA32_MCG_CAP, l, h);
80 if (l & (1<<8)) /* Control register present ? */ 83 if (l & (1<<8)) /* Control register present ? */
@@ -82,9 +85,13 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
82 nr_mce_banks = l & 0xff; 85 nr_mce_banks = l & 0xff;
83 86
84 /* Clear status for MC index 0 separately, we don't touch CTL, 87 /* Clear status for MC index 0 separately, we don't touch CTL,
85 * as some Athlons cause spurious MCEs when its enabled. */ 88 * as some K7 Athlons cause spurious MCEs when its enabled. */
86 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); 89 if (boot_cpu_data.x86 == 6) {
87 for (i=1; i<nr_mce_banks; i++) { 90 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
91 i = 1;
92 } else
93 i = 0;
94 for (; i<nr_mce_banks; i++) {
88 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 95 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
89 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 96 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
90 } 97 }
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 4f10c62d180..56cd485b127 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -38,8 +38,7 @@ void mcheck_init(struct cpuinfo_x86 *c)
38 38
39 switch (c->x86_vendor) { 39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD: 40 case X86_VENDOR_AMD:
41 if (c->x86==6 || c->x86==15) 41 amd_mcheck_init(c);
42 amd_mcheck_init(c);
43 break; 42 break;
44 43
45 case X86_VENDOR_INTEL: 44 case X86_VENDOR_INTEL:
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a4601..1509edfb231 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -124,13 +124,10 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
124 124
125 125
126/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 126/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
127static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 127static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
128{ 128{
129 u32 h; 129 u32 h;
130 130
131 if (mce_num_extended_msrs == 0)
132 goto done;
133
134 rdmsr (MSR_IA32_MCG_EAX, r->eax, h); 131 rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
135 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); 132 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
136 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); 133 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
@@ -141,12 +138,6 @@ static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
141 rdmsr (MSR_IA32_MCG_ESP, r->esp, h); 138 rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
142 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); 139 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
143 rdmsr (MSR_IA32_MCG_EIP, r->eip, h); 140 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
144
145 /* can we rely on kmalloc to do a dynamic
146 * allocation for the reserved registers?
147 */
148done:
149 return mce_num_extended_msrs;
150} 141}
151 142
152static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 143static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
@@ -155,7 +146,6 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
155 u32 alow, ahigh, high, low; 146 u32 alow, ahigh, high, low;
156 u32 mcgstl, mcgsth; 147 u32 mcgstl, mcgsth;
157 int i; 148 int i;
158 struct intel_mce_extended_msrs dbg;
159 149
160 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 150 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
161 if (mcgstl & (1<<0)) /* Recoverable ? */ 151 if (mcgstl & (1<<0)) /* Recoverable ? */
@@ -164,7 +154,9 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
164 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 154 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
165 smp_processor_id(), mcgsth, mcgstl); 155 smp_processor_id(), mcgsth, mcgstl);
166 156
167 if (intel_get_extended_msrs(&dbg)) { 157 if (mce_num_extended_msrs > 0) {
158 struct intel_mce_extended_msrs dbg;
159 intel_get_extended_msrs(&dbg);
168 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", 160 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
169 smp_processor_id(), dbg.eip, dbg.eflags); 161 smp_processor_id(), dbg.eip, dbg.eflags);
170 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", 162 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index f77fc53db65..5367e32e040 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -20,13 +20,25 @@ struct mtrr_state {
20 mtrr_type def_type; 20 mtrr_type def_type;
21}; 21};
22 22
23struct fixed_range_block {
24 int base_msr; /* start address of an MTRR block */
25 int ranges; /* number of MTRRs in this block */
26};
27
28static struct fixed_range_block fixed_range_blocks[] = {
29 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
30 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
31 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
32 {}
33};
34
23static unsigned long smp_changes_mask; 35static unsigned long smp_changes_mask;
24static struct mtrr_state mtrr_state = {}; 36static struct mtrr_state mtrr_state = {};
25 37
26#undef MODULE_PARAM_PREFIX 38#undef MODULE_PARAM_PREFIX
27#define MODULE_PARAM_PREFIX "mtrr." 39#define MODULE_PARAM_PREFIX "mtrr."
28 40
29static __initdata int mtrr_show; 41static int mtrr_show;
30module_param_named(show, mtrr_show, bool, 0); 42module_param_named(show, mtrr_show, bool, 0);
31 43
32/* Get the MSR pair relating to a var range */ 44/* Get the MSR pair relating to a var range */
@@ -37,7 +49,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
37 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 49 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
38} 50}
39 51
40static void __init 52static void
41get_fixed_ranges(mtrr_type * frs) 53get_fixed_ranges(mtrr_type * frs)
42{ 54{
43 unsigned int *p = (unsigned int *) frs; 55 unsigned int *p = (unsigned int *) frs;
@@ -51,12 +63,18 @@ get_fixed_ranges(mtrr_type * frs)
51 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 63 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
52} 64}
53 65
54static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) 66void mtrr_save_fixed_ranges(void *info)
67{
68 get_fixed_ranges(mtrr_state.fixed_ranges);
69}
70
71static void __cpuinit print_fixed(unsigned base, unsigned step, const mtrr_type*types)
55{ 72{
56 unsigned i; 73 unsigned i;
57 74
58 for (i = 0; i < 8; ++i, ++types, base += step) 75 for (i = 0; i < 8; ++i, ++types, base += step)
59 printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); 76 printk(KERN_INFO "MTRR %05X-%05X %s\n",
77 base, base + step - 1, mtrr_attrib_to_str(*types));
60} 78}
61 79
62/* Grab all of the MTRR state for this CPU into *state */ 80/* Grab all of the MTRR state for this CPU into *state */
@@ -147,6 +165,44 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
147 smp_processor_id(), msr, a, b); 165 smp_processor_id(), msr, a, b);
148} 166}
149 167
168/**
169 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
170 * see AMD publication no. 24593, chapter 3.2.1 for more information
171 */
172static inline void k8_enable_fixed_iorrs(void)
173{
174 unsigned lo, hi;
175
176 rdmsr(MSR_K8_SYSCFG, lo, hi);
177 mtrr_wrmsr(MSR_K8_SYSCFG, lo
178 | K8_MTRRFIXRANGE_DRAM_ENABLE
179 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
180}
181
182/**
183 * Checks and updates an fixed-range MTRR if it differs from the value it
184 * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also.
185 * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information
186 * \param msr MSR address of the MTTR which should be checked and updated
187 * \param changed pointer which indicates whether the MTRR needed to be changed
188 * \param msrwords pointer to the MSR values which the MSR should have
189 */
190static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
191{
192 unsigned lo, hi;
193
194 rdmsr(msr, lo, hi);
195
196 if (lo != msrwords[0] || hi != msrwords[1]) {
197 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
198 boot_cpu_data.x86 == 15 &&
199 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
200 k8_enable_fixed_iorrs();
201 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
202 *changed = TRUE;
203 }
204}
205
150int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) 206int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
151/* [SUMMARY] Get a free MTRR. 207/* [SUMMARY] Get a free MTRR.
152 <base> The starting (base) address of the region. 208 <base> The starting (base) address of the region.
@@ -196,36 +252,21 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
196 *type = base_lo & 0xff; 252 *type = base_lo & 0xff;
197} 253}
198 254
255/**
256 * Checks and updates the fixed-range MTRRs if they differ from the saved set
257 * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
258 */
199static int set_fixed_ranges(mtrr_type * frs) 259static int set_fixed_ranges(mtrr_type * frs)
200{ 260{
201 unsigned int *p = (unsigned int *) frs; 261 unsigned long long *saved = (unsigned long long *) frs;
202 int changed = FALSE; 262 int changed = FALSE;
203 int i; 263 int block=-1, range;
204 unsigned int lo, hi;
205 264
206 rdmsr(MTRRfix64K_00000_MSR, lo, hi); 265 while (fixed_range_blocks[++block].ranges)
207 if (p[0] != lo || p[1] != hi) { 266 for (range=0; range < fixed_range_blocks[block].ranges; range++)
208 mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 267 set_fixed_range(fixed_range_blocks[block].base_msr + range,
209 changed = TRUE; 268 &changed, (unsigned int *) saved++);
210 }
211 269
212 for (i = 0; i < 2; i++) {
213 rdmsr(MTRRfix16K_80000_MSR + i, lo, hi);
214 if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) {
215 mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2],
216 p[3 + i * 2]);
217 changed = TRUE;
218 }
219 }
220
221 for (i = 0; i < 8; i++) {
222 rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi);
223 if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) {
224 mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2],
225 p[7 + i * 2]);
226 changed = TRUE;
227 }
228 }
229 return changed; 270 return changed;
230} 271}
231 272
@@ -428,7 +469,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
428 } 469 }
429 } 470 }
430 471
431 if (base + size < 0x100) { 472 if (base < 0x100) {
432 printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", 473 printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
433 base, size); 474 base, size);
434 return -EINVAL; 475 return -EINVAL;
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 0acfb6a5a22..02a2f39e5e0 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -729,6 +729,17 @@ void mtrr_ap_init(void)
729 local_irq_restore(flags); 729 local_irq_restore(flags);
730} 730}
731 731
732/**
733 * Save current fixed-range MTRR state of the BSP
734 */
735void mtrr_save_state(void)
736{
737 if (smp_processor_id() == 0)
738 mtrr_save_fixed_ranges(NULL);
739 else
740 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
741}
742
732static int __init mtrr_init_finialize(void) 743static int __init mtrr_init_finialize(void)
733{ 744{
734 if (!mtrr_if) 745 if (!mtrr_if)
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
index 8bf23cc80c6..961fbe1a748 100644
--- a/arch/i386/kernel/cpu/nexgen.c
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -58,13 +58,3 @@ int __init nexgen_init_cpu(void)
58 cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; 58 cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
59 return 0; 59 return 0;
60} 60}
61
62//early_arch_initcall(nexgen_init_cpu);
63
64static int __init nexgen_exit_cpu(void)
65{
66 cpu_devs[X86_VENDOR_NEXGEN] = NULL;
67 return 0;
68}
69
70late_initcall(nexgen_exit_cpu);
diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 00000000000..2b04c8f1db6
--- /dev/null
+++ b/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,658 @@
1/* local apic based NMI watchdog for various CPUs.
2 This file also handles reservation of performance counters for coordination
3 with other users (like oprofile).
4
5 Note that these events normally don't tick when the CPU idles. This means
6 the frequency varies with CPU load.
7
8 Original code for K7/P6 written by Keith Owens */
9
10#include <linux/percpu.h>
11#include <linux/module.h>
12#include <linux/kernel.h>
13#include <linux/bitops.h>
14#include <linux/smp.h>
15#include <linux/nmi.h>
16#include <asm/apic.h>
17#include <asm/intel_arch_perfmon.h>
18
/* MSR addresses the perfctr NMI watchdog records for one CPU. */
struct nmi_watchdog_ctlblk {
	unsigned int cccr_msr;
	unsigned int perfctr_msr;	/* the MSR to reset in NMI handler */
	unsigned int evntsel_msr;	/* the MSR to select the events to handle */
};
24
25/* Interface defining a CPU specific perfctr watchdog */
26struct wd_ops {
27 int (*reserve)(void);
28 void (*unreserve)(void);
29 int (*setup)(unsigned nmi_hz);
30 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
31 void (*stop)(void *);
32 unsigned perfctr;
33 unsigned evntsel;
34 u64 checkbit;
35};
36
37static struct wd_ops *wd_ops;
38
/*
 * Size, in bits, of the perfctr/evntsel reservation bitmaps.
 *
 * This number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
 * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms
 * (for now).
 */
#define NMI_MAX_COUNTER_BITS 66
43
44/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
45 * evtsel_nmi_owner tracks the ownership of the event selection
46 * - different performance counters/ event selection may be reserved for
47 * different subsystems this reservation system just tries to coordinate
48 * things a little
49 */
50static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
51static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
52
53static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
54
55/* converts an msr to an appropriate reservation bit */
56static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
57{
58 return wd_ops ? msr - wd_ops->perfctr : 0;
59}
60
61/* converts an msr to an appropriate reservation bit */
62/* returns the bit offset of the event selection register */
63static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
64{
65 return wd_ops ? msr - wd_ops->evntsel : 0;
66}
67
68/* checks for a bit availability (hack for oprofile) */
69int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
70{
71 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
72
73 return (!test_bit(counter, perfctr_nmi_owner));
74}
75
76/* checks the an msr for availability */
77int avail_to_resrv_perfctr_nmi(unsigned int msr)
78{
79 unsigned int counter;
80
81 counter = nmi_perfctr_msr_to_bit(msr);
82 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
83
84 return (!test_bit(counter, perfctr_nmi_owner));
85}
86
87int reserve_perfctr_nmi(unsigned int msr)
88{
89 unsigned int counter;
90
91 counter = nmi_perfctr_msr_to_bit(msr);
92 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
93
94 if (!test_and_set_bit(counter, perfctr_nmi_owner))
95 return 1;
96 return 0;
97}
98
99void release_perfctr_nmi(unsigned int msr)
100{
101 unsigned int counter;
102
103 counter = nmi_perfctr_msr_to_bit(msr);
104 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
105
106 clear_bit(counter, perfctr_nmi_owner);
107}
108
109int reserve_evntsel_nmi(unsigned int msr)
110{
111 unsigned int counter;
112
113 counter = nmi_evntsel_msr_to_bit(msr);
114 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
115
116 if (!test_and_set_bit(counter, evntsel_nmi_owner))
117 return 1;
118 return 0;
119}
120
121void release_evntsel_nmi(unsigned int msr)
122{
123 unsigned int counter;
124
125 counter = nmi_evntsel_msr_to_bit(msr);
126 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
127
128 clear_bit(counter, evntsel_nmi_owner);
129}
130
/* Availability queries. */
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
/* Perfctr reservation. */
EXPORT_SYMBOL(reserve_perfctr_nmi);
EXPORT_SYMBOL(release_perfctr_nmi);
/* Event selection reservation. */
EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
137
138void disable_lapic_nmi_watchdog(void)
139{
140 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
141
142 if (atomic_read(&nmi_active) <= 0)
143 return;
144
145 on_each_cpu(wd_ops->stop, NULL, 0, 1);
146 wd_ops->unreserve();
147
148 BUG_ON(atomic_read(&nmi_active) != 0);
149}
150
151void enable_lapic_nmi_watchdog(void)
152{
153 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
154
155 /* are we already enabled */
156 if (atomic_read(&nmi_active) != 0)
157 return;
158
159 /* are we lapic aware */
160 if (!wd_ops)
161 return;
162 if (!wd_ops->reserve()) {
163 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
164 return;
165 }
166
167 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
168 touch_nmi_watchdog();
169}
170
171/*
172 * Activate the NMI watchdog via the local APIC.
173 */
174
175static unsigned int adjust_for_32bit_ctr(unsigned int hz)
176{
177 u64 counter_val;
178 unsigned int retval = hz;
179
180 /*
181 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
182 * are writable, with higher bits sign extending from bit 31.
183 * So, we can only program the counter with 31 bit values and
184 * 32nd bit should be 1, for 33.. to be 1.
185 * Find the appropriate nmi_hz
186 */
187 counter_val = (u64)cpu_khz * 1000;
188 do_div(counter_val, retval);
189 if (counter_val > 0x7fffffffULL) {
190 u64 count = (u64)cpu_khz * 1000;
191 do_div(count, 0x7fffffffUL);
192 retval = count + 1;
193 }
194 return retval;
195}
196
197static void
198write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
199{
200 u64 count = (u64)cpu_khz * 1000;
201
202 do_div(count, nmi_hz);
203 if(descr)
204 Dprintk("setting %s to -0x%08Lx\n", descr, count);
205 wrmsrl(perfctr_msr, 0 - count);
206}
207
208static void write_watchdog_counter32(unsigned int perfctr_msr,
209 const char *descr, unsigned nmi_hz)
210{
211 u64 count = (u64)cpu_khz * 1000;
212
213 do_div(count, nmi_hz);
214 if(descr)
215 Dprintk("setting %s to -0x%08Lx\n", descr, count);
216 wrmsr(perfctr_msr, (u32)(-count), 0);
217}
218
/*
 * AMD K7/K8/Family10h/Family11h support.  AMD keeps this interface nicely
 * stable so there is not much variety.
 */

/* Event selection bits used by the K7 watchdog setup. */
#define K7_EVNTSEL_ENABLE	(1 << 22)
#define K7_EVNTSEL_INT		(1 << 20)
#define K7_EVNTSEL_OS		(1 << 17)
#define K7_EVNTSEL_USR		(1 << 16)

/* Event used by the watchdog. */
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
228
229static int setup_k7_watchdog(unsigned nmi_hz)
230{
231 unsigned int perfctr_msr, evntsel_msr;
232 unsigned int evntsel;
233 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
234
235 perfctr_msr = MSR_K7_PERFCTR0;
236 evntsel_msr = MSR_K7_EVNTSEL0;
237
238 wrmsrl(perfctr_msr, 0UL);
239
240 evntsel = K7_EVNTSEL_INT
241 | K7_EVNTSEL_OS
242 | K7_EVNTSEL_USR
243 | K7_NMI_EVENT;
244
245 /* setup the timer */
246 wrmsr(evntsel_msr, evntsel, 0);
247 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
248 apic_write(APIC_LVTPC, APIC_DM_NMI);
249 evntsel |= K7_EVNTSEL_ENABLE;
250 wrmsr(evntsel_msr, evntsel, 0);
251
252 wd->perfctr_msr = perfctr_msr;
253 wd->evntsel_msr = evntsel_msr;
254 wd->cccr_msr = 0; //unused
255 return 1;
256}
257
258static void single_msr_stop_watchdog(void *arg)
259{
260 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
261
262 wrmsr(wd->evntsel_msr, 0, 0);
263}
264
265static int single_msr_reserve(void)
266{
267 if (!reserve_perfctr_nmi(wd_ops->perfctr))
268 return 0;
269
270 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
271 release_perfctr_nmi(wd_ops->perfctr);
272 return 0;
273 }
274 return 1;
275}
276
277static void single_msr_unreserve(void)
278{
279 release_evntsel_nmi(wd_ops->perfctr);
280 release_perfctr_nmi(wd_ops->evntsel);
281}
282
283static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
284{
285 /* start the cycle over again */
286 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
287}
288
289static struct wd_ops k7_wd_ops = {
290 .reserve = single_msr_reserve,
291 .unreserve = single_msr_unreserve,
292 .setup = setup_k7_watchdog,
293 .rearm = single_msr_rearm,
294 .stop = single_msr_stop_watchdog,
295 .perfctr = MSR_K7_PERFCTR0,
296 .evntsel = MSR_K7_EVNTSEL0,
297 .checkbit = 1ULL<<63,
298};
299
/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */

/* bits OR-ed into the event-select MSR by setup_p6_watchdog() */
#define P6_EVNTSEL0_ENABLE	(1 << 22)
#define P6_EVNTSEL_INT		(1 << 20)
#define P6_EVNTSEL_OS		(1 << 17)
#define P6_EVNTSEL_USR		(1 << 16)

/* event used as the watchdog time base */
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
308
309static int setup_p6_watchdog(unsigned nmi_hz)
310{
311 unsigned int perfctr_msr, evntsel_msr;
312 unsigned int evntsel;
313 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
314
315 perfctr_msr = MSR_P6_PERFCTR0;
316 evntsel_msr = MSR_P6_EVNTSEL0;
317
318 wrmsrl(perfctr_msr, 0UL);
319
320 evntsel = P6_EVNTSEL_INT
321 | P6_EVNTSEL_OS
322 | P6_EVNTSEL_USR
323 | P6_NMI_EVENT;
324
325 /* setup the timer */
326 wrmsr(evntsel_msr, evntsel, 0);
327 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
328 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
329 apic_write(APIC_LVTPC, APIC_DM_NMI);
330 evntsel |= P6_EVNTSEL0_ENABLE;
331 wrmsr(evntsel_msr, evntsel, 0);
332
333 wd->perfctr_msr = perfctr_msr;
334 wd->evntsel_msr = evntsel_msr;
335 wd->cccr_msr = 0; //unused
336 return 1;
337}
338
339static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
340{
341 /* P6 based Pentium M need to re-unmask
342 * the apic vector but it doesn't hurt
343 * other P6 variant.
344 * ArchPerfom/Core Duo also needs this */
345 apic_write(APIC_LVTPC, APIC_DM_NMI);
346 /* P6/ARCH_PERFMON has 32 bit counter write */
347 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
348}
349
350static struct wd_ops p6_wd_ops = {
351 .reserve = single_msr_reserve,
352 .unreserve = single_msr_unreserve,
353 .setup = setup_p6_watchdog,
354 .rearm = p6_rearm,
355 .stop = single_msr_stop_watchdog,
356 .perfctr = MSR_P6_PERFCTR0,
357 .evntsel = MSR_P6_EVNTSEL0,
358 .checkbit = 1ULL<<39,
359};
360
/* Intel P4 performance counters. By far the most complicated of all. */

#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)

/* ESCR (event selection control register) fields */
#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
#define P4_ESCR_OS		(1<<3)
#define P4_ESCR_USR		(1<<2)

/* CCCR (counter configuration control register) fields */
#define P4_CCCR_OVF_PMI0	(1<<26)
#define P4_CCCR_OVF_PMI1	(1<<27)
#define P4_CCCR_THRESHOLD(N)	((N)<<20)
#define P4_CCCR_COMPLEMENT	(1<<19)
#define P4_CCCR_COMPARE		(1<<18)
#define P4_CCCR_REQUIRED	(3<<16)
#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
#define P4_CCCR_ENABLE		(1<<12)
/* unsigned constant: shifting a signed 1 into bit 31 is undefined in C */
#define P4_CCCR_OVF		(1U<<31)
376
377/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
378 CRU_ESCR0 (with any non-null event selector) through a complemented
379 max threshold. [IA32-Vol3, Section 14.9.9] */
380
381static int setup_p4_watchdog(unsigned nmi_hz)
382{
383 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
384 unsigned int evntsel, cccr_val;
385 unsigned int misc_enable, dummy;
386 unsigned int ht_num;
387 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
388
389 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
390 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
391 return 0;
392
393#ifdef CONFIG_SMP
394 /* detect which hyperthread we are on */
395 if (smp_num_siblings == 2) {
396 unsigned int ebx, apicid;
397
398 ebx = cpuid_ebx(1);
399 apicid = (ebx >> 24) & 0xff;
400 ht_num = apicid & 1;
401 } else
402#endif
403 ht_num = 0;
404
405 /* performance counters are shared resources
406 * assign each hyperthread its own set
407 * (re-use the ESCR0 register, seems safe
408 * and keeps the cccr_val the same)
409 */
410 if (!ht_num) {
411 /* logical cpu 0 */
412 perfctr_msr = MSR_P4_IQ_PERFCTR0;
413 evntsel_msr = MSR_P4_CRU_ESCR0;
414 cccr_msr = MSR_P4_IQ_CCCR0;
415 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
416 } else {
417 /* logical cpu 1 */
418 perfctr_msr = MSR_P4_IQ_PERFCTR1;
419 evntsel_msr = MSR_P4_CRU_ESCR0;
420 cccr_msr = MSR_P4_IQ_CCCR1;
421 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
422 }
423
424 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
425 | P4_ESCR_OS
426 | P4_ESCR_USR;
427
428 cccr_val |= P4_CCCR_THRESHOLD(15)
429 | P4_CCCR_COMPLEMENT
430 | P4_CCCR_COMPARE
431 | P4_CCCR_REQUIRED;
432
433 wrmsr(evntsel_msr, evntsel, 0);
434 wrmsr(cccr_msr, cccr_val, 0);
435 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
436 apic_write(APIC_LVTPC, APIC_DM_NMI);
437 cccr_val |= P4_CCCR_ENABLE;
438 wrmsr(cccr_msr, cccr_val, 0);
439 wd->perfctr_msr = perfctr_msr;
440 wd->evntsel_msr = evntsel_msr;
441 wd->cccr_msr = cccr_msr;
442 return 1;
443}
444
445static void stop_p4_watchdog(void *arg)
446{
447 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
448 wrmsr(wd->cccr_msr, 0, 0);
449 wrmsr(wd->evntsel_msr, 0, 0);
450}
451
452static int p4_reserve(void)
453{
454 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
455 return 0;
456#ifdef CONFIG_SMP
457 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
458 goto fail1;
459#endif
460 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
461 goto fail2;
462 /* RED-PEN why is ESCR1 not reserved here? */
463 return 1;
464 fail2:
465#ifdef CONFIG_SMP
466 if (smp_num_siblings > 1)
467 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
468 fail1:
469#endif
470 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
471 return 0;
472}
473
474static void p4_unreserve(void)
475{
476#ifdef CONFIG_SMP
477 if (smp_num_siblings > 1)
478 release_evntsel_nmi(MSR_P4_IQ_PERFCTR1);
479#endif
480 release_evntsel_nmi(MSR_P4_IQ_PERFCTR0);
481 release_perfctr_nmi(MSR_P4_CRU_ESCR0);
482}
483
484static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
485{
486 unsigned dummy;
487 /*
488 * P4 quirks:
489 * - An overflown perfctr will assert its interrupt
490 * until the OVF flag in its CCCR is cleared.
491 * - LVTPC is masked on interrupt and must be
492 * unmasked by the LVTPC handler.
493 */
494 rdmsrl(wd->cccr_msr, dummy);
495 dummy &= ~P4_CCCR_OVF;
496 wrmsrl(wd->cccr_msr, dummy);
497 apic_write(APIC_LVTPC, APIC_DM_NMI);
498 /* start the cycle over again */
499 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
500}
501
502static struct wd_ops p4_wd_ops = {
503 .reserve = p4_reserve,
504 .unreserve = p4_unreserve,
505 .setup = setup_p4_watchdog,
506 .rearm = p4_rearm,
507 .stop = stop_p4_watchdog,
508 /* RED-PEN this is wrong for the other sibling */
509 .perfctr = MSR_P4_BPU_PERFCTR0,
510 .evntsel = MSR_P4_BSU_ESCR0,
511 .checkbit = 1ULL<<39,
512};
513
/*
 * Watchdog using the Intel architected PerfMon. Used for Core2 and
 * hopefully all future Intel CPUs.
 */

/* event select / unit mask programmed by setup_intel_arch_watchdog() */
#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
519
520static int setup_intel_arch_watchdog(unsigned nmi_hz)
521{
522 unsigned int ebx;
523 union cpuid10_eax eax;
524 unsigned int unused;
525 unsigned int perfctr_msr, evntsel_msr;
526 unsigned int evntsel;
527 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
528
529 /*
530 * Check whether the Architectural PerfMon supports
531 * Unhalted Core Cycles Event or not.
532 * NOTE: Corresponding bit = 0 in ebx indicates event present.
533 */
534 cpuid(10, &(eax.full), &ebx, &unused, &unused);
535 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
536 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
537 return 0;
538
539 perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1;
540 evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1;
541
542 wrmsrl(perfctr_msr, 0UL);
543
544 evntsel = ARCH_PERFMON_EVENTSEL_INT
545 | ARCH_PERFMON_EVENTSEL_OS
546 | ARCH_PERFMON_EVENTSEL_USR
547 | ARCH_PERFMON_NMI_EVENT_SEL
548 | ARCH_PERFMON_NMI_EVENT_UMASK;
549
550 /* setup the timer */
551 wrmsr(evntsel_msr, evntsel, 0);
552 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
553 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
554 apic_write(APIC_LVTPC, APIC_DM_NMI);
555 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
556 wrmsr(evntsel_msr, evntsel, 0);
557
558 wd->perfctr_msr = perfctr_msr;
559 wd->evntsel_msr = evntsel_msr;
560 wd->cccr_msr = 0; //unused
561 wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1);
562 return 1;
563}
564
565static struct wd_ops intel_arch_wd_ops = {
566 .reserve = single_msr_reserve,
567 .unreserve = single_msr_unreserve,
568 .setup = setup_intel_arch_watchdog,
569 .rearm = p6_rearm,
570 .stop = single_msr_stop_watchdog,
571 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
572 .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
573};
574
575static void probe_nmi_watchdog(void)
576{
577 switch (boot_cpu_data.x86_vendor) {
578 case X86_VENDOR_AMD:
579 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
580 boot_cpu_data.x86 != 16)
581 return;
582 wd_ops = &k7_wd_ops;
583 break;
584 case X86_VENDOR_INTEL:
585 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
586 wd_ops = &intel_arch_wd_ops;
587 break;
588 }
589 switch (boot_cpu_data.x86) {
590 case 6:
591 if (boot_cpu_data.x86_model > 0xd)
592 return;
593
594 wd_ops = &p6_wd_ops;
595 break;
596 case 15:
597 if (boot_cpu_data.x86_model > 0x4)
598 return;
599
600 wd_ops = &p4_wd_ops;
601 break;
602 default:
603 return;
604 }
605 break;
606 }
607}
608
609/* Interface to nmi.c */
610
611int lapic_watchdog_init(unsigned nmi_hz)
612{
613 if (!wd_ops) {
614 probe_nmi_watchdog();
615 if (!wd_ops)
616 return -1;
617 }
618
619 if (!(wd_ops->setup(nmi_hz))) {
620 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
621 raw_smp_processor_id());
622 return -1;
623 }
624
625 return 0;
626}
627
628void lapic_watchdog_stop(void)
629{
630 if (wd_ops)
631 wd_ops->stop(NULL);
632}
633
634unsigned lapic_adjust_nmi_hz(unsigned hz)
635{
636 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
637 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
638 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
639 hz = adjust_for_32bit_ctr(hz);
640 return hz;
641}
642
643int lapic_wd_event(unsigned nmi_hz)
644{
645 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
646 u64 ctr;
647 rdmsrl(wd->perfctr_msr, ctr);
648 if (ctr & wd_ops->checkbit) { /* perfctr still running? */
649 return 0;
650 }
651 wd_ops->rearm(wd, nmi_hz);
652 return 1;
653}
654
655int lapic_watchdog_ok(void)
656{
657 return wd_ops != NULL;
658}
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 47e3ebbfb28..89d91e6cc97 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
72 "stc", 72 "stc",
73 "100mhzsteps", 73 "100mhzsteps",
74 "hwpstate", 74 "hwpstate",
75 NULL, 75 "", /* constant_tsc - moved to flags */
76 NULL, /* constant_tsc - moved to flags */
77 /* nothing */ 76 /* nothing */
78 }; 77 };
79 struct cpuinfo_x86 *c = v; 78 struct cpuinfo_x86 *c = v;
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
index 9317f741498..50076f22e90 100644
--- a/arch/i386/kernel/cpu/rise.c
+++ b/arch/i386/kernel/cpu/rise.c
@@ -50,12 +50,3 @@ int __init rise_init_cpu(void)
50 return 0; 50 return 0;
51} 51}
52 52
53//early_arch_initcall(rise_init_cpu);
54
55static int __init rise_exit_cpu(void)
56{
57 cpu_devs[X86_VENDOR_RISE] = NULL;
58 return 0;
59}
60
61late_initcall(rise_exit_cpu);
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 5678d46863c..6471a5a1320 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -112,13 +112,3 @@ int __init transmeta_init_cpu(void)
112 cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev; 112 cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
113 return 0; 113 return 0;
114} 114}
115
116//early_arch_initcall(transmeta_init_cpu);
117
118static int __init transmeta_exit_cpu(void)
119{
120 cpu_devs[X86_VENDOR_TRANSMETA] = NULL;
121 return 0;
122}
123
124late_initcall(transmeta_exit_cpu);
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
index 1bf3f87e9c5..a7a4e75bdcd 100644
--- a/arch/i386/kernel/cpu/umc.c
+++ b/arch/i386/kernel/cpu/umc.c
@@ -24,13 +24,3 @@ int __init umc_init_cpu(void)
24 cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev; 24 cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
25 return 0; 25 return 0;
26} 26}
27
28//early_arch_initcall(umc_init_cpu);
29
30static int __init umc_exit_cpu(void)
31{
32 cpu_devs[X86_VENDOR_UMC] = NULL;
33 return 0;
34}
35
36late_initcall(umc_exit_cpu);
diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c
index b4d14c2eb34..265c5597efb 100644
--- a/arch/i386/kernel/doublefault.c
+++ b/arch/i386/kernel/doublefault.c
@@ -33,7 +33,7 @@ static void doublefault_fn(void)
33 printk("double fault, tss at %08lx\n", tss); 33 printk("double fault, tss at %08lx\n", tss);
34 34
35 if (ptr_ok(tss)) { 35 if (ptr_ok(tss)) {
36 struct tss_struct *t = (struct tss_struct *)tss; 36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
37 37
38 printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp); 38 printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp);
39 39
@@ -49,18 +49,21 @@ static void doublefault_fn(void)
49} 49}
50 50
51struct tss_struct doublefault_tss __cacheline_aligned = { 51struct tss_struct doublefault_tss __cacheline_aligned = {
52 .esp0 = STACK_START, 52 .x86_tss = {
53 .ss0 = __KERNEL_DS, 53 .esp0 = STACK_START,
54 .ldt = 0, 54 .ss0 = __KERNEL_DS,
55 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, 55 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
56 57
57 .eip = (unsigned long) doublefault_fn, 58 .eip = (unsigned long) doublefault_fn,
58 .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */ 59 /* 0x2 bit is always set */
59 .esp = STACK_START, 60 .eflags = X86_EFLAGS_SF | 0x2,
60 .es = __USER_DS, 61 .esp = STACK_START,
61 .cs = __KERNEL_CS, 62 .es = __USER_DS,
62 .ss = __KERNEL_DS, 63 .cs = __KERNEL_CS,
63 .ds = __USER_DS, 64 .ss = __KERNEL_DS,
65 .ds = __USER_DS,
64 66
65 .__cr3 = __pa(swapper_pg_dir) 67 .__cr3 = __pa(swapper_pg_dir)
68 }
66}; 69};
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 70f39560846..9645bb51f76 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -161,26 +161,27 @@ static struct resource standard_io_resources[] = { {
161 161
162static int __init romsignature(const unsigned char *rom) 162static int __init romsignature(const unsigned char *rom)
163{ 163{
164 const unsigned short * const ptr = (const unsigned short *)rom;
164 unsigned short sig; 165 unsigned short sig;
165 166
166 return probe_kernel_address((const unsigned short *)rom, sig) == 0 && 167 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
167 sig == ROMSIGNATURE;
168} 168}
169 169
170static int __init romchecksum(unsigned char *rom, unsigned long length) 170static int __init romchecksum(const unsigned char *rom, unsigned long length)
171{ 171{
172 unsigned char sum; 172 unsigned char sum, c;
173 173
174 for (sum = 0; length; length--) 174 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
175 sum += *rom++; 175 sum += c;
176 return sum == 0; 176 return !length && !sum;
177} 177}
178 178
179static void __init probe_roms(void) 179static void __init probe_roms(void)
180{ 180{
181 const unsigned char *rom;
181 unsigned long start, length, upper; 182 unsigned long start, length, upper;
182 unsigned char *rom; 183 unsigned char c;
183 int i; 184 int i;
184 185
185 /* video rom */ 186 /* video rom */
186 upper = adapter_rom_resources[0].start; 187 upper = adapter_rom_resources[0].start;
@@ -191,8 +192,11 @@ static void __init probe_roms(void)
191 192
192 video_rom_resource.start = start; 193 video_rom_resource.start = start;
193 194
195 if (probe_kernel_address(rom + 2, c) != 0)
196 continue;
197
194 /* 0 < length <= 0x7f * 512, historically */ 198 /* 0 < length <= 0x7f * 512, historically */
195 length = rom[2] * 512; 199 length = c * 512;
196 200
197 /* if checksum okay, trust length byte */ 201 /* if checksum okay, trust length byte */
198 if (length && romchecksum(rom, length)) 202 if (length && romchecksum(rom, length))
@@ -226,8 +230,11 @@ static void __init probe_roms(void)
226 if (!romsignature(rom)) 230 if (!romsignature(rom))
227 continue; 231 continue;
228 232
233 if (probe_kernel_address(rom + 2, c) != 0)
234 continue;
235
229 /* 0 < length <= 0x7f * 512, historically */ 236 /* 0 < length <= 0x7f * 512, historically */
230 length = rom[2] * 512; 237 length = c * 512;
231 238
232 /* but accept any length that fits if checksum okay */ 239 /* but accept any length that fits if checksum okay */
233 if (!length || start + length > upper || !romchecksum(rom, length)) 240 if (!length || start + length > upper || !romchecksum(rom, length))
@@ -386,10 +393,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
386 ____________________33__ 393 ____________________33__
387 ______________________4_ 394 ______________________4_
388 */ 395 */
389 printk("sanitize start\n");
390 /* if there's only one memory region, don't bother */ 396 /* if there's only one memory region, don't bother */
391 if (*pnr_map < 2) { 397 if (*pnr_map < 2) {
392 printk("sanitize bail 0\n");
393 return -1; 398 return -1;
394 } 399 }
395 400
@@ -398,7 +403,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
398 /* bail out if we find any unreasonable addresses in bios map */ 403 /* bail out if we find any unreasonable addresses in bios map */
399 for (i=0; i<old_nr; i++) 404 for (i=0; i<old_nr; i++)
400 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { 405 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
401 printk("sanitize bail 1\n");
402 return -1; 406 return -1;
403 } 407 }
404 408
@@ -494,7 +498,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
494 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); 498 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
495 *pnr_map = new_nr; 499 *pnr_map = new_nr;
496 500
497 printk("sanitize end\n");
498 return 0; 501 return 0;
499} 502}
500 503
@@ -525,7 +528,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
525 unsigned long long size = biosmap->size; 528 unsigned long long size = biosmap->size;
526 unsigned long long end = start + size; 529 unsigned long long end = start + size;
527 unsigned long type = biosmap->type; 530 unsigned long type = biosmap->type;
528 printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
529 531
530 /* Overflow in 64 bits? Ignore the memory map. */ 532 /* Overflow in 64 bits? Ignore the memory map. */
531 if (start > end) 533 if (start > end)
@@ -536,17 +538,11 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
536 * Not right. Fix it up. 538 * Not right. Fix it up.
537 */ 539 */
538 if (type == E820_RAM) { 540 if (type == E820_RAM) {
539 printk("copy_e820_map() type is E820_RAM\n");
540 if (start < 0x100000ULL && end > 0xA0000ULL) { 541 if (start < 0x100000ULL && end > 0xA0000ULL) {
541 printk("copy_e820_map() lies in range...\n"); 542 if (start < 0xA0000ULL)
542 if (start < 0xA0000ULL) {
543 printk("copy_e820_map() start < 0xA0000ULL\n");
544 add_memory_region(start, 0xA0000ULL-start, type); 543 add_memory_region(start, 0xA0000ULL-start, type);
545 } 544 if (end <= 0x100000ULL)
546 if (end <= 0x100000ULL) {
547 printk("copy_e820_map() end <= 0x100000ULL\n");
548 continue; 545 continue;
549 }
550 start = 0x100000ULL; 546 start = 0x100000ULL;
551 size = end - start; 547 size = end - start;
552 } 548 }
@@ -818,6 +814,26 @@ void __init limit_regions(unsigned long long size)
818 print_memory_map("limit_regions endfunc"); 814 print_memory_map("limit_regions endfunc");
819} 815}
820 816
817/*
818 * This function checks if any part of the range <start,end> is mapped
819 * with type.
820 */
821int
822e820_any_mapped(u64 start, u64 end, unsigned type)
823{
824 int i;
825 for (i = 0; i < e820.nr_map; i++) {
826 const struct e820entry *ei = &e820.map[i];
827 if (type && ei->type != type)
828 continue;
829 if (ei->addr >= end || ei->addr + ei->size <= start)
830 continue;
831 return 1;
832 }
833 return 0;
834}
835EXPORT_SYMBOL_GPL(e820_any_mapped);
836
821 /* 837 /*
822 * This function checks if the entire range <start,end> is mapped with type. 838 * This function checks if the entire range <start,end> is mapped with type.
823 * 839 *
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8f9c624ace6..dd9e7faafa7 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -69,13 +69,11 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
69{ 69{
70 unsigned long cr4; 70 unsigned long cr4;
71 unsigned long temp; 71 unsigned long temp;
72 struct Xgt_desc_struct *cpu_gdt_descr; 72 struct Xgt_desc_struct gdt_descr;
73 73
74 spin_lock(&efi_rt_lock); 74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags); 75 local_irq_save(efi_rt_eflags);
76 76
77 cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
78
79 /* 77 /*
80 * If I don't have PSE, I should just duplicate two entries in page 78 * If I don't have PSE, I should just duplicate two entries in page
81 * directory. If I have PSE, I just need to duplicate one entry in 79 * directory. If I have PSE, I just need to duplicate one entry in
@@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
105 */ 103 */
106 local_flush_tlb(); 104 local_flush_tlb();
107 105
108 cpu_gdt_descr->address = __pa(cpu_gdt_descr->address); 106 gdt_descr.address = __pa(get_cpu_gdt_table(0));
109 load_gdt(cpu_gdt_descr); 107 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr);
110} 109}
111 110
112static void efi_call_phys_epilog(void) __releases(efi_rt_lock) 111static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
113{ 112{
114 unsigned long cr4; 113 unsigned long cr4;
115 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); 114 struct Xgt_desc_struct gdt_descr;
116 115
117 cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); 116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
118 load_gdt(cpu_gdt_descr); 117 gdt_descr.size = GDT_SIZE - 1;
118 load_gdt(&gdt_descr);
119 119
120 cr4 = read_cr4(); 120 cr4 = read_cr4();
121 121
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 18bddcb8e9e..b1f16ee65e4 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -15,7 +15,7 @@
15 * I changed all the .align's to 4 (16 byte alignment), as that's faster 15 * I changed all the .align's to 4 (16 byte alignment), as that's faster
16 * on a 486. 16 * on a 486.
17 * 17 *
18 * Stack layout in 'ret_from_system_call': 18 * Stack layout in 'syscall_exit':
19 * ptrace needs to have all regs on the stack. 19 * ptrace needs to have all regs on the stack.
20 * if the order here is changed, it needs to be 20 * if the order here is changed, it needs to be
21 * updated in fork.c:copy_process, signal.c:do_signal, 21 * updated in fork.c:copy_process, signal.c:do_signal,
@@ -132,7 +132,7 @@ VM_MASK = 0x00020000
132 movl $(__USER_DS), %edx; \ 132 movl $(__USER_DS), %edx; \
133 movl %edx, %ds; \ 133 movl %edx, %ds; \
134 movl %edx, %es; \ 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \ 135 movl $(__KERNEL_PERCPU), %edx; \
136 movl %edx, %fs 136 movl %edx, %fs
137 137
138#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
@@ -305,16 +305,12 @@ sysenter_past_esp:
305 pushl $(__USER_CS) 305 pushl $(__USER_CS)
306 CFI_ADJUST_CFA_OFFSET 4 306 CFI_ADJUST_CFA_OFFSET 4
307 /*CFI_REL_OFFSET cs, 0*/ 307 /*CFI_REL_OFFSET cs, 0*/
308#ifndef CONFIG_COMPAT_VDSO
309 /* 308 /*
310 * Push current_thread_info()->sysenter_return to the stack. 309 * Push current_thread_info()->sysenter_return to the stack.
311 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 310 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
312 * pushed above; +8 corresponds to copy_thread's esp0 setting. 311 * pushed above; +8 corresponds to copy_thread's esp0 setting.
313 */ 312 */
314 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) 313 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
315#else
316 pushl $SYSENTER_RETURN
317#endif
318 CFI_ADJUST_CFA_OFFSET 4 314 CFI_ADJUST_CFA_OFFSET 4
319 CFI_REL_OFFSET eip, 0 315 CFI_REL_OFFSET eip, 0
320 316
@@ -342,7 +338,7 @@ sysenter_past_esp:
342 jae syscall_badsys 338 jae syscall_badsys
343 call *sys_call_table(,%eax,4) 339 call *sys_call_table(,%eax,4)
344 movl %eax,PT_EAX(%esp) 340 movl %eax,PT_EAX(%esp)
345 DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) 341 DISABLE_INTERRUPTS(CLBR_ANY)
346 TRACE_IRQS_OFF 342 TRACE_IRQS_OFF
347 movl TI_flags(%ebp), %ecx 343 movl TI_flags(%ebp), %ecx
348 testw $_TIF_ALLWORK_MASK, %cx 344 testw $_TIF_ALLWORK_MASK, %cx
@@ -560,9 +556,7 @@ END(syscall_badsys)
560 556
561#define FIXUP_ESPFIX_STACK \ 557#define FIXUP_ESPFIX_STACK \
562 /* since we are on a wrong stack, we cant make it a C code :( */ \ 558 /* since we are on a wrong stack, we cant make it a C code :( */ \
563 movl %fs:PDA_cpu, %ebx; \ 559 PER_CPU(gdt_page, %ebx); \
564 PER_CPU(cpu_gdt_descr, %ebx); \
565 movl GDS_address(%ebx), %ebx; \
566 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 560 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
567 addl %esp, %eax; \ 561 addl %esp, %eax; \
568 pushl $__KERNEL_DS; \ 562 pushl $__KERNEL_DS; \
@@ -635,7 +629,7 @@ ENTRY(name) \
635 SAVE_ALL; \ 629 SAVE_ALL; \
636 TRACE_IRQS_OFF \ 630 TRACE_IRQS_OFF \
637 movl %esp,%eax; \ 631 movl %esp,%eax; \
638 call smp_/**/name; \ 632 call smp_##name; \
639 jmp ret_from_intr; \ 633 jmp ret_from_intr; \
640 CFI_ENDPROC; \ 634 CFI_ENDPROC; \
641ENDPROC(name) 635ENDPROC(name)
@@ -643,11 +637,6 @@ ENDPROC(name)
643/* The include is where all of the SMP etc. interrupts come from */ 637/* The include is where all of the SMP etc. interrupts come from */
644#include "entry_arch.h" 638#include "entry_arch.h"
645 639
646/* This alternate entry is needed because we hijack the apic LVTT */
647#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
648BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
649#endif
650
651KPROBE_ENTRY(page_fault) 640KPROBE_ENTRY(page_fault)
652 RING0_EC_FRAME 641 RING0_EC_FRAME
653 pushl $do_page_fault 642 pushl $do_page_fault
@@ -686,7 +675,7 @@ error_code:
686 pushl %fs 675 pushl %fs
687 CFI_ADJUST_CFA_OFFSET 4 676 CFI_ADJUST_CFA_OFFSET 4
688 /*CFI_REL_OFFSET fs, 0*/ 677 /*CFI_REL_OFFSET fs, 0*/
689 movl $(__KERNEL_PDA), %ecx 678 movl $(__KERNEL_PERCPU), %ecx
690 movl %ecx, %fs 679 movl %ecx, %fs
691 UNWIND_ESPFIX_STACK 680 UNWIND_ESPFIX_STACK
692 popl %ecx 681 popl %ecx
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 3fa7f9389af..9b10af65faa 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -34,17 +34,32 @@
34 34
35/* 35/*
36 * This is how much memory *in addition to the memory covered up to 36 * This is how much memory *in addition to the memory covered up to
37 * and including _end* we need mapped initially. We need one bit for 37 * and including _end* we need mapped initially.
38 * each possible page, but only in low memory, which means 38 * We need:
39 * 2^32/4096/8 = 128K worst case (4G/4G split.) 39 * - one bit for each possible page, but only in low memory, which means
40 * 2^32/4096/8 = 128K worst case (4G/4G split.)
41 * - enough space to map all low memory, which means
42 * (2^32/4096) / 1024 pages (worst case, non PAE)
43 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
44 * - a few pages for allocator use before the kernel pagetable has
45 * been set up
40 * 46 *
41 * Modulo rounding, each megabyte assigned here requires a kilobyte of 47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
42 * memory, which is currently unreclaimed. 48 * memory, which is currently unreclaimed.
43 * 49 *
44 * This should be a multiple of a page. 50 * This should be a multiple of a page.
45 */ 51 */
46#define INIT_MAP_BEYOND_END (128*1024) 52LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
47 53
54#if PTRS_PER_PMD > 1
55PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
56#else
57PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
58#endif
59BOOTBITMAP_SIZE = LOW_PAGES / 8
60ALLOCATOR_SLOP = 4
61
62INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
48 63
49/* 64/*
50 * 32-bit kernel entrypoint; only used by the boot CPU. On entry, 65 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -147,8 +162,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
147/* 162/*
148 * Non-boot CPU entry point; entered from trampoline.S 163 * Non-boot CPU entry point; entered from trampoline.S
149 * We can't lgdt here, because lgdt itself uses a data segment, but 164 * We can't lgdt here, because lgdt itself uses a data segment, but
150 * we know the trampoline has already loaded the boot_gdt_table GDT 165 * we know the trampoline has already loaded the boot_gdt for us.
151 * for us.
152 * 166 *
153 * If cpu hotplug is not supported then this code can go in init section 167 * If cpu hotplug is not supported then this code can go in init section
154 * which will be freed later 168 * which will be freed later
@@ -318,12 +332,12 @@ is386: movl $2,%ecx # set MP
318 movl %eax,%cr0 332 movl %eax,%cr0
319 333
320 call check_x87 334 call check_x87
321 call setup_pda
322 lgdt early_gdt_descr 335 lgdt early_gdt_descr
323 lidt idt_descr 336 lidt idt_descr
324 ljmp $(__KERNEL_CS),$1f 337 ljmp $(__KERNEL_CS),$1f
3251: movl $(__KERNEL_DS),%eax # reload all the segment registers 3381: movl $(__KERNEL_DS),%eax # reload all the segment registers
326 movl %eax,%ss # after changing gdt. 339 movl %eax,%ss # after changing gdt.
340 movl %eax,%fs # gets reset once there's real percpu
327 341
328 movl $(__USER_DS),%eax # DS/ES contains default USER segment 342 movl $(__USER_DS),%eax # DS/ES contains default USER segment
329 movl %eax,%ds 343 movl %eax,%ds
@@ -333,16 +347,17 @@ is386: movl $2,%ecx # set MP
333 movl %eax,%gs 347 movl %eax,%gs
334 lldt %ax 348 lldt %ax
335 349
336 movl $(__KERNEL_PDA),%eax
337 mov %eax,%fs
338
339 cld # gcc2 wants the direction flag cleared at all times 350 cld # gcc2 wants the direction flag cleared at all times
340 pushl $0 # fake return address for unwinder 351 pushl $0 # fake return address for unwinder
341#ifdef CONFIG_SMP 352#ifdef CONFIG_SMP
342 movb ready, %cl 353 movb ready, %cl
343 movb $1, ready 354 movb $1, ready
344 cmpb $0,%cl # the first CPU calls start_kernel 355 cmpb $0,%cl # the first CPU calls start_kernel
345 jne initialize_secondary # all other CPUs call initialize_secondary 356 je 1f
357 movl $(__KERNEL_PERCPU), %eax
358 movl %eax,%fs # set this cpu's percpu
359 jmp initialize_secondary # all other CPUs call initialize_secondary
3601:
346#endif /* CONFIG_SMP */ 361#endif /* CONFIG_SMP */
347 jmp start_kernel 362 jmp start_kernel
348 363
@@ -366,23 +381,6 @@ check_x87:
366 ret 381 ret
367 382
368/* 383/*
369 * Point the GDT at this CPU's PDA. On boot this will be
370 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
371 * that CPU's GDT and PDA.
372 */
373ENTRY(setup_pda)
374 /* get the PDA pointer */
375 movl start_pda, %eax
376
377 /* slot the PDA address into the GDT */
378 mov early_gdt_descr+2, %ecx
379 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
380 shr $16, %eax
381 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
382 mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
383 ret
384
385/*
386 * setup_idt 384 * setup_idt
387 * 385 *
388 * sets up a idt with 256 entries pointing to 386 * sets up a idt with 256 entries pointing to
@@ -554,9 +552,6 @@ ENTRY(empty_zero_page)
554 * This starts the data section. 552 * This starts the data section.
555 */ 553 */
556.data 554.data
557ENTRY(start_pda)
558 .long boot_pda
559
560ENTRY(stack_start) 555ENTRY(stack_start)
561 .long init_thread_union+THREAD_SIZE 556 .long init_thread_union+THREAD_SIZE
562 .long __BOOT_DS 557 .long __BOOT_DS
@@ -588,7 +583,7 @@ fault_msg:
588 .word 0 # 32 bit align gdt_desc.address 583 .word 0 # 32 bit align gdt_desc.address
589boot_gdt_descr: 584boot_gdt_descr:
590 .word __BOOT_DS+7 585 .word __BOOT_DS+7
591 .long boot_gdt_table - __PAGE_OFFSET 586 .long boot_gdt - __PAGE_OFFSET
592 587
593 .word 0 # 32-bit align idt_desc.address 588 .word 0 # 32-bit align idt_desc.address
594idt_descr: 589idt_descr:
@@ -599,67 +594,14 @@ idt_descr:
599 .word 0 # 32 bit align gdt_desc.address 594 .word 0 # 32 bit align gdt_desc.address
600ENTRY(early_gdt_descr) 595ENTRY(early_gdt_descr)
601 .word GDT_ENTRIES*8-1 596 .word GDT_ENTRIES*8-1
602 .long cpu_gdt_table 597 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
603 598
604/* 599/*
605 * The boot_gdt_table must mirror the equivalent in setup.S and is 600 * The boot_gdt must mirror the equivalent in setup.S and is
606 * used only for booting. 601 * used only for booting.
607 */ 602 */
608 .align L1_CACHE_BYTES 603 .align L1_CACHE_BYTES
609ENTRY(boot_gdt_table) 604ENTRY(boot_gdt)
610 .fill GDT_ENTRY_BOOT_CS,8,0 605 .fill GDT_ENTRY_BOOT_CS,8,0
611 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ 606 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
612 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ 607 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
613
614/*
615 * The Global Descriptor Table contains 28 quadwords, per-CPU.
616 */
617 .align L1_CACHE_BYTES
618ENTRY(cpu_gdt_table)
619 .quad 0x0000000000000000 /* NULL descriptor */
620 .quad 0x0000000000000000 /* 0x0b reserved */
621 .quad 0x0000000000000000 /* 0x13 reserved */
622 .quad 0x0000000000000000 /* 0x1b reserved */
623 .quad 0x0000000000000000 /* 0x20 unused */
624 .quad 0x0000000000000000 /* 0x28 unused */
625 .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
626 .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
627 .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
628 .quad 0x0000000000000000 /* 0x4b reserved */
629 .quad 0x0000000000000000 /* 0x53 reserved */
630 .quad 0x0000000000000000 /* 0x5b reserved */
631
632 .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
633 .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
634 .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
635 .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
636
637 .quad 0x0000000000000000 /* 0x80 TSS descriptor */
638 .quad 0x0000000000000000 /* 0x88 LDT descriptor */
639
640 /*
641 * Segments used for calling PnP BIOS have byte granularity.
642 * They code segments and data segments have fixed 64k limits,
643 * the transfer segment sizes are set at run time.
644 */
645 .quad 0x00409a000000ffff /* 0x90 32-bit code */
646 .quad 0x00009a000000ffff /* 0x98 16-bit code */
647 .quad 0x000092000000ffff /* 0xa0 16-bit data */
648 .quad 0x0000920000000000 /* 0xa8 16-bit data */
649 .quad 0x0000920000000000 /* 0xb0 16-bit data */
650
651 /*
652 * The APM segments have byte granularity and their bases
653 * are set at run time. All have 64k limits.
654 */
655 .quad 0x00409a000000ffff /* 0xb8 APM CS code */
656 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
657 .quad 0x004092000000ffff /* 0xc8 APM DS data */
658
659 .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
660 .quad 0x00cf92000000ffff /* 0xd8 - PDA */
661 .quad 0x0000000000000000 /* 0xe0 - unused */
662 .quad 0x0000000000000000 /* 0xe8 - unused */
663 .quad 0x0000000000000000 /* 0xf0 - unused */
664 .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
665
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 4afe26e8626..e3d4b73bfdb 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed);
28#endif 28#endif
29 29
30EXPORT_SYMBOL(csum_partial); 30EXPORT_SYMBOL(csum_partial);
31
32EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 89d85d24492..1b623cda3a6 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -35,6 +35,7 @@
35#include <linux/msi.h> 35#include <linux/msi.h>
36#include <linux/htirq.h> 36#include <linux/htirq.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/kthread.h>
38 39
39#include <asm/io.h> 40#include <asm/io.h>
40#include <asm/smp.h> 41#include <asm/smp.h>
@@ -661,8 +662,6 @@ static int balanced_irq(void *unused)
661 unsigned long prev_balance_time = jiffies; 662 unsigned long prev_balance_time = jiffies;
662 long time_remaining = balanced_irq_interval; 663 long time_remaining = balanced_irq_interval;
663 664
664 daemonize("kirqd");
665
666 /* push everything to CPU 0 to give us a starting point. */ 665 /* push everything to CPU 0 to give us a starting point. */
667 for (i = 0 ; i < NR_IRQS ; i++) { 666 for (i = 0 ; i < NR_IRQS ; i++) {
668 irq_desc[i].pending_mask = cpumask_of_cpu(0); 667 irq_desc[i].pending_mask = cpumask_of_cpu(0);
@@ -722,10 +721,9 @@ static int __init balanced_irq_init(void)
722 } 721 }
723 722
724 printk(KERN_INFO "Starting balanced_irq\n"); 723 printk(KERN_INFO "Starting balanced_irq\n");
725 if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 724 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
726 return 0; 725 return 0;
727 else 726 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
728 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
729failed: 727failed:
730 for_each_possible_cpu(i) { 728 for_each_possible_cpu(i) {
731 kfree(irq_cpu_data[i].irq_delta); 729 kfree(irq_cpu_data[i].irq_delta);
@@ -1403,10 +1401,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
1403 enable_8259A_irq(0); 1401 enable_8259A_irq(0);
1404} 1402}
1405 1403
1406static inline void UNEXPECTED_IO_APIC(void)
1407{
1408}
1409
1410void __init print_IO_APIC(void) 1404void __init print_IO_APIC(void)
1411{ 1405{
1412 int apic, i; 1406 int apic, i;
@@ -1446,34 +1440,12 @@ void __init print_IO_APIC(void)
1446 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1440 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1447 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1441 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1448 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); 1442 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1449 if (reg_00.bits.ID >= get_physical_broadcast())
1450 UNEXPECTED_IO_APIC();
1451 if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1452 UNEXPECTED_IO_APIC();
1453 1443
1454 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); 1444 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1455 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1445 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1456 if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1457 (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1458 (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1459 (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1460 (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1461 (reg_01.bits.entries != 0x2E) &&
1462 (reg_01.bits.entries != 0x3F)
1463 )
1464 UNEXPECTED_IO_APIC();
1465 1446
1466 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1447 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1467 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1448 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1468 if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1469 (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1470 (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1471 (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1472 (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1473 )
1474 UNEXPECTED_IO_APIC();
1475 if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1476 UNEXPECTED_IO_APIC();
1477 1449
1478 /* 1450 /*
1479 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, 1451 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1483,8 +1455,6 @@ void __init print_IO_APIC(void)
1483 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { 1455 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1484 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); 1456 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1485 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); 1457 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1486 if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1487 UNEXPECTED_IO_APIC();
1488 } 1458 }
1489 1459
1490 /* 1460 /*
@@ -1496,8 +1466,6 @@ void __init print_IO_APIC(void)
1496 reg_03.raw != reg_01.raw) { 1466 reg_03.raw != reg_01.raw) {
1497 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); 1467 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1498 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); 1468 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1499 if (reg_03.bits.__reserved_1)
1500 UNEXPECTED_IO_APIC();
1501 } 1469 }
1502 1470
1503 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1471 printk(KERN_DEBUG ".... IRQ redirection table:\n");
diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
index 498e8bc197d..d1e42e0dbe6 100644
--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -16,6 +16,7 @@
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/thread_info.h> 18#include <linux/thread_info.h>
19#include <linux/syscalls.h>
19 20
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 21/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) 22static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
@@ -113,7 +114,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
113 * Reset the owner so that a process switch will not set 114 * Reset the owner so that a process switch will not set
114 * tss->io_bitmap_base to IO_BITMAP_OFFSET. 115 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
115 */ 116 */
116 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 117 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
117 tss->io_bitmap_owner = NULL; 118 tss->io_bitmap_owner = NULL;
118 119
119 put_cpu(); 120 put_cpu();
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 8db8d514c9c..d2daf672f4a 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -24,6 +24,9 @@
24DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; 24DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
25EXPORT_PER_CPU_SYMBOL(irq_stat); 25EXPORT_PER_CPU_SYMBOL(irq_stat);
26 26
27DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs);
29
27/* 30/*
28 * 'what should we do if we get a hw irq event on an illegal vector'. 31 * 'what should we do if we get a hw irq event on an illegal vector'.
29 * each architecture has to answer this themselves. 32 * each architecture has to answer this themselves.
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 4f5983c9866..0952eccd8f2 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
477 } 477 }
478 ++mpc_record; 478 ++mpc_record;
479 } 479 }
480 clustered_apic_check(); 480 setup_apic_routing();
481 if (!num_processors) 481 if (!num_processors)
482 printk(KERN_ERR "SMP mptable: no processors registered!\n"); 482 printk(KERN_ERR "SMP mptable: no processors registered!\n");
483 return num_processors; 483 return num_processors;
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 84c3497efb6..33cf2f3c444 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -20,7 +20,6 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/sysctl.h> 21#include <linux/sysctl.h>
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/dmi.h>
24#include <linux/kprobes.h> 23#include <linux/kprobes.h>
25#include <linux/cpumask.h> 24#include <linux/cpumask.h>
26#include <linux/kernel_stat.h> 25#include <linux/kernel_stat.h>
@@ -28,30 +27,14 @@
28#include <asm/smp.h> 27#include <asm/smp.h>
29#include <asm/nmi.h> 28#include <asm/nmi.h>
30#include <asm/kdebug.h> 29#include <asm/kdebug.h>
31#include <asm/intel_arch_perfmon.h>
32 30
33#include "mach_traps.h" 31#include "mach_traps.h"
34 32
35int unknown_nmi_panic; 33int unknown_nmi_panic;
36int nmi_watchdog_enabled; 34int nmi_watchdog_enabled;
37 35
38/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
39 * evtsel_nmi_owner tracks the ownership of the event selection
40 * - different performance counters/ event selection may be reserved for
41 * different subsystems this reservation system just tries to coordinate
42 * things a little
43 */
44
45/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
46 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
47 */
48#define NMI_MAX_COUNTER_BITS 66
49#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)
50
51static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);
52static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]);
53
54static cpumask_t backtrace_mask = CPU_MASK_NONE; 36static cpumask_t backtrace_mask = CPU_MASK_NONE;
37
55/* nmi_active: 38/* nmi_active:
56 * >0: the lapic NMI watchdog is active, but can be disabled 39 * >0: the lapic NMI watchdog is active, but can be disabled
57 * <0: the lapic NMI watchdog has not been set up, and cannot 40 * <0: the lapic NMI watchdog has not been set up, and cannot
@@ -63,206 +46,11 @@ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
63unsigned int nmi_watchdog = NMI_DEFAULT; 46unsigned int nmi_watchdog = NMI_DEFAULT;
64static unsigned int nmi_hz = HZ; 47static unsigned int nmi_hz = HZ;
65 48
66struct nmi_watchdog_ctlblk { 49static DEFINE_PER_CPU(short, wd_enabled);
67 int enabled;
68 u64 check_bit;
69 unsigned int cccr_msr;
70 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
71 unsigned int evntsel_msr; /* the MSR to select the events to handle */
72};
73static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
74 50
75/* local prototypes */ 51/* local prototypes */
76static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); 52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
77 53
78extern void show_registers(struct pt_regs *regs);
79extern int unknown_nmi_panic;
80
81/* converts an msr to an appropriate reservation bit */
82static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
83{
84 /* returns the bit offset of the performance counter register */
85 switch (boot_cpu_data.x86_vendor) {
86 case X86_VENDOR_AMD:
87 return (msr - MSR_K7_PERFCTR0);
88 case X86_VENDOR_INTEL:
89 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
90 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
91
92 switch (boot_cpu_data.x86) {
93 case 6:
94 return (msr - MSR_P6_PERFCTR0);
95 case 15:
96 return (msr - MSR_P4_BPU_PERFCTR0);
97 }
98 }
99 return 0;
100}
101
102/* converts an msr to an appropriate reservation bit */
103static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
104{
105 /* returns the bit offset of the event selection register */
106 switch (boot_cpu_data.x86_vendor) {
107 case X86_VENDOR_AMD:
108 return (msr - MSR_K7_EVNTSEL0);
109 case X86_VENDOR_INTEL:
110 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
111 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
112
113 switch (boot_cpu_data.x86) {
114 case 6:
115 return (msr - MSR_P6_EVNTSEL0);
116 case 15:
117 return (msr - MSR_P4_BSU_ESCR0);
118 }
119 }
120 return 0;
121}
122
123/* checks for a bit availability (hack for oprofile) */
124int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
125{
126 int cpu;
127 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
128 for_each_possible_cpu (cpu) {
129 if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
130 return 0;
131 }
132 return 1;
133}
134
135/* checks the an msr for availability */
136int avail_to_resrv_perfctr_nmi(unsigned int msr)
137{
138 unsigned int counter;
139 int cpu;
140
141 counter = nmi_perfctr_msr_to_bit(msr);
142 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
143
144 for_each_possible_cpu (cpu) {
145 if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
146 return 0;
147 }
148 return 1;
149}
150
151static int __reserve_perfctr_nmi(int cpu, unsigned int msr)
152{
153 unsigned int counter;
154 if (cpu < 0)
155 cpu = smp_processor_id();
156
157 counter = nmi_perfctr_msr_to_bit(msr);
158 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
159
160 if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
161 return 1;
162 return 0;
163}
164
165static void __release_perfctr_nmi(int cpu, unsigned int msr)
166{
167 unsigned int counter;
168 if (cpu < 0)
169 cpu = smp_processor_id();
170
171 counter = nmi_perfctr_msr_to_bit(msr);
172 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
173
174 clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]);
175}
176
177int reserve_perfctr_nmi(unsigned int msr)
178{
179 int cpu, i;
180 for_each_possible_cpu (cpu) {
181 if (!__reserve_perfctr_nmi(cpu, msr)) {
182 for_each_possible_cpu (i) {
183 if (i >= cpu)
184 break;
185 __release_perfctr_nmi(i, msr);
186 }
187 return 0;
188 }
189 }
190 return 1;
191}
192
193void release_perfctr_nmi(unsigned int msr)
194{
195 int cpu;
196 for_each_possible_cpu (cpu) {
197 __release_perfctr_nmi(cpu, msr);
198 }
199}
200
201int __reserve_evntsel_nmi(int cpu, unsigned int msr)
202{
203 unsigned int counter;
204 if (cpu < 0)
205 cpu = smp_processor_id();
206
207 counter = nmi_evntsel_msr_to_bit(msr);
208 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
209
210 if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]))
211 return 1;
212 return 0;
213}
214
215static void __release_evntsel_nmi(int cpu, unsigned int msr)
216{
217 unsigned int counter;
218 if (cpu < 0)
219 cpu = smp_processor_id();
220
221 counter = nmi_evntsel_msr_to_bit(msr);
222 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
223
224 clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]);
225}
226
227int reserve_evntsel_nmi(unsigned int msr)
228{
229 int cpu, i;
230 for_each_possible_cpu (cpu) {
231 if (!__reserve_evntsel_nmi(cpu, msr)) {
232 for_each_possible_cpu (i) {
233 if (i >= cpu)
234 break;
235 __release_evntsel_nmi(i, msr);
236 }
237 return 0;
238 }
239 }
240 return 1;
241}
242
243void release_evntsel_nmi(unsigned int msr)
244{
245 int cpu;
246 for_each_possible_cpu (cpu) {
247 __release_evntsel_nmi(cpu, msr);
248 }
249}
250
251static __cpuinit inline int nmi_known_cpu(void)
252{
253 switch (boot_cpu_data.x86_vendor) {
254 case X86_VENDOR_AMD:
255 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
256 || (boot_cpu_data.x86 == 16));
257 case X86_VENDOR_INTEL:
258 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
259 return 1;
260 else
261 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
262 }
263 return 0;
264}
265
266static int endflag __initdata = 0; 54static int endflag __initdata = 0;
267 55
268#ifdef CONFIG_SMP 56#ifdef CONFIG_SMP
@@ -284,28 +72,6 @@ static __init void nmi_cpu_busy(void *data)
284} 72}
285#endif 73#endif
286 74
287static unsigned int adjust_for_32bit_ctr(unsigned int hz)
288{
289 u64 counter_val;
290 unsigned int retval = hz;
291
292 /*
293 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
294 * are writable, with higher bits sign extending from bit 31.
295 * So, we can only program the counter with 31 bit values and
296 * 32nd bit should be 1, for 33.. to be 1.
297 * Find the appropriate nmi_hz
298 */
299 counter_val = (u64)cpu_khz * 1000;
300 do_div(counter_val, retval);
301 if (counter_val > 0x7fffffffULL) {
302 u64 count = (u64)cpu_khz * 1000;
303 do_div(count, 0x7fffffffUL);
304 retval = count + 1;
305 }
306 return retval;
307}
308
309static int __init check_nmi_watchdog(void) 75static int __init check_nmi_watchdog(void)
310{ 76{
311 unsigned int *prev_nmi_count; 77 unsigned int *prev_nmi_count;
@@ -338,14 +104,14 @@ static int __init check_nmi_watchdog(void)
338 if (!cpu_isset(cpu, cpu_callin_map)) 104 if (!cpu_isset(cpu, cpu_callin_map))
339 continue; 105 continue;
340#endif 106#endif
341 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) 107 if (!per_cpu(wd_enabled, cpu))
342 continue; 108 continue;
343 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { 109 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
344 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 110 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
345 cpu, 111 cpu,
346 prev_nmi_count[cpu], 112 prev_nmi_count[cpu],
347 nmi_count(cpu)); 113 nmi_count(cpu));
348 per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; 114 per_cpu(wd_enabled, cpu) = 0;
349 atomic_dec(&nmi_active); 115 atomic_dec(&nmi_active);
350 } 116 }
351 } 117 }
@@ -359,16 +125,8 @@ static int __init check_nmi_watchdog(void)
359 125
360 /* now that we know it works we can reduce NMI frequency to 126 /* now that we know it works we can reduce NMI frequency to
361 something more reasonable; makes a difference in some configs */ 127 something more reasonable; makes a difference in some configs */
362 if (nmi_watchdog == NMI_LOCAL_APIC) { 128 if (nmi_watchdog == NMI_LOCAL_APIC)
363 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 129 nmi_hz = lapic_adjust_nmi_hz(1);
364
365 nmi_hz = 1;
366
367 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
368 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
369 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
370 }
371 }
372 130
373 kfree(prev_nmi_count); 131 kfree(prev_nmi_count);
374 return 0; 132 return 0;
@@ -391,85 +149,8 @@ static int __init setup_nmi_watchdog(char *str)
391 149
392__setup("nmi_watchdog=", setup_nmi_watchdog); 150__setup("nmi_watchdog=", setup_nmi_watchdog);
393 151
394static void disable_lapic_nmi_watchdog(void)
395{
396 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
397
398 if (atomic_read(&nmi_active) <= 0)
399 return;
400
401 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
402
403 BUG_ON(atomic_read(&nmi_active) != 0);
404}
405
406static void enable_lapic_nmi_watchdog(void)
407{
408 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
409
410 /* are we already enabled */
411 if (atomic_read(&nmi_active) != 0)
412 return;
413
414 /* are we lapic aware */
415 if (nmi_known_cpu() <= 0)
416 return;
417 152
418 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); 153/* Suspend/resume support */
419 touch_nmi_watchdog();
420}
421
422void disable_timer_nmi_watchdog(void)
423{
424 BUG_ON(nmi_watchdog != NMI_IO_APIC);
425
426 if (atomic_read(&nmi_active) <= 0)
427 return;
428
429 disable_irq(0);
430 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
431
432 BUG_ON(atomic_read(&nmi_active) != 0);
433}
434
435void enable_timer_nmi_watchdog(void)
436{
437 BUG_ON(nmi_watchdog != NMI_IO_APIC);
438
439 if (atomic_read(&nmi_active) == 0) {
440 touch_nmi_watchdog();
441 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
442 enable_irq(0);
443 }
444}
445
446static void __acpi_nmi_disable(void *__unused)
447{
448 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
449}
450
451/*
452 * Disable timer based NMIs on all CPUs:
453 */
454void acpi_nmi_disable(void)
455{
456 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
457 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
458}
459
460static void __acpi_nmi_enable(void *__unused)
461{
462 apic_write_around(APIC_LVT0, APIC_DM_NMI);
463}
464
465/*
466 * Enable timer based NMIs on all CPUs:
467 */
468void acpi_nmi_enable(void)
469{
470 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
471 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
472}
473 154
474#ifdef CONFIG_PM 155#ifdef CONFIG_PM
475 156
@@ -516,7 +197,7 @@ static int __init init_lapic_nmi_sysfs(void)
516 if (nmi_watchdog != NMI_LOCAL_APIC) 197 if (nmi_watchdog != NMI_LOCAL_APIC)
517 return 0; 198 return 0;
518 199
519 if ( atomic_read(&nmi_active) < 0 ) 200 if (atomic_read(&nmi_active) < 0)
520 return 0; 201 return 0;
521 202
522 error = sysdev_class_register(&nmi_sysclass); 203 error = sysdev_class_register(&nmi_sysclass);
@@ -529,433 +210,69 @@ late_initcall(init_lapic_nmi_sysfs);
529 210
530#endif /* CONFIG_PM */ 211#endif /* CONFIG_PM */
531 212
532/* 213static void __acpi_nmi_enable(void *__unused)
533 * Activate the NMI watchdog via the local APIC.
534 * Original code written by Keith Owens.
535 */
536
537static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
538{
539 u64 count = (u64)cpu_khz * 1000;
540
541 do_div(count, nmi_hz);
542 if(descr)
543 Dprintk("setting %s to -0x%08Lx\n", descr, count);
544 wrmsrl(perfctr_msr, 0 - count);
545}
546
547static void write_watchdog_counter32(unsigned int perfctr_msr,
548 const char *descr)
549{
550 u64 count = (u64)cpu_khz * 1000;
551
552 do_div(count, nmi_hz);
553 if(descr)
554 Dprintk("setting %s to -0x%08Lx\n", descr, count);
555 wrmsr(perfctr_msr, (u32)(-count), 0);
556}
557
558/* Note that these events don't tick when the CPU idles. This means
559 the frequency varies with CPU load. */
560
561#define K7_EVNTSEL_ENABLE (1 << 22)
562#define K7_EVNTSEL_INT (1 << 20)
563#define K7_EVNTSEL_OS (1 << 17)
564#define K7_EVNTSEL_USR (1 << 16)
565#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
566#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
567
568static int setup_k7_watchdog(void)
569{
570 unsigned int perfctr_msr, evntsel_msr;
571 unsigned int evntsel;
572 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
573
574 perfctr_msr = MSR_K7_PERFCTR0;
575 evntsel_msr = MSR_K7_EVNTSEL0;
576 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
577 goto fail;
578
579 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
580 goto fail1;
581
582 wrmsrl(perfctr_msr, 0UL);
583
584 evntsel = K7_EVNTSEL_INT
585 | K7_EVNTSEL_OS
586 | K7_EVNTSEL_USR
587 | K7_NMI_EVENT;
588
589 /* setup the timer */
590 wrmsr(evntsel_msr, evntsel, 0);
591 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
592 apic_write(APIC_LVTPC, APIC_DM_NMI);
593 evntsel |= K7_EVNTSEL_ENABLE;
594 wrmsr(evntsel_msr, evntsel, 0);
595
596 wd->perfctr_msr = perfctr_msr;
597 wd->evntsel_msr = evntsel_msr;
598 wd->cccr_msr = 0; //unused
599 wd->check_bit = 1ULL<<63;
600 return 1;
601fail1:
602 __release_perfctr_nmi(-1, perfctr_msr);
603fail:
604 return 0;
605}
606
607static void stop_k7_watchdog(void)
608{
609 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
610
611 wrmsr(wd->evntsel_msr, 0, 0);
612
613 __release_evntsel_nmi(-1, wd->evntsel_msr);
614 __release_perfctr_nmi(-1, wd->perfctr_msr);
615}
616
617#define P6_EVNTSEL0_ENABLE (1 << 22)
618#define P6_EVNTSEL_INT (1 << 20)
619#define P6_EVNTSEL_OS (1 << 17)
620#define P6_EVNTSEL_USR (1 << 16)
621#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
622#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
623
624static int setup_p6_watchdog(void)
625{
626 unsigned int perfctr_msr, evntsel_msr;
627 unsigned int evntsel;
628 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
629
630 perfctr_msr = MSR_P6_PERFCTR0;
631 evntsel_msr = MSR_P6_EVNTSEL0;
632 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
633 goto fail;
634
635 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
636 goto fail1;
637
638 wrmsrl(perfctr_msr, 0UL);
639
640 evntsel = P6_EVNTSEL_INT
641 | P6_EVNTSEL_OS
642 | P6_EVNTSEL_USR
643 | P6_NMI_EVENT;
644
645 /* setup the timer */
646 wrmsr(evntsel_msr, evntsel, 0);
647 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
648 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
649 apic_write(APIC_LVTPC, APIC_DM_NMI);
650 evntsel |= P6_EVNTSEL0_ENABLE;
651 wrmsr(evntsel_msr, evntsel, 0);
652
653 wd->perfctr_msr = perfctr_msr;
654 wd->evntsel_msr = evntsel_msr;
655 wd->cccr_msr = 0; //unused
656 wd->check_bit = 1ULL<<39;
657 return 1;
658fail1:
659 __release_perfctr_nmi(-1, perfctr_msr);
660fail:
661 return 0;
662}
663
664static void stop_p6_watchdog(void)
665{
666 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
667
668 wrmsr(wd->evntsel_msr, 0, 0);
669
670 __release_evntsel_nmi(-1, wd->evntsel_msr);
671 __release_perfctr_nmi(-1, wd->perfctr_msr);
672}
673
674/* Note that these events don't tick when the CPU idles. This means
675 the frequency varies with CPU load. */
676
677#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
678#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
679#define P4_ESCR_OS (1<<3)
680#define P4_ESCR_USR (1<<2)
681#define P4_CCCR_OVF_PMI0 (1<<26)
682#define P4_CCCR_OVF_PMI1 (1<<27)
683#define P4_CCCR_THRESHOLD(N) ((N)<<20)
684#define P4_CCCR_COMPLEMENT (1<<19)
685#define P4_CCCR_COMPARE (1<<18)
686#define P4_CCCR_REQUIRED (3<<16)
687#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
688#define P4_CCCR_ENABLE (1<<12)
689#define P4_CCCR_OVF (1<<31)
690/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
691 CRU_ESCR0 (with any non-null event selector) through a complemented
692 max threshold. [IA32-Vol3, Section 14.9.9] */
693
694static int setup_p4_watchdog(void)
695{ 214{
696 unsigned int perfctr_msr, evntsel_msr, cccr_msr; 215 apic_write_around(APIC_LVT0, APIC_DM_NMI);
697 unsigned int evntsel, cccr_val;
698 unsigned int misc_enable, dummy;
699 unsigned int ht_num;
700 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
701
702 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
703 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
704 return 0;
705
706#ifdef CONFIG_SMP
707 /* detect which hyperthread we are on */
708 if (smp_num_siblings == 2) {
709 unsigned int ebx, apicid;
710
711 ebx = cpuid_ebx(1);
712 apicid = (ebx >> 24) & 0xff;
713 ht_num = apicid & 1;
714 } else
715#endif
716 ht_num = 0;
717
718 /* performance counters are shared resources
719 * assign each hyperthread its own set
720 * (re-use the ESCR0 register, seems safe
721 * and keeps the cccr_val the same)
722 */
723 if (!ht_num) {
724 /* logical cpu 0 */
725 perfctr_msr = MSR_P4_IQ_PERFCTR0;
726 evntsel_msr = MSR_P4_CRU_ESCR0;
727 cccr_msr = MSR_P4_IQ_CCCR0;
728 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
729 } else {
730 /* logical cpu 1 */
731 perfctr_msr = MSR_P4_IQ_PERFCTR1;
732 evntsel_msr = MSR_P4_CRU_ESCR0;
733 cccr_msr = MSR_P4_IQ_CCCR1;
734 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
735 }
736
737 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
738 goto fail;
739
740 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
741 goto fail1;
742
743 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
744 | P4_ESCR_OS
745 | P4_ESCR_USR;
746
747 cccr_val |= P4_CCCR_THRESHOLD(15)
748 | P4_CCCR_COMPLEMENT
749 | P4_CCCR_COMPARE
750 | P4_CCCR_REQUIRED;
751
752 wrmsr(evntsel_msr, evntsel, 0);
753 wrmsr(cccr_msr, cccr_val, 0);
754 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
755 apic_write(APIC_LVTPC, APIC_DM_NMI);
756 cccr_val |= P4_CCCR_ENABLE;
757 wrmsr(cccr_msr, cccr_val, 0);
758 wd->perfctr_msr = perfctr_msr;
759 wd->evntsel_msr = evntsel_msr;
760 wd->cccr_msr = cccr_msr;
761 wd->check_bit = 1ULL<<39;
762 return 1;
763fail1:
764 __release_perfctr_nmi(-1, perfctr_msr);
765fail:
766 return 0;
767} 216}
768 217
769static void stop_p4_watchdog(void) 218/*
219 * Enable timer based NMIs on all CPUs:
220 */
221void acpi_nmi_enable(void)
770{ 222{
771 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 223 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
772 224 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
773 wrmsr(wd->cccr_msr, 0, 0);
774 wrmsr(wd->evntsel_msr, 0, 0);
775
776 __release_evntsel_nmi(-1, wd->evntsel_msr);
777 __release_perfctr_nmi(-1, wd->perfctr_msr);
778} 225}
779 226
780#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 227static void __acpi_nmi_disable(void *__unused)
781#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
782
783static int setup_intel_arch_watchdog(void)
784{ 228{
785 unsigned int ebx; 229 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
786 union cpuid10_eax eax;
787 unsigned int unused;
788 unsigned int perfctr_msr, evntsel_msr;
789 unsigned int evntsel;
790 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
791
792 /*
793 * Check whether the Architectural PerfMon supports
794 * Unhalted Core Cycles Event or not.
795 * NOTE: Corresponding bit = 0 in ebx indicates event present.
796 */
797 cpuid(10, &(eax.full), &ebx, &unused, &unused);
798 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
799 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
800 goto fail;
801
802 perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
803 evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
804
805 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
806 goto fail;
807
808 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
809 goto fail1;
810
811 wrmsrl(perfctr_msr, 0UL);
812
813 evntsel = ARCH_PERFMON_EVENTSEL_INT
814 | ARCH_PERFMON_EVENTSEL_OS
815 | ARCH_PERFMON_EVENTSEL_USR
816 | ARCH_PERFMON_NMI_EVENT_SEL
817 | ARCH_PERFMON_NMI_EVENT_UMASK;
818
819 /* setup the timer */
820 wrmsr(evntsel_msr, evntsel, 0);
821 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
822 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
823 apic_write(APIC_LVTPC, APIC_DM_NMI);
824 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
825 wrmsr(evntsel_msr, evntsel, 0);
826
827 wd->perfctr_msr = perfctr_msr;
828 wd->evntsel_msr = evntsel_msr;
829 wd->cccr_msr = 0; //unused
830 wd->check_bit = 1ULL << (eax.split.bit_width - 1);
831 return 1;
832fail1:
833 __release_perfctr_nmi(-1, perfctr_msr);
834fail:
835 return 0;
836} 230}
837 231
838static void stop_intel_arch_watchdog(void) 232/*
233 * Disable timer based NMIs on all CPUs:
234 */
235void acpi_nmi_disable(void)
839{ 236{
840 unsigned int ebx; 237 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
841 union cpuid10_eax eax; 238 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
842 unsigned int unused;
843 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
844
845 /*
846 * Check whether the Architectural PerfMon supports
847 * Unhalted Core Cycles Event or not.
848 * NOTE: Corresponding bit = 0 in ebx indicates event present.
849 */
850 cpuid(10, &(eax.full), &ebx, &unused, &unused);
851 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
852 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
853 return;
854
855 wrmsr(wd->evntsel_msr, 0, 0);
856 __release_evntsel_nmi(-1, wd->evntsel_msr);
857 __release_perfctr_nmi(-1, wd->perfctr_msr);
858} 239}
859 240
860void setup_apic_nmi_watchdog (void *unused) 241void setup_apic_nmi_watchdog (void *unused)
861{ 242{
862 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 243 if (__get_cpu_var(wd_enabled))
863 244 return;
864 /* only support LOCAL and IO APICs for now */
865 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
866 (nmi_watchdog != NMI_IO_APIC))
867 return;
868
869 if (wd->enabled == 1)
870 return;
871 245
872 /* cheap hack to support suspend/resume */ 246 /* cheap hack to support suspend/resume */
873 /* if cpu0 is not active neither should the other cpus */ 247 /* if cpu0 is not active neither should the other cpus */
874 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) 248 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
875 return; 249 return;
876 250
877 if (nmi_watchdog == NMI_LOCAL_APIC) { 251 switch (nmi_watchdog) {
878 switch (boot_cpu_data.x86_vendor) { 252 case NMI_LOCAL_APIC:
879 case X86_VENDOR_AMD: 253 __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
880 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 254 if (lapic_watchdog_init(nmi_hz) < 0) {
881 boot_cpu_data.x86 != 16) 255 __get_cpu_var(wd_enabled) = 0;
882 return;
883 if (!setup_k7_watchdog())
884 return;
885 break;
886 case X86_VENDOR_INTEL:
887 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
888 if (!setup_intel_arch_watchdog())
889 return;
890 break;
891 }
892 switch (boot_cpu_data.x86) {
893 case 6:
894 if (boot_cpu_data.x86_model > 0xd)
895 return;
896
897 if (!setup_p6_watchdog())
898 return;
899 break;
900 case 15:
901 if (boot_cpu_data.x86_model > 0x4)
902 return;
903
904 if (!setup_p4_watchdog())
905 return;
906 break;
907 default:
908 return;
909 }
910 break;
911 default:
912 return; 256 return;
913 } 257 }
258 /* FALL THROUGH */
259 case NMI_IO_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 atomic_inc(&nmi_active);
914 } 262 }
915 wd->enabled = 1;
916 atomic_inc(&nmi_active);
917} 263}
918 264
919void stop_apic_nmi_watchdog(void *unused) 265void stop_apic_nmi_watchdog(void *unused)
920{ 266{
921 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
922
923 /* only support LOCAL and IO APICs for now */ 267 /* only support LOCAL and IO APICs for now */
924 if ((nmi_watchdog != NMI_LOCAL_APIC) && 268 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
925 (nmi_watchdog != NMI_IO_APIC)) 269 (nmi_watchdog != NMI_IO_APIC))
926 return; 270 return;
927 271 if (__get_cpu_var(wd_enabled) == 0)
928 if (wd->enabled == 0)
929 return; 272 return;
930 273 if (nmi_watchdog == NMI_LOCAL_APIC)
931 if (nmi_watchdog == NMI_LOCAL_APIC) { 274 lapic_watchdog_stop();
932 switch (boot_cpu_data.x86_vendor) { 275 __get_cpu_var(wd_enabled) = 0;
933 case X86_VENDOR_AMD:
934 stop_k7_watchdog();
935 break;
936 case X86_VENDOR_INTEL:
937 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
938 stop_intel_arch_watchdog();
939 break;
940 }
941 switch (boot_cpu_data.x86) {
942 case 6:
943 if (boot_cpu_data.x86_model > 0xd)
944 break;
945 stop_p6_watchdog();
946 break;
947 case 15:
948 if (boot_cpu_data.x86_model > 0x4)
949 break;
950 stop_p4_watchdog();
951 break;
952 }
953 break;
954 default:
955 return;
956 }
957 }
958 wd->enabled = 0;
959 atomic_dec(&nmi_active); 276 atomic_dec(&nmi_active);
960} 277}
961 278
@@ -1011,8 +328,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
1011 unsigned int sum; 328 unsigned int sum;
1012 int touched = 0; 329 int touched = 0;
1013 int cpu = smp_processor_id(); 330 int cpu = smp_processor_id();
1014 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
1015 u64 dummy;
1016 int rc=0; 331 int rc=0;
1017 332
1018 /* check for other users first */ 333 /* check for other users first */
@@ -1055,53 +370,20 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
1055 alert_counter[cpu] = 0; 370 alert_counter[cpu] = 0;
1056 } 371 }
1057 /* see if the nmi watchdog went off */ 372 /* see if the nmi watchdog went off */
1058 if (wd->enabled) { 373 if (!__get_cpu_var(wd_enabled))
1059 if (nmi_watchdog == NMI_LOCAL_APIC) { 374 return rc;
1060 rdmsrl(wd->perfctr_msr, dummy); 375 switch (nmi_watchdog) {
1061 if (dummy & wd->check_bit){ 376 case NMI_LOCAL_APIC:
1062 /* this wasn't a watchdog timer interrupt */ 377 rc |= lapic_wd_event(nmi_hz);
1063 goto done; 378 break;
1064 } 379 case NMI_IO_APIC:
1065 380 /* don't know how to accurately check for this.
1066 /* only Intel P4 uses the cccr msr */ 381 * just assume it was a watchdog timer interrupt
1067 if (wd->cccr_msr != 0) { 382 * This matches the old behaviour.
1068 /* 383 */
1069 * P4 quirks: 384 rc = 1;
1070 * - An overflown perfctr will assert its interrupt 385 break;
1071 * until the OVF flag in its CCCR is cleared.
1072 * - LVTPC is masked on interrupt and must be
1073 * unmasked by the LVTPC handler.
1074 */
1075 rdmsrl(wd->cccr_msr, dummy);
1076 dummy &= ~P4_CCCR_OVF;
1077 wrmsrl(wd->cccr_msr, dummy);
1078 apic_write(APIC_LVTPC, APIC_DM_NMI);
1079 /* start the cycle over again */
1080 write_watchdog_counter(wd->perfctr_msr, NULL);
1081 }
1082 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
1083 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
1084 /* P6 based Pentium M need to re-unmask
1085 * the apic vector but it doesn't hurt
1086 * other P6 variant.
1087 * ArchPerfom/Core Duo also needs this */
1088 apic_write(APIC_LVTPC, APIC_DM_NMI);
1089 /* P6/ARCH_PERFMON has 32 bit counter write */
1090 write_watchdog_counter32(wd->perfctr_msr, NULL);
1091 } else {
1092 /* start the cycle over again */
1093 write_watchdog_counter(wd->perfctr_msr, NULL);
1094 }
1095 rc = 1;
1096 } else if (nmi_watchdog == NMI_IO_APIC) {
1097 /* don't know how to accurately check for this.
1098 * just assume it was a watchdog timer interrupt
1099 * This matches the old behaviour.
1100 */
1101 rc = 1;
1102 }
1103 } 386 }
1104done:
1105 return rc; 387 return rc;
1106} 388}
1107 389
@@ -1146,7 +428,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1146 } 428 }
1147 429
1148 if (nmi_watchdog == NMI_DEFAULT) { 430 if (nmi_watchdog == NMI_DEFAULT) {
1149 if (nmi_known_cpu() > 0) 431 if (lapic_watchdog_ok())
1150 nmi_watchdog = NMI_LOCAL_APIC; 432 nmi_watchdog = NMI_LOCAL_APIC;
1151 else 433 else
1152 nmi_watchdog = NMI_IO_APIC; 434 nmi_watchdog = NMI_IO_APIC;
@@ -1182,11 +464,3 @@ void __trigger_all_cpu_backtrace(void)
1182 464
1183EXPORT_SYMBOL(nmi_active); 465EXPORT_SYMBOL(nmi_active);
1184EXPORT_SYMBOL(nmi_watchdog); 466EXPORT_SYMBOL(nmi_watchdog);
1185EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1186EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1187EXPORT_SYMBOL(reserve_perfctr_nmi);
1188EXPORT_SYMBOL(release_perfctr_nmi);
1189EXPORT_SYMBOL(reserve_evntsel_nmi);
1190EXPORT_SYMBOL(release_evntsel_nmi);
1191EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1192EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 2ec331e03fa..5c10f376bce 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
20#include <linux/efi.h> 20#include <linux/efi.h>
21#include <linux/bcd.h> 21#include <linux/bcd.h>
22#include <linux/start_kernel.h> 22#include <linux/start_kernel.h>
23#include <linux/highmem.h>
23 24
24#include <asm/bug.h> 25#include <asm/bug.h>
25#include <asm/paravirt.h> 26#include <asm/paravirt.h>
@@ -35,7 +36,7 @@
35#include <asm/timer.h> 36#include <asm/timer.h>
36 37
37/* nop stub */ 38/* nop stub */
38static void native_nop(void) 39void _paravirt_nop(void)
39{ 40{
40} 41}
41 42
@@ -54,331 +55,148 @@ char *memory_setup(void)
54#define DEF_NATIVE(name, code) \ 55#define DEF_NATIVE(name, code) \
55 extern const char start_##name[], end_##name[]; \ 56 extern const char start_##name[], end_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":") 57 asm("start_" #name ": " code "; end_" #name ":")
57DEF_NATIVE(cli, "cli"); 58
58DEF_NATIVE(sti, "sti"); 59DEF_NATIVE(irq_disable, "cli");
59DEF_NATIVE(popf, "push %eax; popf"); 60DEF_NATIVE(irq_enable, "sti");
60DEF_NATIVE(pushf, "pushf; pop %eax"); 61DEF_NATIVE(restore_fl, "push %eax; popf");
61DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); 62DEF_NATIVE(save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret"); 63DEF_NATIVE(iret, "iret");
63DEF_NATIVE(sti_sysexit, "sti; sysexit"); 64DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
65DEF_NATIVE(read_cr2, "mov %cr2, %eax");
66DEF_NATIVE(write_cr3, "mov %eax, %cr3");
67DEF_NATIVE(read_cr3, "mov %cr3, %eax");
68DEF_NATIVE(clts, "clts");
69DEF_NATIVE(read_tsc, "rdtsc");
64 70
65static const struct native_insns 71DEF_NATIVE(ud2a, "ud2a");
66{
67 const char *start, *end;
68} native_insns[] = {
69 [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
70 [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
71 [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
72 [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
73 [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
74 [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
75 [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
76};
77 72
78static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) 73static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
79{ 74{
80 unsigned int insn_len; 75 const unsigned char *start, *end;
81 76 unsigned ret;
82 /* Don't touch it if we don't have a replacement */ 77
83 if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) 78 switch(type) {
84 return len; 79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
85 80 SITE(irq_disable);
86 insn_len = native_insns[type].end - native_insns[type].start; 81 SITE(irq_enable);
87 82 SITE(restore_fl);
88 /* Similarly if we can't fit replacement. */ 83 SITE(save_fl);
89 if (len < insn_len) 84 SITE(iret);
90 return len; 85 SITE(irq_enable_sysexit);
86 SITE(read_cr2);
87 SITE(read_cr3);
88 SITE(write_cr3);
89 SITE(clts);
90 SITE(read_tsc);
91#undef SITE
92
93 patch_site:
94 ret = paravirt_patch_insns(insns, len, start, end);
95 break;
91 96
92 memcpy(insns, native_insns[type].start, insn_len); 97 case PARAVIRT_PATCH(make_pgd):
93 return insn_len; 98 case PARAVIRT_PATCH(make_pte):
94} 99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
95 109
96static unsigned long native_get_debugreg(int regno)
97{
98 unsigned long val = 0; /* Damn you, gcc! */
99
100 switch (regno) {
101 case 0:
102 asm("movl %%db0, %0" :"=r" (val)); break;
103 case 1:
104 asm("movl %%db1, %0" :"=r" (val)); break;
105 case 2:
106 asm("movl %%db2, %0" :"=r" (val)); break;
107 case 3:
108 asm("movl %%db3, %0" :"=r" (val)); break;
109 case 6:
110 asm("movl %%db6, %0" :"=r" (val)); break;
111 case 7:
112 asm("movl %%db7, %0" :"=r" (val)); break;
113 default: 110 default:
114 BUG(); 111 ret = paravirt_patch_default(type, clobbers, insns, len);
115 }
116 return val;
117}
118
119static void native_set_debugreg(int regno, unsigned long value)
120{
121 switch (regno) {
122 case 0:
123 asm("movl %0,%%db0" : /* no output */ :"r" (value));
124 break;
125 case 1:
126 asm("movl %0,%%db1" : /* no output */ :"r" (value));
127 break;
128 case 2:
129 asm("movl %0,%%db2" : /* no output */ :"r" (value));
130 break; 112 break;
131 case 3:
132 asm("movl %0,%%db3" : /* no output */ :"r" (value));
133 break;
134 case 6:
135 asm("movl %0,%%db6" : /* no output */ :"r" (value));
136 break;
137 case 7:
138 asm("movl %0,%%db7" : /* no output */ :"r" (value));
139 break;
140 default:
141 BUG();
142 } 113 }
143}
144
145void init_IRQ(void)
146{
147 paravirt_ops.init_IRQ();
148}
149
150static void native_clts(void)
151{
152 asm volatile ("clts");
153}
154
155static unsigned long native_read_cr0(void)
156{
157 unsigned long val;
158 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
159 return val;
160}
161
162static void native_write_cr0(unsigned long val)
163{
164 asm volatile("movl %0,%%cr0": :"r" (val));
165}
166
167static unsigned long native_read_cr2(void)
168{
169 unsigned long val;
170 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
171 return val;
172}
173
174static void native_write_cr2(unsigned long val)
175{
176 asm volatile("movl %0,%%cr2": :"r" (val));
177}
178
179static unsigned long native_read_cr3(void)
180{
181 unsigned long val;
182 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
183 return val;
184}
185
186static void native_write_cr3(unsigned long val)
187{
188 asm volatile("movl %0,%%cr3": :"r" (val));
189}
190
191static unsigned long native_read_cr4(void)
192{
193 unsigned long val;
194 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
195 return val;
196}
197
198static unsigned long native_read_cr4_safe(void)
199{
200 unsigned long val;
201 /* This could fault if %cr4 does not exist */
202 asm("1: movl %%cr4, %0 \n"
203 "2: \n"
204 ".section __ex_table,\"a\" \n"
205 ".long 1b,2b \n"
206 ".previous \n"
207 : "=r" (val): "0" (0));
208 return val;
209}
210
211static void native_write_cr4(unsigned long val)
212{
213 asm volatile("movl %0,%%cr4": :"r" (val));
214}
215
216static unsigned long native_save_fl(void)
217{
218 unsigned long f;
219 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
220 return f;
221}
222
223static void native_restore_fl(unsigned long f)
224{
225 asm volatile("pushl %0 ; popfl": /* no output */
226 :"g" (f)
227 :"memory", "cc");
228}
229
230static void native_irq_disable(void)
231{
232 asm volatile("cli": : :"memory");
233}
234
235static void native_irq_enable(void)
236{
237 asm volatile("sti": : :"memory");
238}
239
240static void native_safe_halt(void)
241{
242 asm volatile("sti; hlt": : :"memory");
243}
244 114
245static void native_halt(void) 115 return ret;
246{
247 asm volatile("hlt": : :"memory");
248} 116}
249 117
250static void native_wbinvd(void) 118unsigned paravirt_patch_nop(void)
251{ 119{
252 asm volatile("wbinvd": : :"memory"); 120 return 0;
253} 121}
254 122
255static unsigned long long native_read_msr(unsigned int msr, int *err) 123unsigned paravirt_patch_ignore(unsigned len)
256{ 124{
257 unsigned long long val; 125 return len;
258
259 asm volatile("2: rdmsr ; xorl %0,%0\n"
260 "1:\n\t"
261 ".section .fixup,\"ax\"\n\t"
262 "3: movl %3,%0 ; jmp 1b\n\t"
263 ".previous\n\t"
264 ".section __ex_table,\"a\"\n"
265 " .align 4\n\t"
266 " .long 2b,3b\n\t"
267 ".previous"
268 : "=r" (*err), "=A" (val)
269 : "c" (msr), "i" (-EFAULT));
270
271 return val;
272} 126}
273 127
274static int native_write_msr(unsigned int msr, unsigned long long val) 128unsigned paravirt_patch_call(void *target, u16 tgt_clobbers,
129 void *site, u16 site_clobbers,
130 unsigned len)
275{ 131{
276 int err; 132 unsigned char *call = site;
277 asm volatile("2: wrmsr ; xorl %0,%0\n" 133 unsigned long delta = (unsigned long)target - (unsigned long)(call+5);
278 "1:\n\t"
279 ".section .fixup,\"ax\"\n\t"
280 "3: movl %4,%0 ; jmp 1b\n\t"
281 ".previous\n\t"
282 ".section __ex_table,\"a\"\n"
283 " .align 4\n\t"
284 " .long 2b,3b\n\t"
285 ".previous"
286 : "=a" (err)
287 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
288 "i" (-EFAULT));
289 return err;
290}
291 134
292static unsigned long long native_read_tsc(void) 135 if (tgt_clobbers & ~site_clobbers)
293{ 136 return len; /* target would clobber too much for this site */
294 unsigned long long val; 137 if (len < 5)
295 asm volatile("rdtsc" : "=A" (val)); 138 return len; /* call too long for patch site */
296 return val;
297}
298 139
299static unsigned long long native_read_pmc(void) 140 *call++ = 0xe8; /* call */
300{ 141 *(unsigned long *)call = delta;
301 unsigned long long val;
302 asm volatile("rdpmc" : "=A" (val));
303 return val;
304}
305 142
306static void native_load_tr_desc(void) 143 return 5;
307{
308 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
309} 144}
310 145
311static void native_load_gdt(const struct Xgt_desc_struct *dtr) 146unsigned paravirt_patch_jmp(void *target, void *site, unsigned len)
312{ 147{
313 asm volatile("lgdt %0"::"m" (*dtr)); 148 unsigned char *jmp = site;
314} 149 unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5);
315 150
316static void native_load_idt(const struct Xgt_desc_struct *dtr) 151 if (len < 5)
317{ 152 return len; /* call too long for patch site */
318 asm volatile("lidt %0"::"m" (*dtr));
319}
320 153
321static void native_store_gdt(struct Xgt_desc_struct *dtr) 154 *jmp++ = 0xe9; /* jmp */
322{ 155 *(unsigned long *)jmp = delta;
323 asm ("sgdt %0":"=m" (*dtr));
324}
325 156
326static void native_store_idt(struct Xgt_desc_struct *dtr) 157 return 5;
327{
328 asm ("sidt %0":"=m" (*dtr));
329} 158}
330 159
331static unsigned long native_store_tr(void) 160unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
332{ 161{
333 unsigned long tr; 162 void *opfunc = *((void **)&paravirt_ops + type);
334 asm ("str %0":"=r" (tr)); 163 unsigned ret;
335 return tr;
336}
337 164
338static void native_load_tls(struct thread_struct *t, unsigned int cpu) 165 if (opfunc == NULL)
339{ 166 /* If there's no function, patch it with a ud2a (BUG) */
340#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] 167 ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a);
341 C(0); C(1); C(2); 168 else if (opfunc == paravirt_nop)
342#undef C 169 /* If the operation is a nop, then nop the callsite */
343} 170 ret = paravirt_patch_nop();
171 else if (type == PARAVIRT_PATCH(iret) ||
172 type == PARAVIRT_PATCH(irq_enable_sysexit))
173 /* If operation requires a jmp, then jmp */
174 ret = paravirt_patch_jmp(opfunc, site, len);
175 else
176 /* Otherwise call the function; assume target could
177 clobber any caller-save reg */
178 ret = paravirt_patch_call(opfunc, CLBR_ANY,
179 site, clobbers, len);
344 180
345static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) 181 return ret;
346{
347 u32 *lp = (u32 *)((char *)dt + entry*8);
348 lp[0] = entry_low;
349 lp[1] = entry_high;
350} 182}
351 183
352static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) 184unsigned paravirt_patch_insns(void *site, unsigned len,
185 const char *start, const char *end)
353{ 186{
354 native_write_dt_entry(dt, entrynum, low, high); 187 unsigned insn_len = end - start;
355}
356 188
357static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) 189 if (insn_len > len || start == NULL)
358{ 190 insn_len = len;
359 native_write_dt_entry(dt, entrynum, low, high); 191 else
360} 192 memcpy(site, start, insn_len);
361
362static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
363{
364 native_write_dt_entry(dt, entrynum, low, high);
365}
366 193
367static void native_load_esp0(struct tss_struct *tss, 194 return insn_len;
368 struct thread_struct *thread)
369{
370 tss->esp0 = thread->esp0;
371
372 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
373 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
374 tss->ss1 = thread->sysenter_cs;
375 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
376 }
377} 195}
378 196
379static void native_io_delay(void) 197void init_IRQ(void)
380{ 198{
381 asm volatile("outb %al,$0x80"); 199 paravirt_ops.init_IRQ();
382} 200}
383 201
384static void native_flush_tlb(void) 202static void native_flush_tlb(void)
@@ -395,83 +213,11 @@ static void native_flush_tlb_global(void)
395 __native_flush_tlb_global(); 213 __native_flush_tlb_global();
396} 214}
397 215
398static void native_flush_tlb_single(u32 addr) 216static void native_flush_tlb_single(unsigned long addr)
399{ 217{
400 __native_flush_tlb_single(addr); 218 __native_flush_tlb_single(addr);
401} 219}
402 220
403#ifndef CONFIG_X86_PAE
404static void native_set_pte(pte_t *ptep, pte_t pteval)
405{
406 *ptep = pteval;
407}
408
409static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
410{
411 *ptep = pteval;
412}
413
414static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
415{
416 *pmdp = pmdval;
417}
418
419#else /* CONFIG_X86_PAE */
420
421static void native_set_pte(pte_t *ptep, pte_t pte)
422{
423 ptep->pte_high = pte.pte_high;
424 smp_wmb();
425 ptep->pte_low = pte.pte_low;
426}
427
428static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
429{
430 ptep->pte_high = pte.pte_high;
431 smp_wmb();
432 ptep->pte_low = pte.pte_low;
433}
434
435static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
436{
437 ptep->pte_low = 0;
438 smp_wmb();
439 ptep->pte_high = pte.pte_high;
440 smp_wmb();
441 ptep->pte_low = pte.pte_low;
442}
443
444static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
445{
446 set_64bit((unsigned long long *)ptep,pte_val(pteval));
447}
448
449static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
450{
451 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
452}
453
454static void native_set_pud(pud_t *pudp, pud_t pudval)
455{
456 *pudp = pudval;
457}
458
459static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
460{
461 ptep->pte_low = 0;
462 smp_wmb();
463 ptep->pte_high = 0;
464}
465
466static void native_pmd_clear(pmd_t *pmd)
467{
468 u32 *tmp = (u32 *)pmd;
469 *tmp = 0;
470 smp_wmb();
471 *(tmp + 1) = 0;
472}
473#endif /* CONFIG_X86_PAE */
474
475/* These are in entry.S */ 221/* These are in entry.S */
476extern void native_iret(void); 222extern void native_iret(void);
477extern void native_irq_enable_sysexit(void); 223extern void native_irq_enable_sysexit(void);
@@ -487,10 +233,11 @@ struct paravirt_ops paravirt_ops = {
487 .name = "bare hardware", 233 .name = "bare hardware",
488 .paravirt_enabled = 0, 234 .paravirt_enabled = 0,
489 .kernel_rpl = 0, 235 .kernel_rpl = 0,
236 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
490 237
491 .patch = native_patch, 238 .patch = native_patch,
492 .banner = default_banner, 239 .banner = default_banner,
493 .arch_setup = native_nop, 240 .arch_setup = paravirt_nop,
494 .memory_setup = machine_specific_memory_setup, 241 .memory_setup = machine_specific_memory_setup,
495 .get_wallclock = native_get_wallclock, 242 .get_wallclock = native_get_wallclock,
496 .set_wallclock = native_set_wallclock, 243 .set_wallclock = native_set_wallclock,
@@ -517,8 +264,8 @@ struct paravirt_ops paravirt_ops = {
517 .safe_halt = native_safe_halt, 264 .safe_halt = native_safe_halt,
518 .halt = native_halt, 265 .halt = native_halt,
519 .wbinvd = native_wbinvd, 266 .wbinvd = native_wbinvd,
520 .read_msr = native_read_msr, 267 .read_msr = native_read_msr_safe,
521 .write_msr = native_write_msr, 268 .write_msr = native_write_msr_safe,
522 .read_tsc = native_read_tsc, 269 .read_tsc = native_read_tsc,
523 .read_pmc = native_read_pmc, 270 .read_pmc = native_read_pmc,
524 .get_scheduled_cycles = native_read_tsc, 271 .get_scheduled_cycles = native_read_tsc,
@@ -531,9 +278,9 @@ struct paravirt_ops paravirt_ops = {
531 .store_idt = native_store_idt, 278 .store_idt = native_store_idt,
532 .store_tr = native_store_tr, 279 .store_tr = native_store_tr,
533 .load_tls = native_load_tls, 280 .load_tls = native_load_tls,
534 .write_ldt_entry = native_write_ldt_entry, 281 .write_ldt_entry = write_dt_entry,
535 .write_gdt_entry = native_write_gdt_entry, 282 .write_gdt_entry = write_dt_entry,
536 .write_idt_entry = native_write_idt_entry, 283 .write_idt_entry = write_dt_entry,
537 .load_esp0 = native_load_esp0, 284 .load_esp0 = native_load_esp0,
538 285
539 .set_iopl_mask = native_set_iopl_mask, 286 .set_iopl_mask = native_set_iopl_mask,
@@ -545,44 +292,57 @@ struct paravirt_ops paravirt_ops = {
545 .apic_read = native_apic_read, 292 .apic_read = native_apic_read,
546 .setup_boot_clock = setup_boot_APIC_clock, 293 .setup_boot_clock = setup_boot_APIC_clock,
547 .setup_secondary_clock = setup_secondary_APIC_clock, 294 .setup_secondary_clock = setup_secondary_APIC_clock,
295 .startup_ipi_hook = paravirt_nop,
548#endif 296#endif
549 .set_lazy_mode = (void *)native_nop, 297 .set_lazy_mode = paravirt_nop,
298
299 .pagetable_setup_start = native_pagetable_setup_start,
300 .pagetable_setup_done = native_pagetable_setup_done,
550 301
551 .flush_tlb_user = native_flush_tlb, 302 .flush_tlb_user = native_flush_tlb,
552 .flush_tlb_kernel = native_flush_tlb_global, 303 .flush_tlb_kernel = native_flush_tlb_global,
553 .flush_tlb_single = native_flush_tlb_single, 304 .flush_tlb_single = native_flush_tlb_single,
305 .flush_tlb_others = native_flush_tlb_others,
554 306
555 .map_pt_hook = (void *)native_nop, 307 .alloc_pt = paravirt_nop,
556 308 .alloc_pd = paravirt_nop,
557 .alloc_pt = (void *)native_nop, 309 .alloc_pd_clone = paravirt_nop,
558 .alloc_pd = (void *)native_nop, 310 .release_pt = paravirt_nop,
559 .alloc_pd_clone = (void *)native_nop, 311 .release_pd = paravirt_nop,
560 .release_pt = (void *)native_nop,
561 .release_pd = (void *)native_nop,
562 312
563 .set_pte = native_set_pte, 313 .set_pte = native_set_pte,
564 .set_pte_at = native_set_pte_at, 314 .set_pte_at = native_set_pte_at,
565 .set_pmd = native_set_pmd, 315 .set_pmd = native_set_pmd,
566 .pte_update = (void *)native_nop, 316 .pte_update = paravirt_nop,
567 .pte_update_defer = (void *)native_nop, 317 .pte_update_defer = paravirt_nop,
318
319#ifdef CONFIG_HIGHPTE
320 .kmap_atomic_pte = kmap_atomic,
321#endif
322
568#ifdef CONFIG_X86_PAE 323#ifdef CONFIG_X86_PAE
569 .set_pte_atomic = native_set_pte_atomic, 324 .set_pte_atomic = native_set_pte_atomic,
570 .set_pte_present = native_set_pte_present, 325 .set_pte_present = native_set_pte_present,
571 .set_pud = native_set_pud, 326 .set_pud = native_set_pud,
572 .pte_clear = native_pte_clear, 327 .pte_clear = native_pte_clear,
573 .pmd_clear = native_pmd_clear, 328 .pmd_clear = native_pmd_clear,
329
330 .pmd_val = native_pmd_val,
331 .make_pmd = native_make_pmd,
574#endif 332#endif
575 333
334 .pte_val = native_pte_val,
335 .pgd_val = native_pgd_val,
336
337 .make_pte = native_make_pte,
338 .make_pgd = native_make_pgd,
339
576 .irq_enable_sysexit = native_irq_enable_sysexit, 340 .irq_enable_sysexit = native_irq_enable_sysexit,
577 .iret = native_iret, 341 .iret = native_iret,
578 342
579 .startup_ipi_hook = (void *)native_nop, 343 .dup_mmap = paravirt_nop,
344 .exit_mmap = paravirt_nop,
345 .activate_mm = paravirt_nop,
580}; 346};
581 347
582/* 348EXPORT_SYMBOL(paravirt_ops);
583 * NOTE: CONFIG_PARAVIRT is experimental and the paravirt_ops
584 * semantics are subject to change. Hence we only do this
585 * internal-only export of this, until it gets sorted out and
586 * all lowlevel CPU ops used by modules are separately exported.
587 */
588EXPORT_SYMBOL_GPL(paravirt_ops);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 393a67d5d94..61999479b7a 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -39,6 +39,7 @@
39#include <linux/random.h> 39#include <linux/random.h>
40#include <linux/personality.h> 40#include <linux/personality.h>
41#include <linux/tick.h> 41#include <linux/tick.h>
42#include <linux/percpu.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/pgtable.h> 45#include <asm/pgtable.h>
@@ -57,7 +58,6 @@
57 58
58#include <asm/tlbflush.h> 59#include <asm/tlbflush.h>
59#include <asm/cpu.h> 60#include <asm/cpu.h>
60#include <asm/pda.h>
61 61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 63
@@ -66,6 +66,12 @@ static int hlt_counter;
66unsigned long boot_option_idle_override = 0; 66unsigned long boot_option_idle_override = 0;
67EXPORT_SYMBOL(boot_option_idle_override); 67EXPORT_SYMBOL(boot_option_idle_override);
68 68
69DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
70EXPORT_PER_CPU_SYMBOL(current_task);
71
72DEFINE_PER_CPU(int, cpu_number);
73EXPORT_PER_CPU_SYMBOL(cpu_number);
74
69/* 75/*
70 * Return saved PC of a blocked thread. 76 * Return saved PC of a blocked thread.
71 */ 77 */
@@ -272,25 +278,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
272 } 278 }
273} 279}
274 280
275static int __init idle_setup (char *str) 281static int __init idle_setup(char *str)
276{ 282{
277 if (!strncmp(str, "poll", 4)) { 283 if (!strcmp(str, "poll")) {
278 printk("using polling idle threads.\n"); 284 printk("using polling idle threads.\n");
279 pm_idle = poll_idle; 285 pm_idle = poll_idle;
280#ifdef CONFIG_X86_SMP 286#ifdef CONFIG_X86_SMP
281 if (smp_num_siblings > 1) 287 if (smp_num_siblings > 1)
282 printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); 288 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
283#endif 289#endif
284 } else if (!strncmp(str, "halt", 4)) { 290 } else if (!strcmp(str, "mwait"))
285 printk("using halt in idle threads.\n"); 291 force_mwait = 1;
286 pm_idle = default_idle; 292 else
287 } 293 return -1;
288 294
289 boot_option_idle_override = 1; 295 boot_option_idle_override = 1;
290 return 1; 296 return 0;
291} 297}
292 298early_param("idle", idle_setup);
293__setup("idle=", idle_setup);
294 299
295void show_regs(struct pt_regs * regs) 300void show_regs(struct pt_regs * regs)
296{ 301{
@@ -343,7 +348,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
343 348
344 regs.xds = __USER_DS; 349 regs.xds = __USER_DS;
345 regs.xes = __USER_DS; 350 regs.xes = __USER_DS;
346 regs.xfs = __KERNEL_PDA; 351 regs.xfs = __KERNEL_PERCPU;
347 regs.orig_eax = -1; 352 regs.orig_eax = -1;
348 regs.eip = (unsigned long) kernel_thread_helper; 353 regs.eip = (unsigned long) kernel_thread_helper;
349 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 354 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -376,7 +381,7 @@ void exit_thread(void)
376 t->io_bitmap_max = 0; 381 t->io_bitmap_max = 0;
377 tss->io_bitmap_owner = NULL; 382 tss->io_bitmap_owner = NULL;
378 tss->io_bitmap_max = 0; 383 tss->io_bitmap_max = 0;
379 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 384 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
380 put_cpu(); 385 put_cpu();
381 } 386 }
382} 387}
@@ -555,7 +560,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
555 * Disable the bitmap via an invalid offset. We still cache 560 * Disable the bitmap via an invalid offset. We still cache
556 * the previous bitmap owner and the IO bitmap contents: 561 * the previous bitmap owner and the IO bitmap contents:
557 */ 562 */
558 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 563 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
559 return; 564 return;
560 } 565 }
561 566
@@ -565,7 +570,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
565 * matches the next task, we dont have to do anything but 570 * matches the next task, we dont have to do anything but
566 * to set a valid offset in the TSS: 571 * to set a valid offset in the TSS:
567 */ 572 */
568 tss->io_bitmap_base = IO_BITMAP_OFFSET; 573 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
569 return; 574 return;
570 } 575 }
571 /* 576 /*
@@ -577,7 +582,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
577 * redundant copies when the currently switched task does not 582 * redundant copies when the currently switched task does not
578 * perform any I/O during its timeslice. 583 * perform any I/O during its timeslice.
579 */ 584 */
580 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 585 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
581} 586}
582 587
583/* 588/*
@@ -712,7 +717,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
712 if (prev->gs | next->gs) 717 if (prev->gs | next->gs)
713 loadsegment(gs, next->gs); 718 loadsegment(gs, next->gs);
714 719
715 write_pda(pcurrent, next_p); 720 x86_write_percpu(current_task, next_p);
716 721
717 return prev_p; 722 return prev_p;
718} 723}
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 34874c398b4..9f6ab1789bb 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,12 +3,10 @@
3 */ 3 */
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/pci-direct.h>
7#include <asm/genapic.h>
8#include <asm/cpu.h>
9 6
10#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
11static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) 8
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
12{ 10{
13 u8 config, rev; 11 u8 config, rev;
14 u32 word; 12 u32 word;
@@ -16,12 +14,14 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
16 /* BIOS may enable hardware IRQ balancing for 14 /* BIOS may enable hardware IRQ balancing for
17 * E7520/E7320/E7525(revision ID 0x9 and below) 15 * E7520/E7320/E7525(revision ID 0x9 and below)
18 * based platforms. 16 * based platforms.
19 * For those platforms, make sure that the genapic is set to 'flat' 17 * Disable SW irqbalance/affinity on those platforms.
20 */ 18 */
21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
22 if (rev > 0x9) 20 if (rev > 0x9)
23 return; 21 return;
24 22
23 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
24
25 /* enable access to config space*/ 25 /* enable access to config space*/
26 pci_read_config_byte(dev, 0xf4, &config); 26 pci_read_config_byte(dev, 0xf4, &config);
27 pci_write_config_byte(dev, 0xf4, config|0x2); 27 pci_write_config_byte(dev, 0xf4, config|0x2);
@@ -30,44 +30,6 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
31 31
32 if (!(word & (1 << 13))) { 32 if (!(word & (1 << 13))) {
33#ifdef CONFIG_X86_64
34 if (genapic != &apic_flat)
35 panic("APIC mode must be flat on this system\n");
36#elif defined(CONFIG_X86_GENERICARCH)
37 if (genapic != &apic_default)
38 panic("APIC mode must be default(flat) on this system. Use apic=default\n");
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46
47void __init quirk_intel_irqbalance(void)
48{
49 u8 config, rev;
50 u32 word;
51
52 /* BIOS may enable hardware IRQ balancing for
53 * E7520/E7320/E7525(revision ID 0x9 and below)
54 * based platforms.
55 * Disable SW irqbalance/affinity on those platforms.
56 */
57 rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
58 if (rev > 0x9)
59 return;
60
61 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
62
63 /* enable access to config space */
64 config = read_pci_config_byte(0, 0, 0, 0xf4);
65 write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
66
67 /* read xTPR register */
68 word = read_pci_config_16(0, 0, 0x40, 0x4c);
69
70 if (!(word & (1 << 13))) {
71 printk(KERN_INFO "Disabling irq balancing and affinity\n"); 33 printk(KERN_INFO "Disabling irq balancing and affinity\n");
72#ifdef CONFIG_IRQBALANCE 34#ifdef CONFIG_IRQBALANCE
73 irqbalance_disable(""); 35 irqbalance_disable("");
@@ -76,24 +38,13 @@ void __init quirk_intel_irqbalance(void)
76#ifdef CONFIG_PROC_FS 38#ifdef CONFIG_PROC_FS
77 no_irq_affinity = 1; 39 no_irq_affinity = 1;
78#endif 40#endif
79#ifdef CONFIG_HOTPLUG_CPU
80 printk(KERN_INFO "Disabling cpu hotplug control\n");
81 enable_cpu_hotplug = 0;
82#endif
83#ifdef CONFIG_X86_64
84 /* force the genapic selection to flat mode so that
85 * interrupts can be redirected to more than one CPU.
86 */
87 genapic_force = &apic_flat;
88#endif
89 } 41 }
90 42
91 /* put back the original value for config space */ 43 /* put back the original value for config space*/
92 if (!(config & 0x2)) 44 if (!(config & 0x2))
93 write_pci_config_byte(0, 0, 0, 0xf4, config); 45 pci_write_config_byte(dev, 0xf4, config);
94} 46}
95DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); 47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
96DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); 48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
97DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); 49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
98
99#endif 50#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 3514b4153f7..50dfc65319c 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -17,7 +17,8 @@
17#include <asm/apic.h> 17#include <asm/apic.h>
18#include <asm/desc.h> 18#include <asm/desc.h>
19#include "mach_reboot.h" 19#include "mach_reboot.h"
20#include <linux/reboot_fixups.h> 20#include <asm/reboot_fixups.h>
21#include <asm/reboot.h>
21 22
22/* 23/*
23 * Power off function, if any 24 * Power off function, if any
@@ -197,8 +198,6 @@ static unsigned char jump_to_bios [] =
197 */ 198 */
198void machine_real_restart(unsigned char *code, int length) 199void machine_real_restart(unsigned char *code, int length)
199{ 200{
200 unsigned long flags;
201
202 local_irq_disable(); 201 local_irq_disable();
203 202
204 /* Write zero to CMOS register number 0x0f, which the BIOS POST 203 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -211,9 +210,9 @@ void machine_real_restart(unsigned char *code, int length)
211 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) 210 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
212 */ 211 */
213 212
214 spin_lock_irqsave(&rtc_lock, flags); 213 spin_lock(&rtc_lock);
215 CMOS_WRITE(0x00, 0x8f); 214 CMOS_WRITE(0x00, 0x8f);
216 spin_unlock_irqrestore(&rtc_lock, flags); 215 spin_unlock(&rtc_lock);
217 216
218 /* Remap the kernel at virtual address zero, as well as offset zero 217 /* Remap the kernel at virtual address zero, as well as offset zero
219 from the kernel segment. This assumes the kernel segment starts at 218 from the kernel segment. This assumes the kernel segment starts at
@@ -280,7 +279,7 @@ void machine_real_restart(unsigned char *code, int length)
280EXPORT_SYMBOL(machine_real_restart); 279EXPORT_SYMBOL(machine_real_restart);
281#endif 280#endif
282 281
283void machine_shutdown(void) 282static void native_machine_shutdown(void)
284{ 283{
285#ifdef CONFIG_SMP 284#ifdef CONFIG_SMP
286 int reboot_cpu_id; 285 int reboot_cpu_id;
@@ -316,7 +315,11 @@ void machine_shutdown(void)
316#endif 315#endif
317} 316}
318 317
319void machine_emergency_restart(void) 318void __attribute__((weak)) mach_reboot_fixups(void)
319{
320}
321
322static void native_machine_emergency_restart(void)
320{ 323{
321 if (!reboot_thru_bios) { 324 if (!reboot_thru_bios) {
322 if (efi_enabled) { 325 if (efi_enabled) {
@@ -340,17 +343,17 @@ void machine_emergency_restart(void)
340 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 343 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
341} 344}
342 345
343void machine_restart(char * __unused) 346static void native_machine_restart(char * __unused)
344{ 347{
345 machine_shutdown(); 348 machine_shutdown();
346 machine_emergency_restart(); 349 machine_emergency_restart();
347} 350}
348 351
349void machine_halt(void) 352static void native_machine_halt(void)
350{ 353{
351} 354}
352 355
353void machine_power_off(void) 356static void native_machine_power_off(void)
354{ 357{
355 if (pm_power_off) { 358 if (pm_power_off) {
356 machine_shutdown(); 359 machine_shutdown();
@@ -359,3 +362,35 @@ void machine_power_off(void)
359} 362}
360 363
361 364
365struct machine_ops machine_ops = {
366 .power_off = native_machine_power_off,
367 .shutdown = native_machine_shutdown,
368 .emergency_restart = native_machine_emergency_restart,
369 .restart = native_machine_restart,
370 .halt = native_machine_halt,
371};
372
373void machine_power_off(void)
374{
375 machine_ops.power_off();
376}
377
378void machine_shutdown(void)
379{
380 machine_ops.shutdown();
381}
382
383void machine_emergency_restart(void)
384{
385 machine_ops.emergency_restart();
386}
387
388void machine_restart(char *cmd)
389{
390 machine_ops.restart(cmd);
391}
392
393void machine_halt(void)
394{
395 machine_ops.halt();
396}
diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c
index 99aab41a05b..2d78d918340 100644
--- a/arch/i386/kernel/reboot_fixups.c
+++ b/arch/i386/kernel/reboot_fixups.c
@@ -10,7 +10,7 @@
10 10
11#include <asm/delay.h> 11#include <asm/delay.h>
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14 14
15static void cs5530a_warm_reset(struct pci_dev *dev) 15static void cs5530a_warm_reset(struct pci_dev *dev)
16{ 16{
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 0e8977871b1..89a45a9ddcd 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -165,20 +165,20 @@ void fastcall send_IPI_self(int vector)
165} 165}
166 166
167/* 167/*
168 * This is only used on smaller machines. 168 * This is used to send an IPI with no shorthand notation (the destination is
169 * specified in bits 56 to 63 of the ICR).
169 */ 170 */
170void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) 171static inline void __send_IPI_dest_field(unsigned long mask, int vector)
171{ 172{
172 unsigned long mask = cpus_addr(cpumask)[0];
173 unsigned long cfg; 173 unsigned long cfg;
174 unsigned long flags;
175 174
176 local_irq_save(flags);
177 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
178 /* 175 /*
179 * Wait for idle. 176 * Wait for idle.
180 */ 177 */
181 apic_wait_icr_idle(); 178 if (unlikely(vector == NMI_VECTOR))
179 safe_apic_wait_icr_idle();
180 else
181 apic_wait_icr_idle();
182 182
183 /* 183 /*
184 * prepare target chip field 184 * prepare target chip field
@@ -195,13 +195,25 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
195 * Send the IPI. The write to APIC_ICR fires this off. 195 * Send the IPI. The write to APIC_ICR fires this off.
196 */ 196 */
197 apic_write_around(APIC_ICR, cfg); 197 apic_write_around(APIC_ICR, cfg);
198}
199
200/*
201 * This is only used on smaller machines.
202 */
203void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
204{
205 unsigned long mask = cpus_addr(cpumask)[0];
206 unsigned long flags;
198 207
208 local_irq_save(flags);
209 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
210 __send_IPI_dest_field(mask, vector);
199 local_irq_restore(flags); 211 local_irq_restore(flags);
200} 212}
201 213
202void send_IPI_mask_sequence(cpumask_t mask, int vector) 214void send_IPI_mask_sequence(cpumask_t mask, int vector)
203{ 215{
204 unsigned long cfg, flags; 216 unsigned long flags;
205 unsigned int query_cpu; 217 unsigned int query_cpu;
206 218
207 /* 219 /*
@@ -211,30 +223,10 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
211 */ 223 */
212 224
213 local_irq_save(flags); 225 local_irq_save(flags);
214
215 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { 226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
216 if (cpu_isset(query_cpu, mask)) { 227 if (cpu_isset(query_cpu, mask)) {
217 228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
218 /* 229 vector);
219 * Wait for idle.
220 */
221 apic_wait_icr_idle();
222
223 /*
224 * prepare target chip field
225 */
226 cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
227 apic_write_around(APIC_ICR2, cfg);
228
229 /*
230 * program the ICR
231 */
232 cfg = __prepare_ICR(0, vector);
233
234 /*
235 * Send the IPI. The write to APIC_ICR fires this off.
236 */
237 apic_write_around(APIC_ICR, cfg);
238 } 230 }
239 } 231 }
240 local_irq_restore(flags); 232 local_irq_restore(flags);
@@ -256,7 +248,6 @@ static cpumask_t flush_cpumask;
256static struct mm_struct * flush_mm; 248static struct mm_struct * flush_mm;
257static unsigned long flush_va; 249static unsigned long flush_va;
258static DEFINE_SPINLOCK(tlbstate_lock); 250static DEFINE_SPINLOCK(tlbstate_lock);
259#define FLUSH_ALL 0xffffffff
260 251
261/* 252/*
262 * We cannot call mmdrop() because we are in interrupt context, 253 * We cannot call mmdrop() because we are in interrupt context,
@@ -338,7 +329,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
338 329
339 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { 330 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
340 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { 331 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
341 if (flush_va == FLUSH_ALL) 332 if (flush_va == TLB_FLUSH_ALL)
342 local_flush_tlb(); 333 local_flush_tlb();
343 else 334 else
344 __flush_tlb_one(flush_va); 335 __flush_tlb_one(flush_va);
@@ -353,9 +344,11 @@ out:
353 put_cpu_no_resched(); 344 put_cpu_no_resched();
354} 345}
355 346
356static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 347void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
357 unsigned long va) 348 unsigned long va)
358{ 349{
350 cpumask_t cpumask = *cpumaskp;
351
359 /* 352 /*
360 * A couple of (to be removed) sanity checks: 353 * A couple of (to be removed) sanity checks:
361 * 354 *
@@ -366,10 +359,12 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
366 BUG_ON(cpu_isset(smp_processor_id(), cpumask)); 359 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
367 BUG_ON(!mm); 360 BUG_ON(!mm);
368 361
362#ifdef CONFIG_HOTPLUG_CPU
369 /* If a CPU which we ran on has gone down, OK. */ 363 /* If a CPU which we ran on has gone down, OK. */
370 cpus_and(cpumask, cpumask, cpu_online_map); 364 cpus_and(cpumask, cpumask, cpu_online_map);
371 if (cpus_empty(cpumask)) 365 if (unlikely(cpus_empty(cpumask)))
372 return; 366 return;
367#endif
373 368
374 /* 369 /*
375 * i'm not happy about this global shared spinlock in the 370 * i'm not happy about this global shared spinlock in the
@@ -380,17 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
380 375
381 flush_mm = mm; 376 flush_mm = mm;
382 flush_va = va; 377 flush_va = va;
383#if NR_CPUS <= BITS_PER_LONG 378 cpus_or(flush_cpumask, cpumask, flush_cpumask);
384 atomic_set_mask(cpumask, &flush_cpumask);
385#else
386 {
387 int k;
388 unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
389 unsigned long *cpu_mask = (unsigned long *)&cpumask;
390 for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
391 atomic_set_mask(cpu_mask[k], &flush_mask[k]);
392 }
393#endif
394 /* 379 /*
395 * We have to send the IPI only to 380 * We have to send the IPI only to
396 * CPUs affected. 381 * CPUs affected.
@@ -417,7 +402,7 @@ void flush_tlb_current_task(void)
417 402
418 local_flush_tlb(); 403 local_flush_tlb();
419 if (!cpus_empty(cpu_mask)) 404 if (!cpus_empty(cpu_mask))
420 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 405 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
421 preempt_enable(); 406 preempt_enable();
422} 407}
423 408
@@ -436,7 +421,7 @@ void flush_tlb_mm (struct mm_struct * mm)
436 leave_mm(smp_processor_id()); 421 leave_mm(smp_processor_id());
437 } 422 }
438 if (!cpus_empty(cpu_mask)) 423 if (!cpus_empty(cpu_mask))
439 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 424 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
440 425
441 preempt_enable(); 426 preempt_enable();
442} 427}
@@ -483,7 +468,7 @@ void flush_tlb_all(void)
483 * it goes straight through and wastes no time serializing 468 * it goes straight through and wastes no time serializing
484 * anything. Worst case is that we lose a reschedule ... 469 * anything. Worst case is that we lose a reschedule ...
485 */ 470 */
486void smp_send_reschedule(int cpu) 471void native_smp_send_reschedule(int cpu)
487{ 472{
488 WARN_ON(cpu_is_offline(cpu)); 473 WARN_ON(cpu_is_offline(cpu));
489 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 474 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
@@ -515,36 +500,78 @@ void unlock_ipi_call_lock(void)
515 500
516static struct call_data_struct *call_data; 501static struct call_data_struct *call_data;
517 502
503static void __smp_call_function(void (*func) (void *info), void *info,
504 int nonatomic, int wait)
505{
506 struct call_data_struct data;
507 int cpus = num_online_cpus() - 1;
508
509 if (!cpus)
510 return;
511
512 data.func = func;
513 data.info = info;
514 atomic_set(&data.started, 0);
515 data.wait = wait;
516 if (wait)
517 atomic_set(&data.finished, 0);
518
519 call_data = &data;
520 mb();
521
522 /* Send a message to all other CPUs and wait for them to respond */
523 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
524
525 /* Wait for response */
526 while (atomic_read(&data.started) != cpus)
527 cpu_relax();
528
529 if (wait)
530 while (atomic_read(&data.finished) != cpus)
531 cpu_relax();
532}
533
534
518/** 535/**
519 * smp_call_function(): Run a function on all other CPUs. 536 * smp_call_function_mask(): Run a function on a set of other CPUs.
537 * @mask: The set of cpus to run on. Must not include the current cpu.
520 * @func: The function to run. This must be fast and non-blocking. 538 * @func: The function to run. This must be fast and non-blocking.
521 * @info: An arbitrary pointer to pass to the function. 539 * @info: An arbitrary pointer to pass to the function.
522 * @nonatomic: currently unused.
523 * @wait: If true, wait (atomically) until function has completed on other CPUs. 540 * @wait: If true, wait (atomically) until function has completed on other CPUs.
524 * 541 *
525 * Returns 0 on success, else a negative status code. Does not return until 542 * Returns 0 on success, else a negative status code.
526 * remote CPUs are nearly ready to execute <<func>> or are or have executed. 543 *
544 * If @wait is true, then returns once @func has returned; otherwise
545 * it returns just before the target cpu calls @func.
527 * 546 *
528 * You must not call this function with disabled interrupts or from a 547 * You must not call this function with disabled interrupts or from a
529 * hardware interrupt handler or from a bottom half handler. 548 * hardware interrupt handler or from a bottom half handler.
530 */ 549 */
531int smp_call_function (void (*func) (void *info), void *info, int nonatomic, 550int native_smp_call_function_mask(cpumask_t mask,
532 int wait) 551 void (*func)(void *), void *info,
552 int wait)
533{ 553{
534 struct call_data_struct data; 554 struct call_data_struct data;
555 cpumask_t allbutself;
535 int cpus; 556 int cpus;
536 557
558 /* Can deadlock when called with interrupts disabled */
559 WARN_ON(irqs_disabled());
560
537 /* Holding any lock stops cpus from going down. */ 561 /* Holding any lock stops cpus from going down. */
538 spin_lock(&call_lock); 562 spin_lock(&call_lock);
539 cpus = num_online_cpus() - 1; 563
564 allbutself = cpu_online_map;
565 cpu_clear(smp_processor_id(), allbutself);
566
567 cpus_and(mask, mask, allbutself);
568 cpus = cpus_weight(mask);
569
540 if (!cpus) { 570 if (!cpus) {
541 spin_unlock(&call_lock); 571 spin_unlock(&call_lock);
542 return 0; 572 return 0;
543 } 573 }
544 574
545 /* Can deadlock when called with interrupts disabled */
546 WARN_ON(irqs_disabled());
547
548 data.func = func; 575 data.func = func;
549 data.info = info; 576 data.info = info;
550 atomic_set(&data.started, 0); 577 atomic_set(&data.started, 0);
@@ -554,9 +581,12 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
554 581
555 call_data = &data; 582 call_data = &data;
556 mb(); 583 mb();
557 584
558 /* Send a message to all other CPUs and wait for them to respond */ 585 /* Send a message to other CPUs */
559 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 586 if (cpus_equal(mask, allbutself))
587 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
588 else
589 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
560 590
561 /* Wait for response */ 591 /* Wait for response */
562 while (atomic_read(&data.started) != cpus) 592 while (atomic_read(&data.started) != cpus)
@@ -569,15 +599,68 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
569 599
570 return 0; 600 return 0;
571} 601}
602
603/**
604 * smp_call_function(): Run a function on all other CPUs.
605 * @func: The function to run. This must be fast and non-blocking.
606 * @info: An arbitrary pointer to pass to the function.
607 * @nonatomic: Unused.
608 * @wait: If true, wait (atomically) until function has completed on other CPUs.
609 *
610 * Returns 0 on success, else a negative status code.
611 *
612 * If @wait is true, then returns once @func has returned; otherwise
613 * it returns just before the target cpu calls @func.
614 *
615 * You must not call this function with disabled interrupts or from a
616 * hardware interrupt handler or from a bottom half handler.
617 */
618int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
619 int wait)
620{
621 return smp_call_function_mask(cpu_online_map, func, info, wait);
622}
572EXPORT_SYMBOL(smp_call_function); 623EXPORT_SYMBOL(smp_call_function);
573 624
625/**
626 * smp_call_function_single - Run a function on another CPU
627 * @cpu: The target CPU. Cannot be the calling CPU.
628 * @func: The function to run. This must be fast and non-blocking.
629 * @info: An arbitrary pointer to pass to the function.
630 * @nonatomic: Unused.
631 * @wait: If true, wait until function has completed on other CPUs.
632 *
633 * Returns 0 on success, else a negative status code.
634 *
635 * If @wait is true, then returns once @func has returned; otherwise
636 * it returns just before the target cpu calls @func.
637 */
638int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
639 int nonatomic, int wait)
640{
641 /* prevent preemption and reschedule on another processor */
642 int ret;
643 int me = get_cpu();
644 if (cpu == me) {
645 WARN_ON(1);
646 put_cpu();
647 return -EBUSY;
648 }
649
650 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
651
652 put_cpu();
653 return ret;
654}
655EXPORT_SYMBOL(smp_call_function_single);
656
574static void stop_this_cpu (void * dummy) 657static void stop_this_cpu (void * dummy)
575{ 658{
659 local_irq_disable();
576 /* 660 /*
577 * Remove this CPU: 661 * Remove this CPU:
578 */ 662 */
579 cpu_clear(smp_processor_id(), cpu_online_map); 663 cpu_clear(smp_processor_id(), cpu_online_map);
580 local_irq_disable();
581 disable_local_APIC(); 664 disable_local_APIC();
582 if (cpu_data[smp_processor_id()].hlt_works_ok) 665 if (cpu_data[smp_processor_id()].hlt_works_ok)
583 for(;;) halt(); 666 for(;;) halt();
@@ -588,13 +671,18 @@ static void stop_this_cpu (void * dummy)
588 * this function calls the 'stop' function on all other CPUs in the system. 671 * this function calls the 'stop' function on all other CPUs in the system.
589 */ 672 */
590 673
591void smp_send_stop(void) 674void native_smp_send_stop(void)
592{ 675{
593 smp_call_function(stop_this_cpu, NULL, 1, 0); 676 /* Don't deadlock on the call lock in panic */
677 int nolock = !spin_trylock(&call_lock);
678 unsigned long flags;
594 679
595 local_irq_disable(); 680 local_irq_save(flags);
681 __smp_call_function(stop_this_cpu, NULL, 0, 0);
682 if (!nolock)
683 spin_unlock(&call_lock);
596 disable_local_APIC(); 684 disable_local_APIC();
597 local_irq_enable(); 685 local_irq_restore(flags);
598} 686}
599 687
600/* 688/*
@@ -633,77 +721,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
633 } 721 }
634} 722}
635 723
636/*
637 * this function sends a 'generic call function' IPI to one other CPU
638 * in the system.
639 *
640 * cpu is a standard Linux logical CPU number.
641 */
642static void
643__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
644 int nonatomic, int wait)
645{
646 struct call_data_struct data;
647 int cpus = 1;
648
649 data.func = func;
650 data.info = info;
651 atomic_set(&data.started, 0);
652 data.wait = wait;
653 if (wait)
654 atomic_set(&data.finished, 0);
655
656 call_data = &data;
657 wmb();
658 /* Send a message to all other CPUs and wait for them to respond */
659 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
660
661 /* Wait for response */
662 while (atomic_read(&data.started) != cpus)
663 cpu_relax();
664
665 if (!wait)
666 return;
667
668 while (atomic_read(&data.finished) != cpus)
669 cpu_relax();
670}
671
672/*
673 * smp_call_function_single - Run a function on another CPU
674 * @func: The function to run. This must be fast and non-blocking.
675 * @info: An arbitrary pointer to pass to the function.
676 * @nonatomic: Currently unused.
677 * @wait: If true, wait until function has completed on other CPUs.
678 *
679 * Retrurns 0 on success, else a negative status code.
680 *
681 * Does not return until the remote CPU is nearly ready to execute <func>
682 * or is or has executed.
683 */
684
685int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
686 int nonatomic, int wait)
687{
688 /* prevent preemption and reschedule on another processor */
689 int me = get_cpu();
690 if (cpu == me) {
691 WARN_ON(1);
692 put_cpu();
693 return -EBUSY;
694 }
695
696 /* Can deadlock when called with interrupts disabled */
697 WARN_ON(irqs_disabled());
698
699 spin_lock_bh(&call_lock);
700 __smp_call_function_single(cpu, func, info, nonatomic, wait);
701 spin_unlock_bh(&call_lock);
702 put_cpu();
703 return 0;
704}
705EXPORT_SYMBOL(smp_call_function_single);
706
707static int convert_apicid_to_cpu(int apic_id) 724static int convert_apicid_to_cpu(int apic_id)
708{ 725{
709 int i; 726 int i;
@@ -730,3 +747,14 @@ int safe_smp_processor_id(void)
730 747
731 return cpuid >= 0 ? cpuid : 0; 748 return cpuid >= 0 ? cpuid : 0;
732} 749}
750
751struct smp_ops smp_ops = {
752 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
753 .smp_prepare_cpus = native_smp_prepare_cpus,
754 .cpu_up = native_cpu_up,
755 .smp_cpus_done = native_smp_cpus_done,
756
757 .smp_send_stop = native_smp_send_stop,
758 .smp_send_reschedule = native_smp_send_reschedule,
759 .smp_call_function_mask = native_smp_call_function_mask,
760};
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4ff55e67557..a4b7ad283f4 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -53,13 +53,12 @@
53#include <asm/desc.h> 53#include <asm/desc.h>
54#include <asm/arch_hooks.h> 54#include <asm/arch_hooks.h>
55#include <asm/nmi.h> 55#include <asm/nmi.h>
56#include <asm/pda.h>
57#include <asm/genapic.h>
58 56
59#include <mach_apic.h> 57#include <mach_apic.h>
60#include <mach_wakecpu.h> 58#include <mach_wakecpu.h>
61#include <smpboot_hooks.h> 59#include <smpboot_hooks.h>
62#include <asm/vmi.h> 60#include <asm/vmi.h>
61#include <asm/mtrr.h>
63 62
64/* Set if we find a B stepping CPU */ 63/* Set if we find a B stepping CPU */
65static int __devinitdata smp_b_stepping; 64static int __devinitdata smp_b_stepping;
@@ -100,6 +99,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
100 99
101u8 apicid_2_node[MAX_APICID]; 100u8 apicid_2_node[MAX_APICID];
102 101
102DEFINE_PER_CPU(unsigned long, this_cpu_off);
103EXPORT_PER_CPU_SYMBOL(this_cpu_off);
104
103/* 105/*
104 * Trampoline 80x86 program as an array. 106 * Trampoline 80x86 program as an array.
105 */ 107 */
@@ -156,7 +158,7 @@ static void __cpuinit smp_store_cpu_info(int id)
156 158
157 *c = boot_cpu_data; 159 *c = boot_cpu_data;
158 if (id!=0) 160 if (id!=0)
159 identify_cpu(c); 161 identify_secondary_cpu(c);
160 /* 162 /*
161 * Mask B, Pentium, but not Pentium MMX 163 * Mask B, Pentium, but not Pentium MMX
162 */ 164 */
@@ -379,14 +381,14 @@ set_cpu_sibling_map(int cpu)
379static void __cpuinit start_secondary(void *unused) 381static void __cpuinit start_secondary(void *unused)
380{ 382{
381 /* 383 /*
382 * Don't put *anything* before secondary_cpu_init(), SMP 384 * Don't put *anything* before cpu_init(), SMP booting is too
383 * booting is too fragile that we want to limit the 385 * fragile that we want to limit the things done here to the
384 * things done here to the most necessary things. 386 * most necessary things.
385 */ 387 */
386#ifdef CONFIG_VMI 388#ifdef CONFIG_VMI
387 vmi_bringup(); 389 vmi_bringup();
388#endif 390#endif
389 secondary_cpu_init(); 391 cpu_init();
390 preempt_disable(); 392 preempt_disable();
391 smp_callin(); 393 smp_callin();
392 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 394 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -441,12 +443,6 @@ static void __cpuinit start_secondary(void *unused)
441void __devinit initialize_secondary(void) 443void __devinit initialize_secondary(void)
442{ 444{
443 /* 445 /*
444 * switch to the per CPU GDT we already set up
445 * in do_boot_cpu()
446 */
447 cpu_set_gdt(current_thread_info()->cpu);
448
449 /*
450 * We don't actually need to load the full TSS, 446 * We don't actually need to load the full TSS,
451 * basically just the stack pointer and the eip. 447 * basically just the stack pointer and the eip.
452 */ 448 */
@@ -463,7 +459,6 @@ extern struct {
463 void * esp; 459 void * esp;
464 unsigned short ss; 460 unsigned short ss;
465} stack_start; 461} stack_start;
466extern struct i386_pda *start_pda;
467 462
468#ifdef CONFIG_NUMA 463#ifdef CONFIG_NUMA
469 464
@@ -521,12 +516,12 @@ static void unmap_cpu_to_logical_apicid(int cpu)
521 unmap_cpu_to_node(cpu); 516 unmap_cpu_to_node(cpu);
522} 517}
523 518
524#if APIC_DEBUG
525static inline void __inquire_remote_apic(int apicid) 519static inline void __inquire_remote_apic(int apicid)
526{ 520{
527 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 521 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
528 char *names[] = { "ID", "VERSION", "SPIV" }; 522 char *names[] = { "ID", "VERSION", "SPIV" };
529 int timeout, status; 523 int timeout;
524 unsigned long status;
530 525
531 printk("Inquiring remote APIC #%d...\n", apicid); 526 printk("Inquiring remote APIC #%d...\n", apicid);
532 527
@@ -536,7 +531,9 @@ static inline void __inquire_remote_apic(int apicid)
536 /* 531 /*
537 * Wait for idle. 532 * Wait for idle.
538 */ 533 */
539 apic_wait_icr_idle(); 534 status = safe_apic_wait_icr_idle();
535 if (status)
536 printk("a previous APIC delivery may have failed\n");
540 537
541 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 538 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
542 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 539 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -550,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid)
550 switch (status) { 547 switch (status) {
551 case APIC_ICR_RR_VALID: 548 case APIC_ICR_RR_VALID:
552 status = apic_read(APIC_RRR); 549 status = apic_read(APIC_RRR);
553 printk("%08x\n", status); 550 printk("%lx\n", status);
554 break; 551 break;
555 default: 552 default:
556 printk("failed\n"); 553 printk("failed\n");
557 } 554 }
558 } 555 }
559} 556}
560#endif
561 557
562#ifdef WAKE_SECONDARY_VIA_NMI 558#ifdef WAKE_SECONDARY_VIA_NMI
563/* 559/*
@@ -568,8 +564,8 @@ static inline void __inquire_remote_apic(int apicid)
568static int __devinit 564static int __devinit
569wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) 565wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
570{ 566{
571 unsigned long send_status = 0, accept_status = 0; 567 unsigned long send_status, accept_status = 0;
572 int timeout, maxlvt; 568 int maxlvt;
573 569
574 /* Target chip */ 570 /* Target chip */
575 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 571 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
@@ -579,12 +575,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 575 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
580 576
581 Dprintk("Waiting for send to finish...\n"); 577 Dprintk("Waiting for send to finish...\n");
582 timeout = 0; 578 send_status = safe_apic_wait_icr_idle();
583 do {
584 Dprintk("+");
585 udelay(100);
586 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
587 } while (send_status && (timeout++ < 1000));
588 579
589 /* 580 /*
590 * Give the other CPU some time to accept the IPI. 581 * Give the other CPU some time to accept the IPI.
@@ -614,8 +605,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
614static int __devinit 605static int __devinit
615wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) 606wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
616{ 607{
617 unsigned long send_status = 0, accept_status = 0; 608 unsigned long send_status, accept_status = 0;
618 int maxlvt, timeout, num_starts, j; 609 int maxlvt, num_starts, j;
619 610
620 /* 611 /*
621 * Be paranoid about clearing APIC errors. 612 * Be paranoid about clearing APIC errors.
@@ -640,12 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
640 | APIC_DM_INIT); 631 | APIC_DM_INIT);
641 632
642 Dprintk("Waiting for send to finish...\n"); 633 Dprintk("Waiting for send to finish...\n");
643 timeout = 0; 634 send_status = safe_apic_wait_icr_idle();
644 do {
645 Dprintk("+");
646 udelay(100);
647 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
648 } while (send_status && (timeout++ < 1000));
649 635
650 mdelay(10); 636 mdelay(10);
651 637
@@ -658,12 +644,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
658 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 644 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
659 645
660 Dprintk("Waiting for send to finish...\n"); 646 Dprintk("Waiting for send to finish...\n");
661 timeout = 0; 647 send_status = safe_apic_wait_icr_idle();
662 do {
663 Dprintk("+");
664 udelay(100);
665 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
666 } while (send_status && (timeout++ < 1000));
667 648
668 atomic_set(&init_deasserted, 1); 649 atomic_set(&init_deasserted, 1);
669 650
@@ -719,12 +700,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
719 Dprintk("Startup point 1.\n"); 700 Dprintk("Startup point 1.\n");
720 701
721 Dprintk("Waiting for send to finish...\n"); 702 Dprintk("Waiting for send to finish...\n");
722 timeout = 0; 703 send_status = safe_apic_wait_icr_idle();
723 do {
724 Dprintk("+");
725 udelay(100);
726 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
727 } while (send_status && (timeout++ < 1000));
728 704
729 /* 705 /*
730 * Give the other CPU some time to accept the IPI. 706 * Give the other CPU some time to accept the IPI.
@@ -788,6 +764,25 @@ static inline struct task_struct * alloc_idle_task(int cpu)
788#define alloc_idle_task(cpu) fork_idle(cpu) 764#define alloc_idle_task(cpu) fork_idle(cpu)
789#endif 765#endif
790 766
767/* Initialize the CPU's GDT. This is either the boot CPU doing itself
768 (still using the master per-cpu area), or a CPU doing it for a
769 secondary which will soon come up. */
770static __cpuinit void init_gdt(int cpu)
771{
772 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
773
774 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
775 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
776 __per_cpu_offset[cpu], 0xFFFFF,
777 0x80 | DESCTYPE_S | 0x2, 0x8);
778
779 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
780 per_cpu(cpu_number, cpu) = cpu;
781}
782
783/* Defined in head.S */
784extern struct Xgt_desc_struct early_gdt_descr;
785
791static int __cpuinit do_boot_cpu(int apicid, int cpu) 786static int __cpuinit do_boot_cpu(int apicid, int cpu)
792/* 787/*
793 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 788 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -802,6 +797,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
802 unsigned short nmi_high = 0, nmi_low = 0; 797 unsigned short nmi_high = 0, nmi_low = 0;
803 798
804 /* 799 /*
800 * Save current MTRR state in case it was changed since early boot
801 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
802 */
803 mtrr_save_state();
804
805 /*
805 * We can't use kernel_thread since we must avoid to 806 * We can't use kernel_thread since we must avoid to
806 * reschedule the child. 807 * reschedule the child.
807 */ 808 */
@@ -809,13 +810,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
809 if (IS_ERR(idle)) 810 if (IS_ERR(idle))
810 panic("failed fork for CPU %d", cpu); 811 panic("failed fork for CPU %d", cpu);
811 812
812 /* Pre-allocate and initialize the CPU's GDT and PDA so it 813 init_gdt(cpu);
813 doesn't have to do any memory allocation during the 814 per_cpu(current_task, cpu) = idle;
814 delicate CPU-bringup phase. */ 815 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
815 if (!init_gdt(cpu, idle)) {
816 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
817 return -1; /* ? */
818 }
819 816
820 idle->thread.eip = (unsigned long) start_secondary; 817 idle->thread.eip = (unsigned long) start_secondary;
821 /* start_eip had better be page-aligned! */ 818 /* start_eip had better be page-aligned! */
@@ -941,7 +938,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
941 DECLARE_COMPLETION_ONSTACK(done); 938 DECLARE_COMPLETION_ONSTACK(done);
942 struct warm_boot_cpu_info info; 939 struct warm_boot_cpu_info info;
943 int apicid, ret; 940 int apicid, ret;
944 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
945 941
946 apicid = x86_cpu_to_apicid[cpu]; 942 apicid = x86_cpu_to_apicid[cpu];
947 if (apicid == BAD_APICID) { 943 if (apicid == BAD_APICID) {
@@ -949,18 +945,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
949 goto exit; 945 goto exit;
950 } 946 }
951 947
952 /*
953 * the CPU isn't initialized at boot time, allocate gdt table here.
954 * cpu_init will initialize it
955 */
956 if (!cpu_gdt_descr->address) {
957 cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
958 if (!cpu_gdt_descr->address)
959 printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
960 ret = -ENOMEM;
961 goto exit;
962 }
963
964 info.complete = &done; 948 info.complete = &done;
965 info.apicid = apicid; 949 info.apicid = apicid;
966 info.cpu = cpu; 950 info.cpu = cpu;
@@ -1173,7 +1157,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1173 1157
1174/* These are wrappers to interface to the new boot process. Someone 1158/* These are wrappers to interface to the new boot process. Someone
1175 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ 1159 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
1176void __init smp_prepare_cpus(unsigned int max_cpus) 1160void __init native_smp_prepare_cpus(unsigned int max_cpus)
1177{ 1161{
1178 smp_commenced_mask = cpumask_of_cpu(0); 1162 smp_commenced_mask = cpumask_of_cpu(0);
1179 cpu_callin_map = cpumask_of_cpu(0); 1163 cpu_callin_map = cpumask_of_cpu(0);
@@ -1181,13 +1165,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
1181 smp_boot_cpus(max_cpus); 1165 smp_boot_cpus(max_cpus);
1182} 1166}
1183 1167
1184void __devinit smp_prepare_boot_cpu(void) 1168void __init native_smp_prepare_boot_cpu(void)
1185{ 1169{
1186 cpu_set(smp_processor_id(), cpu_online_map); 1170 unsigned int cpu = smp_processor_id();
1187 cpu_set(smp_processor_id(), cpu_callout_map); 1171
1188 cpu_set(smp_processor_id(), cpu_present_map); 1172 init_gdt(cpu);
1189 cpu_set(smp_processor_id(), cpu_possible_map); 1173 switch_to_new_gdt();
1190 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 1174
1175 cpu_set(cpu, cpu_online_map);
1176 cpu_set(cpu, cpu_callout_map);
1177 cpu_set(cpu, cpu_present_map);
1178 cpu_set(cpu, cpu_possible_map);
1179 __get_cpu_var(cpu_state) = CPU_ONLINE;
1191} 1180}
1192 1181
1193#ifdef CONFIG_HOTPLUG_CPU 1182#ifdef CONFIG_HOTPLUG_CPU
@@ -1277,7 +1266,7 @@ void __cpu_die(unsigned int cpu)
1277} 1266}
1278#endif /* CONFIG_HOTPLUG_CPU */ 1267#endif /* CONFIG_HOTPLUG_CPU */
1279 1268
1280int __cpuinit __cpu_up(unsigned int cpu) 1269int __cpuinit native_cpu_up(unsigned int cpu)
1281{ 1270{
1282 unsigned long flags; 1271 unsigned long flags;
1283#ifdef CONFIG_HOTPLUG_CPU 1272#ifdef CONFIG_HOTPLUG_CPU
@@ -1319,15 +1308,10 @@ int __cpuinit __cpu_up(unsigned int cpu)
1319 touch_nmi_watchdog(); 1308 touch_nmi_watchdog();
1320 } 1309 }
1321 1310
1322#ifdef CONFIG_X86_GENERICARCH
1323 if (num_online_cpus() > 8 && genapic == &apic_default)
1324 panic("Default flat APIC routing can't be used with > 8 cpus\n");
1325#endif
1326
1327 return 0; 1311 return 0;
1328} 1312}
1329 1313
1330void __init smp_cpus_done(unsigned int max_cpus) 1314void __init native_smp_cpus_done(unsigned int max_cpus)
1331{ 1315{
1332#ifdef CONFIG_X86_IO_APIC 1316#ifdef CONFIG_X86_IO_APIC
1333 setup_ioapic_dest(); 1317 setup_ioapic_dest();
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 13ca54a85a1..ff4ee6f3326 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -22,16 +22,26 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/unistd.h> 24#include <asm/unistd.h>
25#include <asm/elf.h>
26#include <asm/tlbflush.h>
27
28enum {
29 VDSO_DISABLED = 0,
30 VDSO_ENABLED = 1,
31 VDSO_COMPAT = 2,
32};
33
34#ifdef CONFIG_COMPAT_VDSO
35#define VDSO_DEFAULT VDSO_COMPAT
36#else
37#define VDSO_DEFAULT VDSO_ENABLED
38#endif
25 39
26/* 40/*
27 * Should the kernel map a VDSO page into processes and pass its 41 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()? 42 * address down to glibc upon exec()?
29 */ 43 */
30#ifdef CONFIG_PARAVIRT 44unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
31unsigned int __read_mostly vdso_enabled = 0;
32#else
33unsigned int __read_mostly vdso_enabled = 1;
34#endif
35 45
36EXPORT_SYMBOL_GPL(vdso_enabled); 46EXPORT_SYMBOL_GPL(vdso_enabled);
37 47
@@ -46,6 +56,123 @@ __setup("vdso=", vdso_setup);
46 56
47extern asmlinkage void sysenter_entry(void); 57extern asmlinkage void sysenter_entry(void);
48 58
59static __init void reloc_symtab(Elf32_Ehdr *ehdr,
60 unsigned offset, unsigned size)
61{
62 Elf32_Sym *sym = (void *)ehdr + offset;
63 unsigned nsym = size / sizeof(*sym);
64 unsigned i;
65
66 for(i = 0; i < nsym; i++, sym++) {
67 if (sym->st_shndx == SHN_UNDEF ||
68 sym->st_shndx == SHN_ABS)
69 continue; /* skip */
70
71 if (sym->st_shndx > SHN_LORESERVE) {
72 printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
73 sym->st_shndx);
74 continue;
75 }
76
77 switch(ELF_ST_TYPE(sym->st_info)) {
78 case STT_OBJECT:
79 case STT_FUNC:
80 case STT_SECTION:
81 case STT_FILE:
82 sym->st_value += VDSO_HIGH_BASE;
83 }
84 }
85}
86
87static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
88{
89 Elf32_Dyn *dyn = (void *)ehdr + offset;
90
91 for(; dyn->d_tag != DT_NULL; dyn++)
92 switch(dyn->d_tag) {
93 case DT_PLTGOT:
94 case DT_HASH:
95 case DT_STRTAB:
96 case DT_SYMTAB:
97 case DT_RELA:
98 case DT_INIT:
99 case DT_FINI:
100 case DT_REL:
101 case DT_DEBUG:
102 case DT_JMPREL:
103 case DT_VERSYM:
104 case DT_VERDEF:
105 case DT_VERNEED:
106 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
107 /* definitely pointers needing relocation */
108 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
109 break;
110
111 case DT_ENCODING ... OLD_DT_LOOS-1:
112 case DT_LOOS ... DT_HIOS-1:
113 /* Tags above DT_ENCODING are pointers if
114 they're even */
115 if (dyn->d_tag >= DT_ENCODING &&
116 (dyn->d_tag & 1) == 0)
117 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
118 break;
119
120 case DT_VERDEFNUM:
121 case DT_VERNEEDNUM:
122 case DT_FLAGS_1:
123 case DT_RELACOUNT:
124 case DT_RELCOUNT:
125 case DT_VALRNGLO ... DT_VALRNGHI:
126 /* definitely not pointers */
127 break;
128
129 case OLD_DT_LOOS ... DT_LOOS-1:
130 case DT_HIOS ... DT_VALRNGLO-1:
131 default:
132 if (dyn->d_tag > DT_ENCODING)
133 printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
134 dyn->d_tag);
135 break;
136 }
137}
138
139static __init void relocate_vdso(Elf32_Ehdr *ehdr)
140{
141 Elf32_Phdr *phdr;
142 Elf32_Shdr *shdr;
143 int i;
144
145 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
146 !elf_check_arch(ehdr) ||
147 ehdr->e_type != ET_DYN);
148
149 ehdr->e_entry += VDSO_HIGH_BASE;
150
151 /* rebase phdrs */
152 phdr = (void *)ehdr + ehdr->e_phoff;
153 for (i = 0; i < ehdr->e_phnum; i++) {
154 phdr[i].p_vaddr += VDSO_HIGH_BASE;
155
156 /* relocate dynamic stuff */
157 if (phdr[i].p_type == PT_DYNAMIC)
158 reloc_dyn(ehdr, phdr[i].p_offset);
159 }
160
161 /* rebase sections */
162 shdr = (void *)ehdr + ehdr->e_shoff;
163 for(i = 0; i < ehdr->e_shnum; i++) {
164 if (!(shdr[i].sh_flags & SHF_ALLOC))
165 continue;
166
167 shdr[i].sh_addr += VDSO_HIGH_BASE;
168
169 if (shdr[i].sh_type == SHT_SYMTAB ||
170 shdr[i].sh_type == SHT_DYNSYM)
171 reloc_symtab(ehdr, shdr[i].sh_offset,
172 shdr[i].sh_size);
173 }
174}
175
49void enable_sep_cpu(void) 176void enable_sep_cpu(void)
50{ 177{
51 int cpu = get_cpu(); 178 int cpu = get_cpu();
@@ -56,14 +183,33 @@ void enable_sep_cpu(void)
56 return; 183 return;
57 } 184 }
58 185
59 tss->ss1 = __KERNEL_CS; 186 tss->x86_tss.ss1 = __KERNEL_CS;
60 tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; 187 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
61 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 188 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
62 wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0); 189 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
63 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); 190 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
64 put_cpu(); 191 put_cpu();
65} 192}
66 193
194static struct vm_area_struct gate_vma;
195
196static int __init gate_vma_init(void)
197{
198 gate_vma.vm_mm = NULL;
199 gate_vma.vm_start = FIXADDR_USER_START;
200 gate_vma.vm_end = FIXADDR_USER_END;
201 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
202 gate_vma.vm_page_prot = __P101;
203 /*
204 * Make sure the vDSO gets into every core dump.
205 * Dumping its contents makes post-mortem fully interpretable later
206 * without matching up the same kernel and hardware config to see
207 * what PC values meant.
208 */
209 gate_vma.vm_flags |= VM_ALWAYSDUMP;
210 return 0;
211}
212
67/* 213/*
68 * These symbols are defined by vsyscall.o to mark the bounds 214 * These symbols are defined by vsyscall.o to mark the bounds
69 * of the ELF DSO images included therein. 215 * of the ELF DSO images included therein.
@@ -72,31 +218,48 @@ extern const char vsyscall_int80_start, vsyscall_int80_end;
72extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; 218extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
73static struct page *syscall_pages[1]; 219static struct page *syscall_pages[1];
74 220
221static void map_compat_vdso(int map)
222{
223 static int vdso_mapped;
224
225 if (map == vdso_mapped)
226 return;
227
228 vdso_mapped = map;
229
230 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
231 map ? PAGE_READONLY_EXEC : PAGE_NONE);
232
233 /* flush stray tlbs */
234 flush_tlb_all();
235}
236
75int __init sysenter_setup(void) 237int __init sysenter_setup(void)
76{ 238{
77 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); 239 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
240 const void *vsyscall;
241 size_t vsyscall_len;
242
78 syscall_pages[0] = virt_to_page(syscall_page); 243 syscall_pages[0] = virt_to_page(syscall_page);
79 244
80#ifdef CONFIG_COMPAT_VDSO 245 gate_vma_init();
81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); 246
82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); 247 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
83#endif
84 248
85 if (!boot_cpu_has(X86_FEATURE_SEP)) { 249 if (!boot_cpu_has(X86_FEATURE_SEP)) {
86 memcpy(syscall_page, 250 vsyscall = &vsyscall_int80_start;
87 &vsyscall_int80_start, 251 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
88 &vsyscall_int80_end - &vsyscall_int80_start); 252 } else {
89 return 0; 253 vsyscall = &vsyscall_sysenter_start;
254 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
90 } 255 }
91 256
92 memcpy(syscall_page, 257 memcpy(syscall_page, vsyscall, vsyscall_len);
93 &vsyscall_sysenter_start, 258 relocate_vdso(syscall_page);
94 &vsyscall_sysenter_end - &vsyscall_sysenter_start);
95 259
96 return 0; 260 return 0;
97} 261}
98 262
99#ifndef CONFIG_COMPAT_VDSO
100/* Defined in vsyscall-sysenter.S */ 263/* Defined in vsyscall-sysenter.S */
101extern void SYSENTER_RETURN; 264extern void SYSENTER_RETURN;
102 265
@@ -105,36 +268,52 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
105{ 268{
106 struct mm_struct *mm = current->mm; 269 struct mm_struct *mm = current->mm;
107 unsigned long addr; 270 unsigned long addr;
108 int ret; 271 int ret = 0;
272 bool compat;
109 273
110 down_write(&mm->mmap_sem); 274 down_write(&mm->mmap_sem);
111 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
112 if (IS_ERR_VALUE(addr)) {
113 ret = addr;
114 goto up_fail;
115 }
116 275
117 /* 276 /* Test compat mode once here, in case someone
118 * MAYWRITE to allow gdb to COW and set breakpoints 277 changes it via sysctl */
119 * 278 compat = (vdso_enabled == VDSO_COMPAT);
120 * Make sure the vDSO gets into every core dump. 279
121 * Dumping its contents makes post-mortem fully interpretable later 280 map_compat_vdso(compat);
122 * without matching up the same kernel and hardware config to see 281
123 * what PC values meant. 282 if (compat)
124 */ 283 addr = VDSO_HIGH_BASE;
125 ret = install_special_mapping(mm, addr, PAGE_SIZE, 284 else {
126 VM_READ|VM_EXEC| 285 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
127 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 286 if (IS_ERR_VALUE(addr)) {
128 VM_ALWAYSDUMP, 287 ret = addr;
129 syscall_pages); 288 goto up_fail;
130 if (ret) 289 }
131 goto up_fail; 290
291 /*
292 * MAYWRITE to allow gdb to COW and set breakpoints
293 *
294 * Make sure the vDSO gets into every core dump.
295 * Dumping its contents makes post-mortem fully
296 * interpretable later without matching up the same
297 * kernel and hardware config to see what PC values
298 * meant.
299 */
300 ret = install_special_mapping(mm, addr, PAGE_SIZE,
301 VM_READ|VM_EXEC|
302 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
303 VM_ALWAYSDUMP,
304 syscall_pages);
305
306 if (ret)
307 goto up_fail;
308 }
132 309
133 current->mm->context.vdso = (void *)addr; 310 current->mm->context.vdso = (void *)addr;
134 current_thread_info()->sysenter_return = 311 current_thread_info()->sysenter_return =
135 (void *)VDSO_SYM(&SYSENTER_RETURN); 312 (void *)VDSO_SYM(&SYSENTER_RETURN);
136up_fail: 313
314 up_fail:
137 up_write(&mm->mmap_sem); 315 up_write(&mm->mmap_sem);
316
138 return ret; 317 return ret;
139} 318}
140 319
@@ -147,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma)
147 326
148struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 327struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
149{ 328{
329 struct mm_struct *mm = tsk->mm;
330
331 /* Check to see if this task was created in compat vdso mode */
332 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
333 return &gate_vma;
150 return NULL; 334 return NULL;
151} 335}
152 336
@@ -159,4 +343,3 @@ int in_gate_area_no_task(unsigned long addr)
159{ 343{
160 return 0; 344 return 0;
161} 345}
162#endif
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 94e5cb09110..a665df61f08 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -70,8 +70,6 @@
70 70
71#include <asm/i8259.h> 71#include <asm/i8259.h>
72 72
73int pit_latch_buggy; /* extern */
74
75#include "do_timer.h" 73#include "do_timer.h"
76 74
77unsigned int cpu_khz; /* Detected as we calibrate the TSC */ 75unsigned int cpu_khz; /* Detected as we calibrate the TSC */
diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S
index 2f1814c5cfd..f62815f8d06 100644
--- a/arch/i386/kernel/trampoline.S
+++ b/arch/i386/kernel/trampoline.S
@@ -29,7 +29,7 @@
29 * 29 *
30 * TYPE VALUE 30 * TYPE VALUE
31 * R_386_32 startup_32_smp 31 * R_386_32 startup_32_smp
32 * R_386_32 boot_gdt_table 32 * R_386_32 boot_gdt
33 */ 33 */
34 34
35#include <linux/linkage.h> 35#include <linux/linkage.h>
@@ -62,8 +62,8 @@ r_base = .
62 * to 32 bit. 62 * to 32 bit.
63 */ 63 */
64 64
65 lidtl boot_idt - r_base # load idt with 0, 0 65 lidtl boot_idt_descr - r_base # load idt with 0, 0
66 lgdtl boot_gdt - r_base # load gdt with whatever is appropriate 66 lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
67 67
68 xor %ax, %ax 68 xor %ax, %ax
69 inc %ax # protected mode (PE) bit 69 inc %ax # protected mode (PE) bit
@@ -73,11 +73,11 @@ r_base = .
73 73
74 # These need to be in the same 64K segment as the above; 74 # These need to be in the same 64K segment as the above;
75 # hence we don't use the boot_gdt_descr defined in head.S 75 # hence we don't use the boot_gdt_descr defined in head.S
76boot_gdt: 76boot_gdt_descr:
77 .word __BOOT_DS + 7 # gdt limit 77 .word __BOOT_DS + 7 # gdt limit
78 .long boot_gdt_table-__PAGE_OFFSET # gdt base 78 .long boot_gdt - __PAGE_OFFSET # gdt base
79 79
80boot_idt: 80boot_idt_descr:
81 .word 0 # idt limit = 0 81 .word 0 # idt limit = 0
82 .long 0 # idt base = 0L 82 .long 0 # idt base = 0L
83 83
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index af0d3f70a81..f21b41e7770 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
476 siginfo_t *info) 476 siginfo_t *info)
477{ 477{
478 struct task_struct *tsk = current; 478 struct task_struct *tsk = current;
479 tsk->thread.error_code = error_code;
480 tsk->thread.trap_no = trapnr;
481 479
482 if (regs->eflags & VM_MASK) { 480 if (regs->eflags & VM_MASK) {
483 if (vm86) 481 if (vm86)
@@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
489 goto kernel_trap; 487 goto kernel_trap;
490 488
491 trap_signal: { 489 trap_signal: {
490 /*
491 * We want error_code and trap_no set for userspace faults and
492 * kernelspace faults which result in die(), but not
493 * kernelspace faults which are fixed up. die() gives the
494 * process no chance to handle the signal and notice the
495 * kernel fault information, so that won't result in polluting
496 * the information about previously queued, but not yet
497 * delivered, faults. See also do_general_protection below.
498 */
499 tsk->thread.error_code = error_code;
500 tsk->thread.trap_no = trapnr;
501
492 if (info) 502 if (info)
493 force_sig_info(signr, info, tsk); 503 force_sig_info(signr, info, tsk);
494 else 504 else
@@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
497 } 507 }
498 508
499 kernel_trap: { 509 kernel_trap: {
500 if (!fixup_exception(regs)) 510 if (!fixup_exception(regs)) {
511 tsk->thread.error_code = error_code;
512 tsk->thread.trap_no = trapnr;
501 die(str, regs, error_code); 513 die(str, regs, error_code);
514 }
502 return; 515 return;
503 } 516 }
504 517
@@ -583,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
583 * and we set the offset field correctly. Then we let the CPU to 596 * and we set the offset field correctly. Then we let the CPU to
584 * restart the faulting instruction. 597 * restart the faulting instruction.
585 */ 598 */
586 if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && 599 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
587 thread->io_bitmap_ptr) { 600 thread->io_bitmap_ptr) {
588 memcpy(tss->io_bitmap, thread->io_bitmap_ptr, 601 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
589 thread->io_bitmap_max); 602 thread->io_bitmap_max);
@@ -596,16 +609,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
596 thread->io_bitmap_max, 0xff, 609 thread->io_bitmap_max, 0xff,
597 tss->io_bitmap_max - thread->io_bitmap_max); 610 tss->io_bitmap_max - thread->io_bitmap_max);
598 tss->io_bitmap_max = thread->io_bitmap_max; 611 tss->io_bitmap_max = thread->io_bitmap_max;
599 tss->io_bitmap_base = IO_BITMAP_OFFSET; 612 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
600 tss->io_bitmap_owner = thread; 613 tss->io_bitmap_owner = thread;
601 put_cpu(); 614 put_cpu();
602 return; 615 return;
603 } 616 }
604 put_cpu(); 617 put_cpu();
605 618
606 current->thread.error_code = error_code;
607 current->thread.trap_no = 13;
608
609 if (regs->eflags & VM_MASK) 619 if (regs->eflags & VM_MASK)
610 goto gp_in_vm86; 620 goto gp_in_vm86;
611 621
@@ -624,6 +634,8 @@ gp_in_vm86:
624 634
625gp_in_kernel: 635gp_in_kernel:
626 if (!fixup_exception(regs)) { 636 if (!fixup_exception(regs)) {
637 current->thread.error_code = error_code;
638 current->thread.trap_no = 13;
627 if (notify_die(DIE_GPF, "general protection fault", regs, 639 if (notify_die(DIE_GPF, "general protection fault", regs,
628 error_code, 13, SIGSEGV) == NOTIFY_STOP) 640 error_code, 13, SIGSEGV) == NOTIFY_STOP)
629 return; 641 return;
@@ -1018,9 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1018fastcall unsigned long patch_espfix_desc(unsigned long uesp, 1030fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1019 unsigned long kesp) 1031 unsigned long kesp)
1020{ 1032{
1021 int cpu = smp_processor_id(); 1033 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
1022 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1023 struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1024 unsigned long base = (kesp - uesp) & -THREAD_SIZE; 1034 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1025 unsigned long new_kesp = kesp - base; 1035 unsigned long new_kesp = kesp - base;
1026 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; 1036 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 6cb8f533673..f64b81f3033 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -200,13 +200,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
200{ 200{
201 struct cpufreq_freqs *freq = data; 201 struct cpufreq_freqs *freq = data;
202 202
203 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
204 write_seqlock_irq(&xtime_lock);
205
206 if (!ref_freq) { 203 if (!ref_freq) {
207 if (!freq->old){ 204 if (!freq->old){
208 ref_freq = freq->new; 205 ref_freq = freq->new;
209 goto end; 206 return 0;
210 } 207 }
211 ref_freq = freq->old; 208 ref_freq = freq->old;
212 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; 209 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
@@ -233,13 +230,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
233 * TSC based sched_clock turns 230 * TSC based sched_clock turns
234 * to junk w/ cpufreq 231 * to junk w/ cpufreq
235 */ 232 */
236 mark_tsc_unstable(); 233 mark_tsc_unstable("cpufreq changes");
237 } 234 }
238 } 235 }
239 } 236 }
240end:
241 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
242 write_sequnlock_irq(&xtime_lock);
243 237
244 return 0; 238 return 0;
245} 239}
@@ -281,11 +275,12 @@ static struct clocksource clocksource_tsc = {
281 CLOCK_SOURCE_MUST_VERIFY, 275 CLOCK_SOURCE_MUST_VERIFY,
282}; 276};
283 277
284void mark_tsc_unstable(void) 278void mark_tsc_unstable(char *reason)
285{ 279{
286 if (!tsc_unstable) { 280 if (!tsc_unstable) {
287 tsc_unstable = 1; 281 tsc_unstable = 1;
288 tsc_enabled = 0; 282 tsc_enabled = 0;
283 printk("Marking TSC unstable due to: %s.\n", reason);
289 /* Can be called before registration */ 284 /* Can be called before registration */
290 if (clocksource_tsc.mult) 285 if (clocksource_tsc.mult)
291 clocksource_change_rating(&clocksource_tsc, 0); 286 clocksource_change_rating(&clocksource_tsc, 0);
diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S
new file mode 100644
index 00000000000..e51a8695d54
--- /dev/null
+++ b/arch/i386/kernel/verify_cpu.S
@@ -0,0 +1,65 @@
1/* Check if CPU has some minimum CPUID bits
2 This runs in 16bit mode so that the caller can still use the BIOS
3 to output errors on the screen */
4#include <asm/cpufeature.h>
5
6verify_cpu:
7 pushfl # Save caller passed flags
8 pushl $0 # Kill any dangerous flags
9 popfl
10
11#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4
12 pushfl
13 orl $(1<<18),(%esp) # try setting AC
14 popfl
15 pushfl
16 popl %eax
17 testl $(1<<18),%eax
18 jz bad
19#endif
20#if REQUIRED_MASK1 != 0
21 pushfl # standard way to check for cpuid
22 popl %eax
23 movl %eax,%ebx
24 xorl $0x200000,%eax
25 pushl %eax
26 popfl
27 pushfl
28 popl %eax
29 cmpl %eax,%ebx
30 pushfl # standard way to check for cpuid
31 popl %eax
32 movl %eax,%ebx
33 xorl $0x200000,%eax
34 pushl %eax
35 popfl
36 pushfl
37 popl %eax
38 cmpl %eax,%ebx
39 jz bad # REQUIRED_MASK1 != 0 requires CPUID
40
41 movl $0x0,%eax # See if cpuid 1 is implemented
42 cpuid
43 cmpl $0x1,%eax
44 jb bad # no cpuid 1
45
46 movl $0x1,%eax # Does the cpu have what it takes
47 cpuid
48
49#if CONFIG_X86_MINIMUM_CPU_MODEL > 4
50#error add proper model checking here
51#endif
52
53 andl $REQUIRED_MASK1,%edx
54 xorl $REQUIRED_MASK1,%edx
55 jnz bad
56#endif /* REQUIRED_MASK1 */
57
58 popfl
59 xor %eax,%eax
60 ret
61
62bad:
63 popfl
64 movl $1,%eax
65 ret
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index 697a70e8c0c..c8726c424b3 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -26,6 +26,7 @@
26#include <linux/cpu.h> 26#include <linux/cpu.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/highmem.h>
29#include <asm/vmi.h> 30#include <asm/vmi.h>
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/fixmap.h> 32#include <asm/fixmap.h>
@@ -56,7 +57,7 @@ static int disable_noidle;
56static int disable_vmi_timer; 57static int disable_vmi_timer;
57 58
58/* Cached VMI operations */ 59/* Cached VMI operations */
59struct { 60static struct {
60 void (*cpuid)(void /* non-c */); 61 void (*cpuid)(void /* non-c */);
61 void (*_set_ldt)(u32 selector); 62 void (*_set_ldt)(u32 selector);
62 void (*set_tr)(u32 selector); 63 void (*set_tr)(u32 selector);
@@ -65,16 +66,15 @@ struct {
65 void (*release_page)(u32, u32); 66 void (*release_page)(u32, u32);
66 void (*set_pte)(pte_t, pte_t *, unsigned); 67 void (*set_pte)(pte_t, pte_t *, unsigned);
67 void (*update_pte)(pte_t *, unsigned); 68 void (*update_pte)(pte_t *, unsigned);
68 void (*set_linear_mapping)(int, u32, u32, u32); 69 void (*set_linear_mapping)(int, void *, u32, u32);
69 void (*flush_tlb)(int); 70 void (*_flush_tlb)(int);
70 void (*set_initial_ap_state)(int, int); 71 void (*set_initial_ap_state)(int, int);
71 void (*halt)(void); 72 void (*halt)(void);
72 void (*set_lazy_mode)(int mode); 73 void (*set_lazy_mode)(int mode);
73} vmi_ops; 74} vmi_ops;
74 75
75/* XXX move this to alternative.h */ 76/* Cached VMI operations */
76extern struct paravirt_patch __start_parainstructions[], 77struct vmi_timer_ops vmi_timer_ops;
77 __stop_parainstructions[];
78 78
79/* 79/*
80 * VMI patching routines. 80 * VMI patching routines.
@@ -83,11 +83,6 @@ extern struct paravirt_patch __start_parainstructions[],
83#define MNEM_JMP 0xe9 83#define MNEM_JMP 0xe9
84#define MNEM_RET 0xc3 84#define MNEM_RET 0xc3
85 85
86static char irq_save_disable_callout[] = {
87 MNEM_CALL, 0, 0, 0, 0,
88 MNEM_CALL, 0, 0, 0, 0,
89 MNEM_RET
90};
91#define IRQ_PATCH_INT_MASK 0 86#define IRQ_PATCH_INT_MASK 0
92#define IRQ_PATCH_DISABLE 5 87#define IRQ_PATCH_DISABLE 5
93 88
@@ -135,33 +130,17 @@ static unsigned patch_internal(int call, unsigned len, void *insns)
135static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) 130static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
136{ 131{
137 switch (type) { 132 switch (type) {
138 case PARAVIRT_IRQ_DISABLE: 133 case PARAVIRT_PATCH(irq_disable):
139 return patch_internal(VMI_CALL_DisableInterrupts, len, insns); 134 return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
140 case PARAVIRT_IRQ_ENABLE: 135 case PARAVIRT_PATCH(irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, insns); 136 return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
142 case PARAVIRT_RESTORE_FLAGS: 137 case PARAVIRT_PATCH(restore_fl):
143 return patch_internal(VMI_CALL_SetInterruptMask, len, insns); 138 return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
144 case PARAVIRT_SAVE_FLAGS: 139 case PARAVIRT_PATCH(save_fl):
145 return patch_internal(VMI_CALL_GetInterruptMask, len, insns); 140 return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
146 case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: 141 case PARAVIRT_PATCH(iret):
147 if (len >= 10) {
148 patch_internal(VMI_CALL_GetInterruptMask, len, insns);
149 patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
150 return 10;
151 } else {
152 /*
153 * You bastards didn't leave enough room to
154 * patch save_flags_irq_disable inline. Patch
155 * to a helper
156 */
157 BUG_ON(len < 5);
158 *(char *)insns = MNEM_CALL;
159 patch_offset(insns, irq_save_disable_callout);
160 return 5;
161 }
162 case PARAVIRT_INTERRUPT_RETURN:
163 return patch_internal(VMI_CALL_IRET, len, insns); 142 return patch_internal(VMI_CALL_IRET, len, insns);
164 case PARAVIRT_STI_SYSEXIT: 143 case PARAVIRT_PATCH(irq_enable_sysexit):
165 return patch_internal(VMI_CALL_SYSEXIT, len, insns); 144 return patch_internal(VMI_CALL_SYSEXIT, len, insns);
166 default: 145 default:
167 break; 146 break;
@@ -230,24 +209,24 @@ static void vmi_set_tr(void)
230static void vmi_load_esp0(struct tss_struct *tss, 209static void vmi_load_esp0(struct tss_struct *tss,
231 struct thread_struct *thread) 210 struct thread_struct *thread)
232{ 211{
233 tss->esp0 = thread->esp0; 212 tss->x86_tss.esp0 = thread->esp0;
234 213
235 /* This can only happen when SEP is enabled, no need to test "SEP"arately */ 214 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
236 if (unlikely(tss->ss1 != thread->sysenter_cs)) { 215 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
237 tss->ss1 = thread->sysenter_cs; 216 tss->x86_tss.ss1 = thread->sysenter_cs;
238 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); 217 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
239 } 218 }
240 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); 219 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
241} 220}
242 221
243static void vmi_flush_tlb_user(void) 222static void vmi_flush_tlb_user(void)
244{ 223{
245 vmi_ops.flush_tlb(VMI_FLUSH_TLB); 224 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
246} 225}
247 226
248static void vmi_flush_tlb_kernel(void) 227static void vmi_flush_tlb_kernel(void)
249{ 228{
250 vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); 229 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
251} 230}
252 231
253/* Stub to do nothing at all; used for delays and unimplemented calls */ 232/* Stub to do nothing at all; used for delays and unimplemented calls */
@@ -255,18 +234,6 @@ static void vmi_nop(void)
255{ 234{
256} 235}
257 236
258/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
259static fastcall void vmi_safe_halt(void)
260{
261 int idle = vmi_stop_hz_timer();
262 vmi_ops.halt();
263 if (idle) {
264 local_irq_disable();
265 vmi_account_time_restart_hz_timer();
266 local_irq_enable();
267 }
268}
269
270#ifdef CONFIG_DEBUG_PAGE_TYPE 237#ifdef CONFIG_DEBUG_PAGE_TYPE
271 238
272#ifdef CONFIG_X86_PAE 239#ifdef CONFIG_X86_PAE
@@ -370,8 +337,11 @@ static void vmi_check_page_type(u32 pfn, int type)
370#define vmi_check_page_type(p,t) do { } while (0) 337#define vmi_check_page_type(p,t) do { } while (0)
371#endif 338#endif
372 339
373static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) 340#ifdef CONFIG_HIGHPTE
341static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
374{ 342{
343 void *va = kmap_atomic(page, type);
344
375 /* 345 /*
376 * Internally, the VMI ROM must map virtual addresses to physical 346 * Internally, the VMI ROM must map virtual addresses to physical
377 * addresses for processing MMU updates. By the time MMU updates 347 * addresses for processing MMU updates. By the time MMU updates
@@ -385,8 +355,11 @@ static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn)
385 * args: SLOT VA COUNT PFN 355 * args: SLOT VA COUNT PFN
386 */ 356 */
387 BUG_ON(type != KM_PTE0 && type != KM_PTE1); 357 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
388 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn); 358 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
359
360 return va;
389} 361}
362#endif
390 363
391static void vmi_allocate_pt(u32 pfn) 364static void vmi_allocate_pt(u32 pfn)
392{ 365{
@@ -443,13 +416,13 @@ static void vmi_release_pd(u32 pfn)
443 ((level) | (is_current_as(mm, user) ? \ 416 ((level) | (is_current_as(mm, user) ? \
444 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) 417 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
445 418
446static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) 419static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
447{ 420{
448 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 421 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
449 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 422 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
450} 423}
451 424
452static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) 425static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
453{ 426{
454 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 427 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
455 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); 428 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
@@ -462,7 +435,7 @@ static void vmi_set_pte(pte_t *ptep, pte_t pte)
462 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); 435 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
463} 436}
464 437
465static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) 438static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
466{ 439{
467 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 440 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
468 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 441 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
@@ -516,7 +489,7 @@ static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
516 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 489 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
517} 490}
518 491
519void vmi_pmd_clear(pmd_t *pmd) 492static void vmi_pmd_clear(pmd_t *pmd)
520{ 493{
521 const pte_t pte = { 0 }; 494 const pte_t pte = { 0 };
522 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); 495 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
@@ -525,8 +498,6 @@ void vmi_pmd_clear(pmd_t *pmd)
525#endif 498#endif
526 499
527#ifdef CONFIG_SMP 500#ifdef CONFIG_SMP
528extern void setup_pda(void);
529
530static void __devinit 501static void __devinit
531vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, 502vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
532 unsigned long start_esp) 503 unsigned long start_esp)
@@ -551,13 +522,11 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
551 522
552 ap.ds = __USER_DS; 523 ap.ds = __USER_DS;
553 ap.es = __USER_DS; 524 ap.es = __USER_DS;
554 ap.fs = __KERNEL_PDA; 525 ap.fs = __KERNEL_PERCPU;
555 ap.gs = 0; 526 ap.gs = 0;
556 527
557 ap.eflags = 0; 528 ap.eflags = 0;
558 529
559 setup_pda();
560
561#ifdef CONFIG_X86_PAE 530#ifdef CONFIG_X86_PAE
562 /* efer should match BSP efer. */ 531 /* efer should match BSP efer. */
563 if (cpu_has_nx) { 532 if (cpu_has_nx) {
@@ -575,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
575} 544}
576#endif 545#endif
577 546
578static void vmi_set_lazy_mode(int mode) 547static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
579{ 548{
580 static DEFINE_PER_CPU(int, lazy_mode); 549 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
581 550
582 if (!vmi_ops.set_lazy_mode) 551 if (!vmi_ops.set_lazy_mode)
583 return; 552 return;
@@ -685,7 +654,7 @@ void vmi_bringup(void)
685{ 654{
686 /* We must establish the lowmem mapping for MMU ops to work */ 655 /* We must establish the lowmem mapping for MMU ops to work */
687 if (vmi_ops.set_linear_mapping) 656 if (vmi_ops.set_linear_mapping)
688 vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); 657 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
689} 658}
690 659
691/* 660/*
@@ -740,7 +709,6 @@ do { \
740 } \ 709 } \
741} while (0) 710} while (0)
742 711
743
744/* 712/*
745 * Activate the VMI interface and switch into paravirtualized mode 713 * Activate the VMI interface and switch into paravirtualized mode
746 */ 714 */
@@ -796,12 +764,6 @@ static inline int __init activate_vmi(void)
796 para_fill(irq_disable, DisableInterrupts); 764 para_fill(irq_disable, DisableInterrupts);
797 para_fill(irq_enable, EnableInterrupts); 765 para_fill(irq_enable, EnableInterrupts);
798 766
799 /* irq_save_disable !!! sheer pain */
800 patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
801 (char *)paravirt_ops.save_fl);
802 patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
803 (char *)paravirt_ops.irq_disable);
804
805 para_fill(wbinvd, WBINVD); 767 para_fill(wbinvd, WBINVD);
806 para_fill(read_tsc, RDTSC); 768 para_fill(read_tsc, RDTSC);
807 769
@@ -831,8 +793,8 @@ static inline int __init activate_vmi(void)
831 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); 793 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
832 794
833 /* user and kernel flush are just handled with different flags to FlushTLB */ 795 /* user and kernel flush are just handled with different flags to FlushTLB */
834 para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB); 796 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
835 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB); 797 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
836 para_fill(flush_tlb_single, InvalPage); 798 para_fill(flush_tlb_single, InvalPage);
837 799
838 /* 800 /*
@@ -878,8 +840,13 @@ static inline int __init activate_vmi(void)
878 paravirt_ops.release_pt = vmi_release_pt; 840 paravirt_ops.release_pt = vmi_release_pt;
879 paravirt_ops.release_pd = vmi_release_pd; 841 paravirt_ops.release_pd = vmi_release_pd;
880 } 842 }
881 para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping, 843
882 SetLinearMapping); 844 /* Set linear is needed in all cases */
845 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
846#ifdef CONFIG_HIGHPTE
847 if (vmi_ops.set_linear_mapping)
848 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
849#endif
883 850
884 /* 851 /*
885 * These MUST always be patched. Don't support indirect jumps 852 * These MUST always be patched. Don't support indirect jumps
@@ -920,8 +887,8 @@ static inline int __init activate_vmi(void)
920 paravirt_ops.get_wallclock = vmi_get_wallclock; 887 paravirt_ops.get_wallclock = vmi_get_wallclock;
921 paravirt_ops.set_wallclock = vmi_set_wallclock; 888 paravirt_ops.set_wallclock = vmi_set_wallclock;
922#ifdef CONFIG_X86_LOCAL_APIC 889#ifdef CONFIG_X86_LOCAL_APIC
923 paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; 890 paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
924 paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; 891 paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
925#endif 892#endif
926 paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; 893 paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
927 paravirt_ops.get_cpu_khz = vmi_cpu_khz; 894 paravirt_ops.get_cpu_khz = vmi_cpu_khz;
@@ -933,11 +900,7 @@ static inline int __init activate_vmi(void)
933 disable_vmi_timer = 1; 900 disable_vmi_timer = 1;
934 } 901 }
935 902
936 /* No idle HZ mode only works if VMI timer and no idle is enabled */ 903 para_fill(safe_halt, Halt);
937 if (disable_noidle || disable_vmi_timer)
938 para_fill(safe_halt, Halt);
939 else
940 para_wrap(safe_halt, vmi_safe_halt, halt, Halt);
941 904
942 /* 905 /*
943 * Alternative instruction rewriting doesn't happen soon enough 906 * Alternative instruction rewriting doesn't happen soon enough
@@ -945,7 +908,7 @@ static inline int __init activate_vmi(void)
945 * to do this before IRQs get reenabled. Fortunately, it is 908 * to do this before IRQs get reenabled. Fortunately, it is
946 * idempotent. 909 * idempotent.
947 */ 910 */
948 apply_paravirt(__start_parainstructions, __stop_parainstructions); 911 apply_paravirt(__parainstructions, __parainstructions_end);
949 912
950 vmi_bringup(); 913 vmi_bringup();
951 914
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
new file mode 100644
index 00000000000..26a37f8a876
--- /dev/null
+++ b/arch/i386/kernel/vmiclock.c
@@ -0,0 +1,318 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/arch_hooks.h>
32#include <asm/apicdef.h>
33#include <asm/apic.h>
34#include <asm/timer.h>
35
36#include <irq_vectors.h>
37#include "io_ports.h"
38
39#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
40#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41
42static DEFINE_PER_CPU(struct clock_event_device, local_events);
43
44static inline u32 vmi_counter(u32 flags)
45{
46 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
47 * cycle counter. */
48 return flags & VMI_ALARM_COUNTER_MASK;
49}
50
51/* paravirt_ops.get_wallclock = vmi_get_wallclock */
52unsigned long vmi_get_wallclock(void)
53{
54 unsigned long long wallclock;
55 wallclock = vmi_timer_ops.get_wallclock(); // nsec
56 (void)do_div(wallclock, 1000000000); // sec
57
58 return wallclock;
59}
60
61/* paravirt_ops.set_wallclock = vmi_set_wallclock */
62int vmi_set_wallclock(unsigned long now)
63{
64 return 0;
65}
66
67/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
68unsigned long long vmi_get_sched_cycles(void)
69{
70 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
71}
72
73/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
74unsigned long vmi_cpu_khz(void)
75{
76 unsigned long long khz;
77 khz = vmi_timer_ops.get_cycle_frequency();
78 (void)do_div(khz, 1000);
79 return khz;
80}
81
82static inline unsigned int vmi_get_timer_vector(void)
83{
84#ifdef CONFIG_X86_IO_APIC
85 return FIRST_DEVICE_VECTOR;
86#else
87 return FIRST_EXTERNAL_VECTOR;
88#endif
89}
90
91/** vmi clockchip */
92#ifdef CONFIG_X86_LOCAL_APIC
93static unsigned int startup_timer_irq(unsigned int irq)
94{
95 unsigned long val = apic_read(APIC_LVTT);
96 apic_write(APIC_LVTT, vmi_get_timer_vector());
97
98 return (val & APIC_SEND_PENDING);
99}
100
101static void mask_timer_irq(unsigned int irq)
102{
103 unsigned long val = apic_read(APIC_LVTT);
104 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
105}
106
107static void unmask_timer_irq(unsigned int irq)
108{
109 unsigned long val = apic_read(APIC_LVTT);
110 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
111}
112
113static void ack_timer_irq(unsigned int irq)
114{
115 ack_APIC_irq();
116}
117
118static struct irq_chip vmi_chip __read_mostly = {
119 .name = "VMI-LOCAL",
120 .startup = startup_timer_irq,
121 .mask = mask_timer_irq,
122 .unmask = unmask_timer_irq,
123 .ack = ack_timer_irq
124};
125#endif
126
127/** vmi clockevent */
128#define VMI_ALARM_WIRED_IRQ0 0x00000000
129#define VMI_ALARM_WIRED_LVTT 0x00010000
130static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
131
132static inline int vmi_get_alarm_wiring(void)
133{
134 return vmi_wiring;
135}
136
137static void vmi_timer_set_mode(enum clock_event_mode mode,
138 struct clock_event_device *evt)
139{
140 cycle_t now, cycles_per_hz;
141 BUG_ON(!irqs_disabled());
142
143 switch (mode) {
144 case CLOCK_EVT_MODE_ONESHOT:
145 break;
146 case CLOCK_EVT_MODE_PERIODIC:
147 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
148 (void)do_div(cycles_per_hz, HZ);
149 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
150 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
151 break;
152 case CLOCK_EVT_MODE_UNUSED:
153 case CLOCK_EVT_MODE_SHUTDOWN:
154 switch (evt->mode) {
155 case CLOCK_EVT_MODE_ONESHOT:
156 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
157 break;
158 case CLOCK_EVT_MODE_PERIODIC:
159 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
160 break;
161 default:
162 break;
163 }
164 break;
165 default:
166 break;
167 }
168}
169
170static int vmi_timer_next_event(unsigned long delta,
171 struct clock_event_device *evt)
172{
173 /* Unfortunately, set_next_event interface only passes relative
174 * expiry, but we want absolute expiry. It'd be better if we
175 * were passed an absolute expiry, since a bunch of time may
176 * have been stolen between the time the delta is computed and
177 * when we set the alarm below. */
178 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
179
180 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
181 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
182 return 0;
183}
184
185static struct clock_event_device vmi_clockevent = {
186 .name = "vmi-timer",
187 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
188 .shift = 22,
189 .set_mode = vmi_timer_set_mode,
190 .set_next_event = vmi_timer_next_event,
191 .rating = 1000,
192 .irq = 0,
193};
194
195static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
196{
197 struct clock_event_device *evt = &__get_cpu_var(local_events);
198 evt->event_handler(evt);
199 return IRQ_HANDLED;
200}
201
202static struct irqaction vmi_clock_action = {
203 .name = "vmi-timer",
204 .handler = vmi_timer_interrupt,
205 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
206 .mask = CPU_MASK_ALL,
207};
208
209static void __devinit vmi_time_init_clockevent(void)
210{
211 cycle_t cycles_per_msec;
212 struct clock_event_device *evt;
213
214 int cpu = smp_processor_id();
215 evt = &__get_cpu_var(local_events);
216
217 /* Use cycles_per_msec since div_sc params are 32-bits. */
218 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
219 (void)do_div(cycles_per_msec, 1000);
220
221 memcpy(evt, &vmi_clockevent, sizeof(*evt));
222 /* Must pick .shift such that .mult fits in 32-bits. Choosing
223 * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
224 * before overflow. */
225 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
226 /* Upper bound is clockevent's use of ulong for cycle deltas. */
227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
228 evt->min_delta_ns = clockevent_delta2ns(1, evt);
229 evt->cpumask = cpumask_of_cpu(cpu);
230
231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
232 evt->name, evt->mult, evt->shift);
233 clockevents_register_device(evt);
234}
235
236void __init vmi_time_init(void)
237{
238 /* Disable PIT: BIOSes start PIT CH0 with 18.2hz periodic. */
239 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
240
241 vmi_time_init_clockevent();
242 setup_irq(0, &vmi_clock_action);
243}
244
245#ifdef CONFIG_X86_LOCAL_APIC
246void __devinit vmi_time_bsp_init(void)
247{
248 /*
249 * On APIC systems, we want local timers to fire on each cpu. We do
250 * this by programming LVTT to deliver timer events to the IRQ handler
251 * for IRQ-0, since we can't re-use the APIC local timer handler
252 * without interfering with that code.
253 */
254 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
255 local_irq_disable();
256#ifdef CONFIG_X86_SMP
257 /*
258 * XXX handle_percpu_irq only defined for SMP; we need to switch over
259 * to using it, since this is a local interrupt, which each CPU must
260 * handle individually without locking out or dropping simultaneous
261 * local timers on other CPUs. We also don't want to trigger the
262 * quirk workaround code for interrupts which gets invoked from
263 * handle_percpu_irq via eoi, so we use our own IRQ chip.
264 */
265 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
266#else
267 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
268#endif
269 vmi_wiring = VMI_ALARM_WIRED_LVTT;
270 apic_write(APIC_LVTT, vmi_get_timer_vector());
271 local_irq_enable();
272 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
273}
274
275void __devinit vmi_time_ap_init(void)
276{
277 vmi_time_init_clockevent();
278 apic_write(APIC_LVTT, vmi_get_timer_vector());
279}
280#endif
281
282/** vmi clocksource */
283
284static cycle_t read_real_cycles(void)
285{
286 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
287}
288
289static struct clocksource clocksource_vmi = {
290 .name = "vmi-timer",
291 .rating = 450,
292 .read = read_real_cycles,
293 .mask = CLOCKSOURCE_MASK(64),
294 .mult = 0, /* to be set */
295 .shift = 22,
296 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
297};
298
299static int __init init_vmi_clocksource(void)
300{
301 cycle_t cycles_per_msec;
302
303 if (!vmi_timer_ops.get_cycle_frequency)
304 return 0;
305 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
306 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
307 (void)do_div(cycles_per_msec, 1000);
308
309 /* Note that clocksource.{mult, shift} converts in the opposite direction
310 * as clockevents. */
311 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
312 clocksource_vmi.shift);
313
314 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
315 return clocksource_register(&clocksource_vmi);
316
317}
318module_init(init_vmi_clocksource);
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
deleted file mode 100644
index 9dfb17739b6..00000000000
--- a/arch/i386/kernel/vmitime.c
+++ /dev/null
@@ -1,482 +0,0 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25/*
26 * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
27 * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
28 * See comments there for proper credits.
29 */
30
31#include <linux/spinlock.h>
32#include <linux/init.h>
33#include <linux/errno.h>
34#include <linux/jiffies.h>
35#include <linux/interrupt.h>
36#include <linux/kernel_stat.h>
37#include <linux/rcupdate.h>
38#include <linux/clocksource.h>
39
40#include <asm/timer.h>
41#include <asm/io.h>
42#include <asm/apic.h>
43#include <asm/div64.h>
44#include <asm/timer.h>
45#include <asm/desc.h>
46
47#include <asm/vmi.h>
48#include <asm/vmi_time.h>
49
50#include <mach_timer.h>
51#include <io_ports.h>
52
53#ifdef CONFIG_X86_LOCAL_APIC
54#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
55#else
56#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
57#endif
58
59/* Cached VMI operations */
60struct vmi_timer_ops vmi_timer_ops;
61
62#ifdef CONFIG_NO_IDLE_HZ
63
64/* /proc/sys/kernel/hz_timer state. */
65int sysctl_hz_timer;
66
67/* Some stats */
68static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
69static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
70static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
71
72#endif /* CONFIG_NO_IDLE_HZ */
73
74/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
75static int alarm_hz = CONFIG_VMI_ALARM_HZ;
76
77/* Cache of the value get_cycle_frequency / HZ. */
78static signed long long cycles_per_jiffy;
79
80/* Cache of the value get_cycle_frequency / alarm_hz. */
81static signed long long cycles_per_alarm;
82
83/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
84 * Protected by xtime_lock. */
85static unsigned long long real_cycles_accounted_system;
86
87/* The number of cycles accounted for by update_process_times(), per cpu. */
88static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
89
90/* The number of stolen cycles accounted, per cpu. */
91static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
92
93/* Clock source. */
94static cycle_t read_real_cycles(void)
95{
96 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
97}
98
99static cycle_t read_available_cycles(void)
100{
101 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
102}
103
#if 0
/* Currently compiled out: sample the VMI_CYCLES_STOLEN counter. */
static cycle_t read_stolen_cycles(void)
{
	cycle_t stolen = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);

	return stolen;
}
#endif /* 0 */
110
111static struct clocksource clocksource_vmi = {
112 .name = "vmi-timer",
113 .rating = 450,
114 .read = read_real_cycles,
115 .mask = CLOCKSOURCE_MASK(64),
116 .mult = 0, /* to be set */
117 .shift = 22,
118 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
119};
120
121
122/* Timer interrupt handler. */
123static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
124
125static struct irqaction vmi_timer_irq = {
126 .handler = vmi_timer_interrupt,
127 .flags = IRQF_DISABLED,
128 .mask = CPU_MASK_NONE,
129 .name = "VMI-alarm",
130};
131
132/* Alarm rate */
133static int __init vmi_timer_alarm_rate_setup(char* str)
134{
135 int alarm_rate;
136 if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
137 alarm_hz = alarm_rate;
138 printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
139 }
140 return 1;
141}
142__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
143
144
145/* Initialization */
146static void vmi_get_wallclock_ts(struct timespec *ts)
147{
148 unsigned long long wallclock;
149 wallclock = vmi_timer_ops.get_wallclock(); // nsec units
150 ts->tv_nsec = do_div(wallclock, 1000000000);
151 ts->tv_sec = wallclock;
152}
153
154unsigned long vmi_get_wallclock(void)
155{
156 struct timespec ts;
157 vmi_get_wallclock_ts(&ts);
158 return ts.tv_sec;
159}
160
/*
 * Setting the wallclock is not implemented: @now is ignored and -1 is
 * returned unconditionally.
 */
int vmi_set_wallclock(unsigned long now)
{
	(void)now;

	return -1;
}
165
166unsigned long long vmi_get_sched_cycles(void)
167{
168 return read_available_cycles();
169}
170
171unsigned long vmi_cpu_khz(void)
172{
173 unsigned long long khz;
174
175 khz = vmi_timer_ops.get_cycle_frequency();
176 (void)do_div(khz, 1000);
177 return khz;
178}
179
180void __init vmi_time_init(void)
181{
182 unsigned long long cycles_per_sec, cycles_per_msec;
183 unsigned long flags;
184
185 local_irq_save(flags);
186 setup_irq(0, &vmi_timer_irq);
187#ifdef CONFIG_X86_LOCAL_APIC
188 set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
189#endif
190
191 real_cycles_accounted_system = read_real_cycles();
192 per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
193
194 cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
195 cycles_per_jiffy = cycles_per_sec;
196 (void)do_div(cycles_per_jiffy, HZ);
197 cycles_per_alarm = cycles_per_sec;
198 (void)do_div(cycles_per_alarm, alarm_hz);
199 cycles_per_msec = cycles_per_sec;
200 (void)do_div(cycles_per_msec, 1000);
201
202 printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
203 "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
204 cycles_per_alarm);
205
206 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
207 clocksource_vmi.shift);
208 if (clocksource_register(&clocksource_vmi))
209 printk(KERN_WARNING "Error registering VMITIME clocksource.");
210
211 /* Disable PIT. */
212 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
213
214 /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu
215 * reduce the latency calling update_process_times. */
216 vmi_timer_ops.set_alarm(
217 VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
218 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
219 cycles_per_alarm);
220
221 local_irq_restore(flags);
222}
223
224#ifdef CONFIG_X86_LOCAL_APIC
225
226void __init vmi_timer_setup_boot_alarm(void)
227{
228 local_irq_disable();
229
230 /* Route the interrupt to the correct vector. */
231 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
232
233 /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
234 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
235 vmi_timer_ops.set_alarm(
236 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
237 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
238 cycles_per_alarm);
239 local_irq_enable();
240}
241
242/* Initialize the time accounting variables for an AP on an SMP system.
243 * Also, set the local alarm for the AP. */
244void __devinit vmi_timer_setup_secondary_alarm(void)
245{
246 int cpu = smp_processor_id();
247
248 /* Route the interrupt to the correct vector. */
249 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
250
251 per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
252
253 vmi_timer_ops.set_alarm(
254 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
255 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
256 cycles_per_alarm);
257}
258
259#endif
260
261/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
262static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
263{
264 long long cycles_not_accounted;
265
266 write_seqlock(&xtime_lock);
267
268 cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
269 while (cycles_not_accounted >= cycles_per_jiffy) {
270 /* systems wide jiffies. */
271 do_timer(1);
272
273 cycles_not_accounted -= cycles_per_jiffy;
274 real_cycles_accounted_system += cycles_per_jiffy;
275 }
276
277 write_sequnlock(&xtime_lock);
278}
279
280/* Update per-cpu process times. */
281static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
282 unsigned long long cur_process_times_cycles)
283{
284 long long cycles_not_accounted;
285 cycles_not_accounted = cur_process_times_cycles -
286 per_cpu(process_times_cycles_accounted_cpu, cpu);
287
288 while (cycles_not_accounted >= cycles_per_jiffy) {
289 /* Account time to the current process. This includes
290 * calling into the scheduler to decrement the timeslice
291 * and possibly reschedule.*/
292 update_process_times(user_mode(regs));
293 /* XXX handle /proc/profile multiplier. */
294 profile_tick(CPU_PROFILING);
295
296 cycles_not_accounted -= cycles_per_jiffy;
297 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
298 }
299}
300
301#ifdef CONFIG_NO_IDLE_HZ
302/* Update per-cpu idle times. Used when a no-hz halt is ended. */
303static void vmi_account_no_hz_idle_cycles(int cpu,
304 unsigned long long cur_process_times_cycles)
305{
306 long long cycles_not_accounted;
307 unsigned long no_idle_hz_jiffies = 0;
308
309 cycles_not_accounted = cur_process_times_cycles -
310 per_cpu(process_times_cycles_accounted_cpu, cpu);
311
312 while (cycles_not_accounted >= cycles_per_jiffy) {
313 no_idle_hz_jiffies++;
314 cycles_not_accounted -= cycles_per_jiffy;
315 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
316 }
317 /* Account time to the idle process. */
318 account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
319}
320#endif
321
322/* Update per-cpu stolen time. */
323static void vmi_account_stolen_cycles(int cpu,
324 unsigned long long cur_real_cycles,
325 unsigned long long cur_avail_cycles)
326{
327 long long stolen_cycles_not_accounted;
328 unsigned long stolen_jiffies = 0;
329
330 if (cur_real_cycles < cur_avail_cycles)
331 return;
332
333 stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
334 per_cpu(stolen_cycles_accounted_cpu, cpu);
335
336 while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
337 stolen_jiffies++;
338 stolen_cycles_not_accounted -= cycles_per_jiffy;
339 per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
340 }
341 /* HACK: pass NULL to force time onto cpustat->steal. */
342 account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
343}
344
/*
 * Common body of the timer interrupt, shared by the IRQ0 handler (UP without
 * local APIC) and the local-APIC LVTT handler (UP with local APIC, or SMP).
 * Samples both counters once, then runs the three accounting passes.
 */
static void vmi_local_timer_interrupt(int cpu)
{
	unsigned long long real_now, avail_now;

	real_now = read_real_cycles();
	avail_now = read_available_cycles();

	/* System-wide (real) time state: xtime, jiffies. */
	vmi_account_real_cycles(real_now);
	/* Per-cpu process times. */
	vmi_account_process_times_cycles(get_irq_regs(), cpu, avail_now);
	/* Time stolen from this cpu by the hypervisor. */
	vmi_account_stolen_cycles(cpu, real_now, avail_now);
}
360
361#ifdef CONFIG_NO_IDLE_HZ
362
363/* Must be called only from idle loop, with interrupts disabled. */
364int vmi_stop_hz_timer(void)
365{
366 /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
367
368 unsigned long seq, next;
369 unsigned long long real_cycles_expiry;
370 int cpu = smp_processor_id();
371
372 BUG_ON(!irqs_disabled());
373 if (sysctl_hz_timer != 0)
374 return 0;
375
376 cpu_set(cpu, nohz_cpu_mask);
377 smp_mb();
378
379 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
380 (next = next_timer_interrupt(),
381 time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) {
382 cpu_clear(cpu, nohz_cpu_mask);
383 return 0;
384 }
385
386 /* Convert jiffies to the real cycle counter. */
387 do {
388 seq = read_seqbegin(&xtime_lock);
389 real_cycles_expiry = real_cycles_accounted_system +
390 (long)(next - jiffies) * cycles_per_jiffy;
391 } while (read_seqretry(&xtime_lock, seq));
392
393 /* This cpu is going idle. Disable the periodic alarm. */
394 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
395 per_cpu(idle_start_jiffies, cpu) = jiffies;
396 /* Set the real time alarm to expire at the next event. */
397 vmi_timer_ops.set_alarm(
398 VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
399 real_cycles_expiry, 0);
400 return 1;
401}
402
403static void vmi_reenable_hz_timer(int cpu)
404{
405 /* For /proc/vmi/info idle_hz stat. */
406 per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
407 per_cpu(vmi_idle_no_hz_irqs, cpu)++;
408
409 /* Don't bother explicitly cancelling the one-shot alarm -- at
410 * worse we will receive a spurious timer interrupt. */
411 vmi_timer_ops.set_alarm(
412 VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
413 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
414 cycles_per_alarm);
415 /* Indicate this cpu is no longer nohz idle. */
416 cpu_clear(cpu, nohz_cpu_mask);
417}
418
/*
 * Called from interrupt handlers when the (local) HZ timer is disabled.
 * Accounts the time that passed while it was off -- system time, idle time
 * and stolen time -- then switches the periodic alarm back on.  Interrupts
 * must be disabled.
 */
void vmi_account_time_restart_hz_timer(void)
{
	unsigned long long real_now, avail_now;
	int cpu = smp_processor_id();

	BUG_ON(!irqs_disabled());

	/* Sample both counters once for all accounting passes. */
	real_now = read_real_cycles();
	avail_now = read_available_cycles();

	/* System-wide (real) time state: xtime, jiffies. */
	vmi_account_real_cycles(real_now);
	/* Per-cpu idle times. */
	vmi_account_no_hz_idle_cycles(cpu, avail_now);
	/* Time stolen from this cpu by the hypervisor. */
	vmi_account_stolen_cycles(cpu, real_now, avail_now);

	/* Reenable the hz timer. */
	vmi_reenable_hz_timer(cpu);
}
438
439#endif /* CONFIG_NO_IDLE_HZ */
440
441/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
442 * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after
443 * APIC setup and setup_boot_vmi_alarm() is called. */
444static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
445{
446 vmi_local_timer_interrupt(smp_processor_id());
447 return IRQ_HANDLED;
448}
449
450#ifdef CONFIG_X86_LOCAL_APIC
451
452/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
453 * Also used in UP when CONFIG_X86_LOCAL_APIC.
454 * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
455void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
456{
457 struct pt_regs *old_regs = set_irq_regs(regs);
458 int cpu = smp_processor_id();
459
460 /*
461 * the NMI deadlock-detector uses this.
462 */
463 per_cpu(irq_stat,cpu).apic_timer_irqs++;
464
465 /*
466 * NOTE! We'd better ACK the irq immediately,
467 * because timer handling can be slow.
468 */
469 ack_APIC_irq();
470
471 /*
472 * update_process_times() expects us to have done irq_enter().
473 * Besides, if we don't timer interrupts ignore the global
474 * interrupt lock, which is the WrongThing (tm) to do.
475 */
476 irq_enter();
477 vmi_local_timer_interrupt(cpu);
478 irq_exit();
479 set_irq_regs(old_regs);
480}
481
482#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 6f38f818380..23e8614edee 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -26,12 +26,11 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
26OUTPUT_ARCH(i386) 26OUTPUT_ARCH(i386)
27ENTRY(phys_startup_32) 27ENTRY(phys_startup_32)
28jiffies = jiffies_64; 28jiffies = jiffies_64;
29_proxy_pda = 1;
30 29
31PHDRS { 30PHDRS {
32 text PT_LOAD FLAGS(5); /* R_E */ 31 text PT_LOAD FLAGS(5); /* R_E */
33 data PT_LOAD FLAGS(7); /* RWE */ 32 data PT_LOAD FLAGS(7); /* RWE */
34 note PT_NOTE FLAGS(4); /* R__ */ 33 note PT_NOTE FLAGS(0); /* ___ */
35} 34}
36SECTIONS 35SECTIONS
37{ 36{
@@ -61,8 +60,6 @@ SECTIONS
61 __stop___ex_table = .; 60 __stop___ex_table = .;
62 } 61 }
63 62
64 RODATA
65
66 BUG_TABLE 63 BUG_TABLE
67 64
68 . = ALIGN(4); 65 . = ALIGN(4);
@@ -72,6 +69,8 @@ SECTIONS
72 __tracedata_end = .; 69 __tracedata_end = .;
73 } 70 }
74 71
72 RODATA
73
75 /* writeable */ 74 /* writeable */
76 . = ALIGN(4096); 75 . = ALIGN(4096);
77 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ 76 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
@@ -117,22 +116,11 @@ SECTIONS
117 116
118 /* might get freed after init */ 117 /* might get freed after init */
119 . = ALIGN(4096); 118 . = ALIGN(4096);
120 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
121 __smp_alt_begin = .;
122 __smp_alt_instructions = .;
123 *(.smp_altinstructions)
124 __smp_alt_instructions_end = .;
125 }
126 . = ALIGN(4);
127 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 119 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
128 __smp_locks = .; 120 __smp_locks = .;
129 *(.smp_locks) 121 *(.smp_locks)
130 __smp_locks_end = .; 122 __smp_locks_end = .;
131 } 123 }
132 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
133 *(.smp_altinstr_replacement)
134 __smp_alt_end = .;
135 }
136 /* will be freed after init 124 /* will be freed after init
137 * Following ALIGN() is required to make sure no other data falls on the 125 * Following ALIGN() is required to make sure no other data falls on the
138 * same page where __smp_alt_end is pointing as that page might be freed 126 * same page where __smp_alt_end is pointing as that page might be freed
@@ -178,9 +166,9 @@ SECTIONS
178 } 166 }
179 . = ALIGN(4); 167 . = ALIGN(4);
180 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 168 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
181 __start_parainstructions = .; 169 __parainstructions = .;
182 *(.parainstructions) 170 *(.parainstructions)
183 __stop_parainstructions = .; 171 __parainstructions_end = .;
184 } 172 }
185 /* .exit.text is discard at runtime, not link time, to deal with references 173 /* .exit.text is discard at runtime, not link time, to deal with references
186 from .altinstructions and .eh_frame */ 174 from .altinstructions and .eh_frame */
@@ -194,7 +182,7 @@ SECTIONS
194 __initramfs_end = .; 182 __initramfs_end = .;
195 } 183 }
196#endif 184#endif
197 . = ALIGN(L1_CACHE_BYTES); 185 . = ALIGN(4096);
198 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 186 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
199 __per_cpu_start = .; 187 __per_cpu_start = .;
200 *(.data.percpu) 188 *(.data.percpu)
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index f66cd11adb7..4a8b0ed9b8f 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,7 +7,7 @@
7 7
8SECTIONS 8SECTIONS
9{ 9{
10 . = VDSO_PRELINK + SIZEOF_HEADERS; 10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11 11
12 .hash : { *(.hash) } :text 12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) } 13 .gnu.hash : { *(.gnu.hash) }
@@ -21,7 +21,7 @@ SECTIONS
21 For the layouts to match, we need to skip more than enough 21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount 22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */ 23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK + 0x400; 24 . = VDSO_PRELINK_asm + 0x400;
25 25
26 .text : { *(.text) } :text =0x90909090 26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note 27 .note : { *(.note.*) } :text :note
diff --git a/arch/i386/lib/bitops.c b/arch/i386/lib/bitops.c
index 97db3853dc8..afd0045595d 100644
--- a/arch/i386/lib/bitops.c
+++ b/arch/i386/lib/bitops.c
@@ -43,7 +43,7 @@ EXPORT_SYMBOL(find_next_bit);
43 */ 43 */
44int find_next_zero_bit(const unsigned long *addr, int size, int offset) 44int find_next_zero_bit(const unsigned long *addr, int size, int offset)
45{ 45{
46 unsigned long * p = ((unsigned long *) addr) + (offset >> 5); 46 const unsigned long *p = addr + (offset >> 5);
47 int set = 0, bit = offset & 31, res; 47 int set = 0, bit = offset & 31, res;
48 48
49 if (bit) { 49 if (bit) {
@@ -64,7 +64,7 @@ int find_next_zero_bit(const unsigned long *addr, int size, int offset)
64 /* 64 /*
65 * No zero yet, search remaining full bytes for a zero 65 * No zero yet, search remaining full bytes for a zero
66 */ 66 */
67 res = find_first_zero_bit (p, size - 32 * (p - (unsigned long *) addr)); 67 res = find_first_zero_bit(p, size - 32 * (p - addr));
68 return (offset + set + res); 68 return (offset + set + res);
69} 69}
70EXPORT_SYMBOL(find_next_zero_bit); 70EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/i386/lib/checksum.S b/arch/i386/lib/checksum.S
index 75ffd02654f..adbccd0bbb7 100644
--- a/arch/i386/lib/checksum.S
+++ b/arch/i386/lib/checksum.S
@@ -25,6 +25,8 @@
25 * 2 of the License, or (at your option) any later version. 25 * 2 of the License, or (at your option) any later version.
26 */ 26 */
27 27
28#include <linux/linkage.h>
29#include <asm/dwarf2.h>
28#include <asm/errno.h> 30#include <asm/errno.h>
29 31
30/* 32/*
@@ -36,8 +38,6 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
36 */ 38 */
37 39
38.text 40.text
39.align 4
40.globl csum_partial
41 41
42#ifndef CONFIG_X86_USE_PPRO_CHECKSUM 42#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
43 43
@@ -48,9 +48,14 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
48 * Fortunately, it is easy to convert 2-byte alignment to 4-byte 48 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
49 * alignment for the unrolled loop. 49 * alignment for the unrolled loop.
50 */ 50 */
51csum_partial: 51ENTRY(csum_partial)
52 CFI_STARTPROC
52 pushl %esi 53 pushl %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0
53 pushl %ebx 56 pushl %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0
54 movl 20(%esp),%eax # Function arg: unsigned int sum 59 movl 20(%esp),%eax # Function arg: unsigned int sum
55 movl 16(%esp),%ecx # Function arg: int len 60 movl 16(%esp),%ecx # Function arg: int len
56 movl 12(%esp),%esi # Function arg: unsigned char *buff 61 movl 12(%esp),%esi # Function arg: unsigned char *buff
@@ -128,16 +133,27 @@ csum_partial:
128 roll $8, %eax 133 roll $8, %eax
1298: 1348:
130 popl %ebx 135 popl %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx
131 popl %esi 138 popl %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi
132 ret 141 ret
142 CFI_ENDPROC
143ENDPROC(csum_partial)
133 144
134#else 145#else
135 146
136/* Version for PentiumII/PPro */ 147/* Version for PentiumII/PPro */
137 148
138csum_partial: 149ENTRY(csum_partial)
150 CFI_STARTPROC
139 pushl %esi 151 pushl %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0
140 pushl %ebx 154 pushl %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0
141 movl 20(%esp),%eax # Function arg: unsigned int sum 157 movl 20(%esp),%eax # Function arg: unsigned int sum
142 movl 16(%esp),%ecx # Function arg: int len 158 movl 16(%esp),%ecx # Function arg: int len
143 movl 12(%esp),%esi # Function arg: const unsigned char *buf 159 movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -245,8 +261,14 @@ csum_partial:
245 roll $8, %eax 261 roll $8, %eax
24690: 26290:
247 popl %ebx 263 popl %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx
248 popl %esi 266 popl %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi
249 ret 269 ret
270 CFI_ENDPROC
271ENDPROC(csum_partial)
250 272
251#endif 273#endif
252 274
@@ -278,19 +300,24 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
278 .long 9999b, 6002f ; \ 300 .long 9999b, 6002f ; \
279 .previous 301 .previous
280 302
281.align 4
282.globl csum_partial_copy_generic
283
284#ifndef CONFIG_X86_USE_PPRO_CHECKSUM 303#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
285 304
286#define ARGBASE 16 305#define ARGBASE 16
287#define FP 12 306#define FP 12
288 307
289csum_partial_copy_generic: 308ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC
290 subl $4,%esp 310 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4
291 pushl %edi 312 pushl %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0
292 pushl %esi 315 pushl %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0
293 pushl %ebx 318 pushl %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0
294 movl ARGBASE+16(%esp),%eax # sum 321 movl ARGBASE+16(%esp),%eax # sum
295 movl ARGBASE+12(%esp),%ecx # len 322 movl ARGBASE+12(%esp),%ecx # len
296 movl ARGBASE+4(%esp),%esi # src 323 movl ARGBASE+4(%esp),%esi # src
@@ -400,10 +427,19 @@ DST( movb %cl, (%edi) )
400.previous 427.previous
401 428
402 popl %ebx 429 popl %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx
403 popl %esi 432 popl %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi
404 popl %edi 435 popl %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi
405 popl %ecx # equivalent to addl $4,%esp 438 popl %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
406 ret 440 ret
441 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic)
407 443
408#else 444#else
409 445
@@ -421,10 +457,17 @@ DST( movb %cl, (%edi) )
421 457
422#define ARGBASE 12 458#define ARGBASE 12
423 459
424csum_partial_copy_generic: 460ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC
425 pushl %ebx 462 pushl %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0
426 pushl %edi 465 pushl %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0
427 pushl %esi 468 pushl %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0
428 movl ARGBASE+4(%esp),%esi #src 471 movl ARGBASE+4(%esp),%esi #src
429 movl ARGBASE+8(%esp),%edi #dst 472 movl ARGBASE+8(%esp),%edi #dst
430 movl ARGBASE+12(%esp),%ecx #len 473 movl ARGBASE+12(%esp),%ecx #len
@@ -485,9 +528,17 @@ DST( movb %dl, (%edi) )
485.previous 528.previous
486 529
487 popl %esi 530 popl %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi
488 popl %edi 533 popl %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi
489 popl %ebx 536 popl %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx
490 ret 539 ret
540 CFI_ENDPROC
541ENDPROC(csum_partial_copy_generic)
491 542
492#undef ROUND 543#undef ROUND
493#undef ROUND1 544#undef ROUND1
diff --git a/arch/i386/lib/getuser.S b/arch/i386/lib/getuser.S
index 62d7f178a32..6d84b53f12a 100644
--- a/arch/i386/lib/getuser.S
+++ b/arch/i386/lib/getuser.S
@@ -8,6 +8,8 @@
8 * return an error value in addition to the "real" 8 * return an error value in addition to the "real"
9 * return value. 9 * return value.
10 */ 10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
11#include <asm/thread_info.h> 13#include <asm/thread_info.h>
12 14
13 15
@@ -24,19 +26,19 @@
24 */ 26 */
25 27
26.text 28.text
27.align 4 29ENTRY(__get_user_1)
28.globl __get_user_1 30 CFI_STARTPROC
29__get_user_1:
30 GET_THREAD_INFO(%edx) 31 GET_THREAD_INFO(%edx)
31 cmpl TI_addr_limit(%edx),%eax 32 cmpl TI_addr_limit(%edx),%eax
32 jae bad_get_user 33 jae bad_get_user
331: movzbl (%eax),%edx 341: movzbl (%eax),%edx
34 xorl %eax,%eax 35 xorl %eax,%eax
35 ret 36 ret
37 CFI_ENDPROC
38ENDPROC(__get_user_1)
36 39
37.align 4 40ENTRY(__get_user_2)
38.globl __get_user_2 41 CFI_STARTPROC
39__get_user_2:
40 addl $1,%eax 42 addl $1,%eax
41 jc bad_get_user 43 jc bad_get_user
42 GET_THREAD_INFO(%edx) 44 GET_THREAD_INFO(%edx)
@@ -45,10 +47,11 @@ __get_user_2:
452: movzwl -1(%eax),%edx 472: movzwl -1(%eax),%edx
46 xorl %eax,%eax 48 xorl %eax,%eax
47 ret 49 ret
50 CFI_ENDPROC
51ENDPROC(__get_user_2)
48 52
49.align 4 53ENTRY(__get_user_4)
50.globl __get_user_4 54 CFI_STARTPROC
51__get_user_4:
52 addl $3,%eax 55 addl $3,%eax
53 jc bad_get_user 56 jc bad_get_user
54 GET_THREAD_INFO(%edx) 57 GET_THREAD_INFO(%edx)
@@ -57,11 +60,16 @@ __get_user_4:
573: movl -3(%eax),%edx 603: movl -3(%eax),%edx
58 xorl %eax,%eax 61 xorl %eax,%eax
59 ret 62 ret
63 CFI_ENDPROC
64ENDPROC(__get_user_4)
60 65
61bad_get_user: 66bad_get_user:
67 CFI_STARTPROC
62 xorl %edx,%edx 68 xorl %edx,%edx
63 movl $-14,%eax 69 movl $-14,%eax
64 ret 70 ret
71 CFI_ENDPROC
72END(bad_get_user)
65 73
66.section __ex_table,"a" 74.section __ex_table,"a"
67 .long 1b,bad_get_user 75 .long 1b,bad_get_user
diff --git a/arch/i386/lib/putuser.S b/arch/i386/lib/putuser.S
index a32d9f570f4..f58fba109d1 100644
--- a/arch/i386/lib/putuser.S
+++ b/arch/i386/lib/putuser.S
@@ -8,6 +8,8 @@
8 * return an error value in addition to the "real" 8 * return an error value in addition to the "real"
9 * return value. 9 * return value.
10 */ 10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
11#include <asm/thread_info.h> 13#include <asm/thread_info.h>
12 14
13 15
@@ -23,23 +25,28 @@
23 * as they get called from within inline assembly. 25 * as they get called from within inline assembly.
24 */ 26 */
25 27
26#define ENTER pushl %ebx ; GET_THREAD_INFO(%ebx) 28#define ENTER CFI_STARTPROC ; \
27#define EXIT popl %ebx ; ret 29 pushl %ebx ; \
30 CFI_ADJUST_CFA_OFFSET 4 ; \
31 CFI_REL_OFFSET ebx, 0 ; \
32 GET_THREAD_INFO(%ebx)
33#define EXIT popl %ebx ; \
34 CFI_ADJUST_CFA_OFFSET -4 ; \
35 CFI_RESTORE ebx ; \
36 ret ; \
37 CFI_ENDPROC
28 38
29.text 39.text
30.align 4 40ENTRY(__put_user_1)
31.globl __put_user_1
32__put_user_1:
33 ENTER 41 ENTER
34 cmpl TI_addr_limit(%ebx),%ecx 42 cmpl TI_addr_limit(%ebx),%ecx
35 jae bad_put_user 43 jae bad_put_user
361: movb %al,(%ecx) 441: movb %al,(%ecx)
37 xorl %eax,%eax 45 xorl %eax,%eax
38 EXIT 46 EXIT
47ENDPROC(__put_user_1)
39 48
40.align 4 49ENTRY(__put_user_2)
41.globl __put_user_2
42__put_user_2:
43 ENTER 50 ENTER
44 movl TI_addr_limit(%ebx),%ebx 51 movl TI_addr_limit(%ebx),%ebx
45 subl $1,%ebx 52 subl $1,%ebx
@@ -48,10 +55,9 @@ __put_user_2:
482: movw %ax,(%ecx) 552: movw %ax,(%ecx)
49 xorl %eax,%eax 56 xorl %eax,%eax
50 EXIT 57 EXIT
58ENDPROC(__put_user_2)
51 59
52.align 4 60ENTRY(__put_user_4)
53.globl __put_user_4
54__put_user_4:
55 ENTER 61 ENTER
56 movl TI_addr_limit(%ebx),%ebx 62 movl TI_addr_limit(%ebx),%ebx
57 subl $3,%ebx 63 subl $3,%ebx
@@ -60,10 +66,9 @@ __put_user_4:
603: movl %eax,(%ecx) 663: movl %eax,(%ecx)
61 xorl %eax,%eax 67 xorl %eax,%eax
62 EXIT 68 EXIT
69ENDPROC(__put_user_4)
63 70
64.align 4 71ENTRY(__put_user_8)
65.globl __put_user_8
66__put_user_8:
67 ENTER 72 ENTER
68 movl TI_addr_limit(%ebx),%ebx 73 movl TI_addr_limit(%ebx),%ebx
69 subl $7,%ebx 74 subl $7,%ebx
@@ -73,10 +78,16 @@ __put_user_8:
735: movl %edx,4(%ecx) 785: movl %edx,4(%ecx)
74 xorl %eax,%eax 79 xorl %eax,%eax
75 EXIT 80 EXIT
81ENDPROC(__put_user_8)
76 82
77bad_put_user: 83bad_put_user:
84 CFI_STARTPROC simple
85 CFI_DEF_CFA esp, 2*4
86 CFI_OFFSET eip, -1*4
87 CFI_OFFSET ebx, -2*4
78 movl $-14,%eax 88 movl $-14,%eax
79 EXIT 89 EXIT
90END(bad_put_user)
80 91
81.section __ex_table,"a" 92.section __ex_table,"a"
82 .long 1b,bad_put_user 93 .long 1b,bad_put_user
diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index 086b3726862..9f38b12b4af 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -716,7 +716,6 @@ do { \
716unsigned long __copy_to_user_ll(void __user *to, const void *from, 716unsigned long __copy_to_user_ll(void __user *to, const void *from,
717 unsigned long n) 717 unsigned long n)
718{ 718{
719 BUG_ON((long) n < 0);
720#ifndef CONFIG_X86_WP_WORKS_OK 719#ifndef CONFIG_X86_WP_WORKS_OK
721 if (unlikely(boot_cpu_data.wp_works_ok == 0) && 720 if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
722 ((unsigned long )to) < TASK_SIZE) { 721 ((unsigned long )to) < TASK_SIZE) {
@@ -785,7 +784,6 @@ EXPORT_SYMBOL(__copy_to_user_ll);
785unsigned long __copy_from_user_ll(void *to, const void __user *from, 784unsigned long __copy_from_user_ll(void *to, const void __user *from,
786 unsigned long n) 785 unsigned long n)
787{ 786{
788 BUG_ON((long)n < 0);
789 if (movsl_is_ok(to, from, n)) 787 if (movsl_is_ok(to, from, n))
790 __copy_user_zeroing(to, from, n); 788 __copy_user_zeroing(to, from, n);
791 else 789 else
@@ -797,7 +795,6 @@ EXPORT_SYMBOL(__copy_from_user_ll);
797unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, 795unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
798 unsigned long n) 796 unsigned long n)
799{ 797{
800 BUG_ON((long)n < 0);
801 if (movsl_is_ok(to, from, n)) 798 if (movsl_is_ok(to, from, n))
802 __copy_user(to, from, n); 799 __copy_user(to, from, n);
803 else 800 else
@@ -810,7 +807,6 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero);
810unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, 807unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
811 unsigned long n) 808 unsigned long n)
812{ 809{
813 BUG_ON((long)n < 0);
814#ifdef CONFIG_X86_INTEL_USERCOPY 810#ifdef CONFIG_X86_INTEL_USERCOPY
815 if ( n > 64 && cpu_has_xmm2) 811 if ( n > 64 && cpu_has_xmm2)
816 n = __copy_user_zeroing_intel_nocache(to, from, n); 812 n = __copy_user_zeroing_intel_nocache(to, from, n);
@@ -825,7 +821,6 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
825unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, 821unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
826 unsigned long n) 822 unsigned long n)
827{ 823{
828 BUG_ON((long)n < 0);
829#ifdef CONFIG_X86_INTEL_USERCOPY 824#ifdef CONFIG_X86_INTEL_USERCOPY
830 if ( n > 64 && cpu_has_xmm2) 825 if ( n > 64 && cpu_has_xmm2)
831 n = __copy_user_intel_nocache(to, from, n); 826 n = __copy_user_intel_nocache(to, from, n);
@@ -853,7 +848,6 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
853unsigned long 848unsigned long
854copy_to_user(void __user *to, const void *from, unsigned long n) 849copy_to_user(void __user *to, const void *from, unsigned long n)
855{ 850{
856 BUG_ON((long) n < 0);
857 if (access_ok(VERIFY_WRITE, to, n)) 851 if (access_ok(VERIFY_WRITE, to, n))
858 n = __copy_to_user(to, from, n); 852 n = __copy_to_user(to, from, n);
859 return n; 853 return n;
@@ -879,7 +873,6 @@ EXPORT_SYMBOL(copy_to_user);
879unsigned long 873unsigned long
880copy_from_user(void *to, const void __user *from, unsigned long n) 874copy_from_user(void *to, const void __user *from, unsigned long n)
881{ 875{
882 BUG_ON((long) n < 0);
883 if (access_ok(VERIFY_READ, from, n)) 876 if (access_ok(VERIFY_READ, from, n))
884 n = __copy_from_user(to, from, n); 877 n = __copy_from_user(to, from, n);
885 else 878 else
diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c
index 8a210fa915b..e932d3485ae 100644
--- a/arch/i386/mach-generic/bigsmp.c
+++ b/arch/i386/mach-generic/bigsmp.c
@@ -45,7 +45,7 @@ static struct dmi_system_id __initdata bigsmp_dmi_table[] = {
45}; 45};
46 46
47 47
48static int probe_bigsmp(void) 48static int __init probe_bigsmp(void)
49{ 49{
50 if (def_to_bigsmp) 50 if (def_to_bigsmp)
51 dmi_bigsmp = 1; 51 dmi_bigsmp = 1;
diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c
index b8963a5a3b2..b47f951c0ec 100644
--- a/arch/i386/mach-generic/es7000.c
+++ b/arch/i386/mach-generic/es7000.c
@@ -25,4 +25,45 @@ static int probe_es7000(void)
25 return 0; 25 return 0;
26} 26}
27 27
28extern void es7000_sw_apic(void);
29static void __init enable_apic_mode(void)
30{
31 es7000_sw_apic();
32 return;
33}
34
35static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
36 char *productid)
37{
38 if (mpc->mpc_oemptr) {
39 struct mp_config_oemtable *oem_table =
40 (struct mp_config_oemtable *)mpc->mpc_oemptr;
41 if (!strncmp(oem, "UNISYS", 6))
42 return parse_unisys_oem((char *)oem_table);
43 }
44 return 0;
45}
46
47#ifdef CONFIG_ACPI
48/* Hook from generic ACPI tables.c */
49static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{
51 unsigned long oem_addr;
52 if (!find_unisys_acpi_oem_table(&oem_addr)) {
53 if (es7000_check_dsdt())
54 return parse_unisys_oem((char *)oem_addr);
55 else {
56 setup_unisys();
57 return 1;
58 }
59 }
60 return 0;
61}
62#else
63static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
64{
65 return 0;
66}
67#endif
68
28struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000); 69struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index fe0ed393294..1a5e448a29c 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -573,15 +573,7 @@ do_boot_cpu(__u8 cpu)
573 /* init_tasks (in sched.c) is indexed logically */ 573 /* init_tasks (in sched.c) is indexed logically */
574 stack_start.esp = (void *) idle->thread.esp; 574 stack_start.esp = (void *) idle->thread.esp;
575 575
576 /* Pre-allocate and initialize the CPU's GDT and PDA so it 576 init_gdt(cpu, idle);
577 doesn't have to do any memory allocation during the
578 delicate CPU-bringup phase. */
579 if (!init_gdt(cpu, idle)) {
580 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
581 cpucount--;
582 return;
583 }
584
585 irq_ctx_init(cpu); 577 irq_ctx_init(cpu);
586 578
587 /* Note: Don't modify initial ss override */ 579 /* Note: Don't modify initial ss override */
@@ -749,12 +741,6 @@ initialize_secondary(void)
749#endif 741#endif
750 742
751 /* 743 /*
752 * switch to the per CPU GDT we already set up
753 * in do_boot_cpu()
754 */
755 cpu_set_gdt(current_thread_info()->cpu);
756
757 /*
758 * We don't actually need to load the full TSS, 744 * We don't actually need to load the full TSS,
759 * basically just the stack pointer and the eip. 745 * basically just the stack pointer and the eip.
760 */ 746 */
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index b8c4e259fc8..f534c29e80b 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -20,6 +20,7 @@
20#include <linux/tty.h> 20#include <linux/tty.h>
21#include <linux/vt_kern.h> /* For unblank_screen() */ 21#include <linux/vt_kern.h> /* For unblank_screen() */
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/kprobes.h> 25#include <linux/kprobes.h>
25#include <linux/uaccess.h> 26#include <linux/uaccess.h>
@@ -301,7 +302,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
301 struct mm_struct *mm; 302 struct mm_struct *mm;
302 struct vm_area_struct * vma; 303 struct vm_area_struct * vma;
303 unsigned long address; 304 unsigned long address;
304 unsigned long page;
305 int write, si_code; 305 int write, si_code;
306 306
307 /* get the address */ 307 /* get the address */
@@ -510,7 +510,9 @@ no_context:
510 bust_spinlocks(1); 510 bust_spinlocks(1);
511 511
512 if (oops_may_print()) { 512 if (oops_may_print()) {
513 #ifdef CONFIG_X86_PAE 513 __typeof__(pte_val(__pte(0))) page;
514
515#ifdef CONFIG_X86_PAE
514 if (error_code & 16) { 516 if (error_code & 16) {
515 pte_t *pte = lookup_address(address); 517 pte_t *pte = lookup_address(address);
516 518
@@ -519,7 +521,7 @@ no_context:
519 "NX-protected page - exploit attempt? " 521 "NX-protected page - exploit attempt? "
520 "(uid: %d)\n", current->uid); 522 "(uid: %d)\n", current->uid);
521 } 523 }
522 #endif 524#endif
523 if (address < PAGE_SIZE) 525 if (address < PAGE_SIZE)
524 printk(KERN_ALERT "BUG: unable to handle kernel NULL " 526 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
525 "pointer dereference"); 527 "pointer dereference");
@@ -529,25 +531,38 @@ no_context:
529 printk(" at virtual address %08lx\n",address); 531 printk(" at virtual address %08lx\n",address);
530 printk(KERN_ALERT " printing eip:\n"); 532 printk(KERN_ALERT " printing eip:\n");
531 printk("%08lx\n", regs->eip); 533 printk("%08lx\n", regs->eip);
532 } 534
533 page = read_cr3(); 535 page = read_cr3();
534 page = ((unsigned long *) __va(page))[address >> 22]; 536 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
535 if (oops_may_print()) 537#ifdef CONFIG_X86_PAE
538 printk(KERN_ALERT "*pdpt = %016Lx\n", page);
539 if ((page >> PAGE_SHIFT) < max_low_pfn
540 && page & _PAGE_PRESENT) {
541 page &= PAGE_MASK;
542 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
543 & (PTRS_PER_PMD - 1)];
544 printk(KERN_ALERT "*pde = %016Lx\n", page);
545 page &= ~_PAGE_NX;
546 }
547#else
536 printk(KERN_ALERT "*pde = %08lx\n", page); 548 printk(KERN_ALERT "*pde = %08lx\n", page);
537 /*
538 * We must not directly access the pte in the highpte
539 * case, the page table might be allocated in highmem.
540 * And lets rather not kmap-atomic the pte, just in case
541 * it's allocated already.
542 */
543#ifndef CONFIG_HIGHPTE
544 if ((page & 1) && oops_may_print()) {
545 page &= PAGE_MASK;
546 address &= 0x003ff000;
547 page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
548 printk(KERN_ALERT "*pte = %08lx\n", page);
549 }
550#endif 549#endif
550
551 /*
552 * We must not directly access the pte in the highpte
553 * case if the page table is located in highmem.
554 * And let's rather not kmap-atomic the pte, just in case
555 * it's allocated already.
556 */
557 if ((page >> PAGE_SHIFT) < max_low_pfn
558 && (page & _PAGE_PRESENT)) {
559 page &= PAGE_MASK;
560 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
561 & (PTRS_PER_PTE - 1)];
562 printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
563 }
564 }
565
551 tsk->thread.cr2 = address; 566 tsk->thread.cr2 = address;
552 tsk->thread.trap_no = 14; 567 tsk->thread.trap_no = 14;
553 tsk->thread.error_code = error_code; 568 tsk->thread.error_code = error_code;
@@ -588,7 +603,6 @@ do_sigbus:
588 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 603 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
589} 604}
590 605
591#ifndef CONFIG_X86_PAE
592void vmalloc_sync_all(void) 606void vmalloc_sync_all(void)
593{ 607{
594 /* 608 /*
@@ -601,6 +615,9 @@ void vmalloc_sync_all(void)
601 static unsigned long start = TASK_SIZE; 615 static unsigned long start = TASK_SIZE;
602 unsigned long address; 616 unsigned long address;
603 617
618 if (SHARED_KERNEL_PMD)
619 return;
620
604 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 621 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
605 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 622 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
606 if (!test_bit(pgd_index(address), insync)) { 623 if (!test_bit(pgd_index(address), insync)) {
@@ -623,4 +640,3 @@ void vmalloc_sync_all(void)
623 start = address + PGDIR_SIZE; 640 start = address + PGDIR_SIZE;
624 } 641 }
625} 642}
626#endif
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index ac70d09df7e..ad8d86cc683 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -26,7 +26,7 @@ void kunmap(struct page *page)
26 * However when holding an atomic kmap is is not legal to sleep, so atomic 26 * However when holding an atomic kmap is is not legal to sleep, so atomic
27 * kmaps are appropriate for short, tight code paths only. 27 * kmaps are appropriate for short, tight code paths only.
28 */ 28 */
29void *kmap_atomic(struct page *page, enum km_type type) 29void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
30{ 30{
31 enum fixed_addresses idx; 31 enum fixed_addresses idx;
32 unsigned long vaddr; 32 unsigned long vaddr;
@@ -41,12 +41,17 @@ void *kmap_atomic(struct page *page, enum km_type type)
41 return page_address(page); 41 return page_address(page);
42 42
43 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 43 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
44 set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); 44 set_pte(kmap_pte-idx, mk_pte(page, prot));
45 arch_flush_lazy_mmu_mode(); 45 arch_flush_lazy_mmu_mode();
46 46
47 return (void*) vaddr; 47 return (void*) vaddr;
48} 48}
49 49
50void *kmap_atomic(struct page *page, enum km_type type)
51{
52 return kmap_atomic_prot(page, type, kmap_prot);
53}
54
50void kunmap_atomic(void *kvaddr, enum km_type type) 55void kunmap_atomic(void *kvaddr, enum km_type type)
51{ 56{
52 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 57 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -67,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
67#endif 72#endif
68 } 73 }
69 74
75 arch_flush_lazy_mmu_mode();
70 pagefault_enable(); 76 pagefault_enable();
71} 77}
72 78
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index ae436882af7..dbe16f63a56 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/pfn.h>
25#include <linux/poison.h> 26#include <linux/poison.h>
26#include <linux/bootmem.h> 27#include <linux/bootmem.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
@@ -42,6 +43,7 @@
42#include <asm/tlb.h> 43#include <asm/tlb.h>
43#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
44#include <asm/sections.h> 45#include <asm/sections.h>
46#include <asm/paravirt.h>
45 47
46unsigned int __VMALLOC_RESERVE = 128 << 20; 48unsigned int __VMALLOC_RESERVE = 128 << 20;
47 49
@@ -61,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
61 pmd_t *pmd_table; 63 pmd_t *pmd_table;
62 64
63#ifdef CONFIG_X86_PAE 65#ifdef CONFIG_X86_PAE
64 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 66 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
65 paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); 67 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
66 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 68
67 pud = pud_offset(pgd, 0); 69 paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
68 if (pmd_table != pmd_offset(pud, 0)) 70 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
69 BUG(); 71 pud = pud_offset(pgd, 0);
70#else 72 if (pmd_table != pmd_offset(pud, 0))
73 BUG();
74 }
75#endif
71 pud = pud_offset(pgd, 0); 76 pud = pud_offset(pgd, 0);
72 pmd_table = pmd_offset(pud, 0); 77 pmd_table = pmd_offset(pud, 0);
73#endif
74
75 return pmd_table; 78 return pmd_table;
76} 79}
77 80
@@ -81,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
81 */ 84 */
82static pte_t * __init one_page_table_init(pmd_t *pmd) 85static pte_t * __init one_page_table_init(pmd_t *pmd)
83{ 86{
84 if (pmd_none(*pmd)) { 87 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
85 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); 88 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
89
86 paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); 90 paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
87 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 91 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
88 if (page_table != pte_offset_kernel(pmd, 0)) 92 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
89 BUG();
90
91 return page_table;
92 } 93 }
93 94
94 return pte_offset_kernel(pmd, 0); 95 return pte_offset_kernel(pmd, 0);
@@ -108,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
108static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) 109static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
109{ 110{
110 pgd_t *pgd; 111 pgd_t *pgd;
111 pud_t *pud;
112 pmd_t *pmd; 112 pmd_t *pmd;
113 int pgd_idx, pmd_idx; 113 int pgd_idx, pmd_idx;
114 unsigned long vaddr; 114 unsigned long vaddr;
@@ -119,13 +119,10 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
119 pgd = pgd_base + pgd_idx; 119 pgd = pgd_base + pgd_idx;
120 120
121 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { 121 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
122 if (pgd_none(*pgd)) 122 pmd = one_md_table_init(pgd);
123 one_md_table_init(pgd); 123 pmd = pmd + pmd_index(vaddr);
124 pud = pud_offset(pgd, vaddr);
125 pmd = pmd_offset(pud, vaddr);
126 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { 124 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
127 if (pmd_none(*pmd)) 125 one_page_table_init(pmd);
128 one_page_table_init(pmd);
129 126
130 vaddr += PMD_SIZE; 127 vaddr += PMD_SIZE;
131 } 128 }
@@ -167,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
167 /* Map with big pages if possible, otherwise create normal page tables. */ 164 /* Map with big pages if possible, otherwise create normal page tables. */
168 if (cpu_has_pse) { 165 if (cpu_has_pse) {
169 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; 166 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
170
171 if (is_kernel_text(address) || is_kernel_text(address2)) 167 if (is_kernel_text(address) || is_kernel_text(address2))
172 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); 168 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
173 else 169 else
174 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); 170 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
171
175 pfn += PTRS_PER_PTE; 172 pfn += PTRS_PER_PTE;
176 } else { 173 } else {
177 pte = one_page_table_init(pmd); 174 pte = one_page_table_init(pmd);
178 175
179 for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { 176 for (pte_ofs = 0;
180 if (is_kernel_text(address)) 177 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
181 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); 178 pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
182 else 179 if (is_kernel_text(address))
183 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); 180 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
181 else
182 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
184 } 183 }
185 } 184 }
186 } 185 }
@@ -337,24 +336,78 @@ extern void __init remap_numa_kva(void);
337#define remap_numa_kva() do {} while (0) 336#define remap_numa_kva() do {} while (0)
338#endif 337#endif
339 338
340static void __init pagetable_init (void) 339void __init native_pagetable_setup_start(pgd_t *base)
341{ 340{
342 unsigned long vaddr;
343 pgd_t *pgd_base = swapper_pg_dir;
344
345#ifdef CONFIG_X86_PAE 341#ifdef CONFIG_X86_PAE
346 int i; 342 int i;
347 /* Init entries of the first-level page table to the zero page */ 343
348 for (i = 0; i < PTRS_PER_PGD; i++) 344 /*
349 set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); 345 * Init entries of the first-level page table to the
346 * zero page, if they haven't already been set up.
347 *
348 * In a normal native boot, we'll be running on a
349 * pagetable rooted in swapper_pg_dir, but not in PAE
350 * mode, so this will end up clobbering the mappings
351 * for the lower 24Mbytes of the address space,
352 * without affecting the kernel address space.
353 */
354 for (i = 0; i < USER_PTRS_PER_PGD; i++)
355 set_pgd(&base[i],
356 __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
357
358 /* Make sure kernel address space is empty so that a pagetable
359 will be allocated for it. */
360 memset(&base[USER_PTRS_PER_PGD], 0,
361 KERNEL_PGD_PTRS * sizeof(pgd_t));
350#else 362#else
351 paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); 363 paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
352#endif 364#endif
365}
366
367void __init native_pagetable_setup_done(pgd_t *base)
368{
369#ifdef CONFIG_X86_PAE
370 /*
371 * Add low memory identity-mappings - SMP needs it when
372 * starting up on an AP from real-mode. In the non-PAE
373 * case we already have these mappings through head.S.
374 * All user-space mappings are explicitly cleared after
375 * SMP startup.
376 */
377 set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
378#endif
379}
380
381/*
382 * Build a proper pagetable for the kernel mappings. Up until this
383 * point, we've been running on some set of pagetables constructed by
384 * the boot process.
385 *
386 * If we're booting on native hardware, this will be a pagetable
387 * constructed in arch/i386/kernel/head.S, and not running in PAE mode
388 * (even if we'll end up running in PAE). The root of the pagetable
389 * will be swapper_pg_dir.
390 *
391 * If we're booting paravirtualized under a hypervisor, then there are
392 * more options: we may already be running PAE, and the pagetable may
393 * or may not be based in swapper_pg_dir. In any case,
394 * paravirt_pagetable_setup_start() will set up swapper_pg_dir
395 * appropriately for the rest of the initialization to work.
396 *
397 * In general, pagetable_init() assumes that the pagetable may already
398 * be partially populated, and so it avoids stomping on any existing
399 * mappings.
400 */
401static void __init pagetable_init (void)
402{
403 unsigned long vaddr, end;
404 pgd_t *pgd_base = swapper_pg_dir;
405
406 paravirt_pagetable_setup_start(pgd_base);
353 407
354 /* Enable PSE if available */ 408 /* Enable PSE if available */
355 if (cpu_has_pse) { 409 if (cpu_has_pse)
356 set_in_cr4(X86_CR4_PSE); 410 set_in_cr4(X86_CR4_PSE);
357 }
358 411
359 /* Enable PGE if available */ 412 /* Enable PGE if available */
360 if (cpu_has_pge) { 413 if (cpu_has_pge) {
@@ -371,20 +424,12 @@ static void __init pagetable_init (void)
371 * created - mappings will be set by set_fixmap(): 424 * created - mappings will be set by set_fixmap():
372 */ 425 */
373 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; 426 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
374 page_table_range_init(vaddr, 0, pgd_base); 427 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
428 page_table_range_init(vaddr, end, pgd_base);
375 429
376 permanent_kmaps_init(pgd_base); 430 permanent_kmaps_init(pgd_base);
377 431
378#ifdef CONFIG_X86_PAE 432 paravirt_pagetable_setup_done(pgd_base);
379 /*
380 * Add low memory identity-mappings - SMP needs it when
381 * starting up on an AP from real-mode. In the non-PAE
382 * case we already have these mappings through head.S.
383 * All user-space mappings are explicitly cleared after
384 * SMP startup.
385 */
386 set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
387#endif
388} 433}
389 434
390#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) 435#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
@@ -700,6 +745,8 @@ struct kmem_cache *pmd_cache;
700 745
701void __init pgtable_cache_init(void) 746void __init pgtable_cache_init(void)
702{ 747{
748 size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
749
703 if (PTRS_PER_PMD > 1) { 750 if (PTRS_PER_PMD > 1) {
704 pmd_cache = kmem_cache_create("pmd", 751 pmd_cache = kmem_cache_create("pmd",
705 PTRS_PER_PMD*sizeof(pmd_t), 752 PTRS_PER_PMD*sizeof(pmd_t),
@@ -709,13 +756,23 @@ void __init pgtable_cache_init(void)
709 NULL); 756 NULL);
710 if (!pmd_cache) 757 if (!pmd_cache)
711 panic("pgtable_cache_init(): cannot create pmd cache"); 758 panic("pgtable_cache_init(): cannot create pmd cache");
759
760 if (!SHARED_KERNEL_PMD) {
761 /* If we're in PAE mode and have a non-shared
762 kernel pmd, then the pgd size must be a
763 page size. This is because the pgd_list
764 links through the page structure, so there
765 can only be one pgd per page for this to
766 work. */
767 pgd_size = PAGE_SIZE;
768 }
712 } 769 }
713 pgd_cache = kmem_cache_create("pgd", 770 pgd_cache = kmem_cache_create("pgd",
714 PTRS_PER_PGD*sizeof(pgd_t), 771 pgd_size,
715 PTRS_PER_PGD*sizeof(pgd_t), 772 pgd_size,
716 0, 773 0,
717 pgd_ctor, 774 pgd_ctor,
718 PTRS_PER_PMD == 1 ? pgd_dtor : NULL); 775 (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL);
719 if (!pgd_cache) 776 if (!pgd_cache)
720 panic("pgtable_cache_init(): Cannot create pgd cache"); 777 panic("pgtable_cache_init(): Cannot create pgd cache");
721} 778}
@@ -751,13 +808,25 @@ static int noinline do_test_wp_bit(void)
751 808
752void mark_rodata_ro(void) 809void mark_rodata_ro(void)
753{ 810{
754 unsigned long addr = (unsigned long)__start_rodata; 811 unsigned long start = PFN_ALIGN(_text);
812 unsigned long size = PFN_ALIGN(_etext) - start;
755 813
756 for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) 814#ifdef CONFIG_HOTPLUG_CPU
757 change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); 815 /* It must still be possible to apply SMP alternatives. */
816 if (num_possible_cpus() <= 1)
817#endif
818 {
819 change_page_attr(virt_to_page(start),
820 size >> PAGE_SHIFT, PAGE_KERNEL_RX);
821 printk("Write protecting the kernel text: %luk\n", size >> 10);
822 }
758 823
759 printk("Write protecting the kernel read-only data: %uk\n", 824 start += size;
760 (__end_rodata - __start_rodata) >> 10); 825 size = (unsigned long)__end_rodata - start;
826 change_page_attr(virt_to_page(start),
827 size >> PAGE_SHIFT, PAGE_KERNEL_RO);
828 printk("Write protecting the kernel read-only data: %luk\n",
829 size >> 10);
761 830
762 /* 831 /*
763 * change_page_attr() requires a global_flush_tlb() call after it. 832 * change_page_attr() requires a global_flush_tlb() call after it.
@@ -774,26 +843,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
774 unsigned long addr; 843 unsigned long addr;
775 844
776 for (addr = begin; addr < end; addr += PAGE_SIZE) { 845 for (addr = begin; addr < end; addr += PAGE_SIZE) {
777 ClearPageReserved(virt_to_page(addr)); 846 struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
778 init_page_count(virt_to_page(addr)); 847 ClearPageReserved(page);
779 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); 848 init_page_count(page);
780 free_page(addr); 849 memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
850 __free_page(page);
781 totalram_pages++; 851 totalram_pages++;
782 } 852 }
783 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); 853 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
784} 854}
785 855
786void free_initmem(void) 856void free_initmem(void)
787{ 857{
788 free_init_pages("unused kernel memory", 858 free_init_pages("unused kernel memory",
789 (unsigned long)(&__init_begin), 859 __pa_symbol(&__init_begin),
790 (unsigned long)(&__init_end)); 860 __pa_symbol(&__init_end));
791} 861}
792 862
793#ifdef CONFIG_BLK_DEV_INITRD 863#ifdef CONFIG_BLK_DEV_INITRD
794void free_initrd_mem(unsigned long start, unsigned long end) 864void free_initrd_mem(unsigned long start, unsigned long end)
795{ 865{
796 free_init_pages("initrd memory", start, end); 866 free_init_pages("initrd memory", __pa(start), __pa(end));
797} 867}
798#endif 868#endif
799 869
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 412ebbd8adb..47bd477c8ec 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
91 unsigned long flags; 91 unsigned long flags;
92 92
93 set_pte_atomic(kpte, pte); /* change init_mm */ 93 set_pte_atomic(kpte, pte); /* change init_mm */
94 if (PTRS_PER_PMD > 1) 94 if (SHARED_KERNEL_PMD)
95 return; 95 return;
96 96
97 spin_lock_irqsave(&pgd_lock, flags); 97 spin_lock_irqsave(&pgd_lock, flags);
@@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
142 return -EINVAL; 142 return -EINVAL;
143 kpte_page = virt_to_page(kpte); 143 kpte_page = virt_to_page(kpte);
144 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 144 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
145 if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 145 if (!pte_huge(*kpte)) {
146 set_pte_atomic(kpte, mk_pte(page, prot)); 146 set_pte_atomic(kpte, mk_pte(page, prot));
147 } else { 147 } else {
148 pgprot_t ref_prot; 148 pgprot_t ref_prot;
@@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
158 kpte_page = split; 158 kpte_page = split;
159 } 159 }
160 page_private(kpte_page)++; 160 page_private(kpte_page)++;
161 } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 161 } else if (!pte_huge(*kpte)) {
162 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); 162 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
163 BUG_ON(page_private(kpte_page) == 0); 163 BUG_ON(page_private(kpte_page) == 0);
164 page_private(kpte_page)--; 164 page_private(kpte_page)--;
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index fa0cfbd551e..9a96c164742 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -144,10 +144,8 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
144} 144}
145 145
146static int fixmaps; 146static int fixmaps;
147#ifndef CONFIG_COMPAT_VDSO
148unsigned long __FIXADDR_TOP = 0xfffff000; 147unsigned long __FIXADDR_TOP = 0xfffff000;
149EXPORT_SYMBOL(__FIXADDR_TOP); 148EXPORT_SYMBOL(__FIXADDR_TOP);
150#endif
151 149
152void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) 150void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
153{ 151{
@@ -173,12 +171,8 @@ void reserve_top_address(unsigned long reserve)
173 BUG_ON(fixmaps > 0); 171 BUG_ON(fixmaps > 0);
174 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 172 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
175 (int)-reserve); 173 (int)-reserve);
176#ifdef CONFIG_COMPAT_VDSO
177 BUG_ON(reserve != 0);
178#else
179 __FIXADDR_TOP = -reserve - PAGE_SIZE; 174 __FIXADDR_TOP = -reserve - PAGE_SIZE;
180 __VMALLOC_RESERVE += reserve; 175 __VMALLOC_RESERVE += reserve;
181#endif
182} 176}
183 177
184pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 178pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -238,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd)
238 set_page_private(next, (unsigned long)pprev); 232 set_page_private(next, (unsigned long)pprev);
239} 233}
240 234
235#if (PTRS_PER_PMD == 1)
236/* Non-PAE pgd constructor */
241void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) 237void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
242{ 238{
243 unsigned long flags; 239 unsigned long flags;
244 240
245 if (PTRS_PER_PMD == 1) { 241 /* !PAE, no pagetable sharing */
246 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); 242 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
247 spin_lock_irqsave(&pgd_lock, flags); 243
248 } 244 spin_lock_irqsave(&pgd_lock, flags);
249 245
246 /* must happen under lock */
250 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, 247 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
251 swapper_pg_dir + USER_PTRS_PER_PGD, 248 swapper_pg_dir + USER_PTRS_PER_PGD,
252 KERNEL_PGD_PTRS); 249 KERNEL_PGD_PTRS);
253
254 if (PTRS_PER_PMD > 1)
255 return;
256
257 /* must happen under lock */
258 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, 250 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
259 __pa(swapper_pg_dir) >> PAGE_SHIFT, 251 __pa(swapper_pg_dir) >> PAGE_SHIFT,
260 USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); 252 USER_PTRS_PER_PGD,
261 253 KERNEL_PGD_PTRS);
262 pgd_list_add(pgd); 254 pgd_list_add(pgd);
263 spin_unlock_irqrestore(&pgd_lock, flags); 255 spin_unlock_irqrestore(&pgd_lock, flags);
264} 256}
257#else /* PTRS_PER_PMD > 1 */
258/* PAE pgd constructor */
259void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
260{
261 /* PAE, kernel PMD may be shared */
262
263 if (SHARED_KERNEL_PMD) {
264 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
265 swapper_pg_dir + USER_PTRS_PER_PGD,
266 KERNEL_PGD_PTRS);
267 } else {
268 unsigned long flags;
269
270 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
271 spin_lock_irqsave(&pgd_lock, flags);
272 pgd_list_add(pgd);
273 spin_unlock_irqrestore(&pgd_lock, flags);
274 }
275}
276#endif /* PTRS_PER_PMD */
265 277
266/* never called when PTRS_PER_PMD > 1 */
267void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) 278void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
268{ 279{
269 unsigned long flags; /* can be called from interrupt context */ 280 unsigned long flags; /* can be called from interrupt context */
270 281
282 BUG_ON(SHARED_KERNEL_PMD);
283
271 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); 284 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
272 spin_lock_irqsave(&pgd_lock, flags); 285 spin_lock_irqsave(&pgd_lock, flags);
273 pgd_list_del(pgd); 286 pgd_list_del(pgd);
274 spin_unlock_irqrestore(&pgd_lock, flags); 287 spin_unlock_irqrestore(&pgd_lock, flags);
275} 288}
276 289
290#define UNSHARED_PTRS_PER_PGD \
291 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
292
293/* If we allocate a pmd for part of the kernel address space, then
294 make sure its initialized with the appropriate kernel mappings.
295 Otherwise use a cached zeroed pmd. */
296static pmd_t *pmd_cache_alloc(int idx)
297{
298 pmd_t *pmd;
299
300 if (idx >= USER_PTRS_PER_PGD) {
301 pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
302
303 if (pmd)
304 memcpy(pmd,
305 (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
306 sizeof(pmd_t) * PTRS_PER_PMD);
307 } else
308 pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
309
310 return pmd;
311}
312
313static void pmd_cache_free(pmd_t *pmd, int idx)
314{
315 if (idx >= USER_PTRS_PER_PGD)
316 free_page((unsigned long)pmd);
317 else
318 kmem_cache_free(pmd_cache, pmd);
319}
320
277pgd_t *pgd_alloc(struct mm_struct *mm) 321pgd_t *pgd_alloc(struct mm_struct *mm)
278{ 322{
279 int i; 323 int i;
@@ -282,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
282 if (PTRS_PER_PMD == 1 || !pgd) 326 if (PTRS_PER_PMD == 1 || !pgd)
283 return pgd; 327 return pgd;
284 328
285 for (i = 0; i < USER_PTRS_PER_PGD; ++i) { 329 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
286 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); 330 pmd_t *pmd = pmd_cache_alloc(i);
331
287 if (!pmd) 332 if (!pmd)
288 goto out_oom; 333 goto out_oom;
334
289 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); 335 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
290 set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); 336 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
291 } 337 }
@@ -296,7 +342,7 @@ out_oom:
296 pgd_t pgdent = pgd[i]; 342 pgd_t pgdent = pgd[i];
297 void* pmd = (void *)__va(pgd_val(pgdent)-1); 343 void* pmd = (void *)__va(pgd_val(pgdent)-1);
298 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); 344 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
299 kmem_cache_free(pmd_cache, pmd); 345 pmd_cache_free(pmd, i);
300 } 346 }
301 kmem_cache_free(pgd_cache, pgd); 347 kmem_cache_free(pgd_cache, pgd);
302 return NULL; 348 return NULL;
@@ -308,11 +354,11 @@ void pgd_free(pgd_t *pgd)
308 354
309 /* in the PAE case user pgd entries are overwritten before usage */ 355 /* in the PAE case user pgd entries are overwritten before usage */
310 if (PTRS_PER_PMD > 1) 356 if (PTRS_PER_PMD > 1)
311 for (i = 0; i < USER_PTRS_PER_PGD; ++i) { 357 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
312 pgd_t pgdent = pgd[i]; 358 pgd_t pgdent = pgd[i];
313 void* pmd = (void *)__va(pgd_val(pgdent)-1); 359 void* pmd = (void *)__va(pgd_val(pgdent)-1);
314 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); 360 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
315 kmem_cache_free(pmd_cache, pmd); 361 pmd_cache_free(pmd, i);
316 } 362 }
317 /* in the non-PAE case, free_pgtables() clears user pgd entries */ 363 /* in the non-PAE case, free_pgtables() clears user pgd entries */
318 kmem_cache_free(pgd_cache, pgd); 364 kmem_cache_free(pgd_cache, pgd);
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 8fda7be9dd4..695f737516a 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -414,6 +414,10 @@ int __init op_nmi_init(struct oprofile_operations *ops)
414 user space an consistent name. */ 414 user space an consistent name. */
415 cpu_type = "x86-64/hammer"; 415 cpu_type = "x86-64/hammer";
416 break; 416 break;
417 case 0x10:
418 model = &op_athlon_spec;
419 cpu_type = "x86-64/family10";
420 break;
417 } 421 }
418 break; 422 break;
419 423
diff --git a/arch/i386/pci/init.c b/arch/i386/pci/init.c
index b21b6da8ab1..1cf11af96de 100644
--- a/arch/i386/pci/init.c
+++ b/arch/i386/pci/init.c
@@ -6,7 +6,7 @@
6 in the right sequence from here. */ 6 in the right sequence from here. */
7static __init int pci_access_init(void) 7static __init int pci_access_init(void)
8{ 8{
9 int type = 0; 9 int type __attribute__((unused)) = 0;
10 10
11#ifdef CONFIG_PCI_DIRECT 11#ifdef CONFIG_PCI_DIRECT
12 type = pci_direct_probe(); 12 type = pci_direct_probe();
diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c
index 747d8c63b0c..c7cabeed4d7 100644
--- a/arch/i386/pci/mmconfig-shared.c
+++ b/arch/i386/pci/mmconfig-shared.c
@@ -60,14 +60,19 @@ static const char __init *pci_mmcfg_e7520(void)
60 u32 win; 60 u32 win;
61 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); 61 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win);
62 62
63 pci_mmcfg_config_num = 1; 63 win = win & 0xf000;
64 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); 64 if(win == 0x0000 || win == 0xf000)
65 if (!pci_mmcfg_config) 65 pci_mmcfg_config_num = 0;
66 return NULL; 66 else {
67 pci_mmcfg_config[0].address = (win & 0xf000) << 16; 67 pci_mmcfg_config_num = 1;
68 pci_mmcfg_config[0].pci_segment = 0; 68 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
69 pci_mmcfg_config[0].start_bus_number = 0; 69 if (!pci_mmcfg_config)
70 pci_mmcfg_config[0].end_bus_number = 255; 70 return NULL;
71 pci_mmcfg_config[0].address = win << 16;
72 pci_mmcfg_config[0].pci_segment = 0;
73 pci_mmcfg_config[0].start_bus_number = 0;
74 pci_mmcfg_config[0].end_bus_number = 255;
75 }
71 76
72 return "Intel Corporation E7520 Memory Controller Hub"; 77 return "Intel Corporation E7520 Memory Controller Hub";
73} 78}
@@ -108,6 +113,10 @@ static const char __init *pci_mmcfg_intel_945(void)
108 if ((pciexbar & mask) & 0x0fffffffU) 113 if ((pciexbar & mask) & 0x0fffffffU)
109 pci_mmcfg_config_num = 0; 114 pci_mmcfg_config_num = 0;
110 115
116 /* Don't hit the APIC registers and their friends */
117 if ((pciexbar & mask) >= 0xf0000000U)
118 pci_mmcfg_config_num = 0;
119
111 if (pci_mmcfg_config_num) { 120 if (pci_mmcfg_config_num) {
112 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); 121 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
113 if (!pci_mmcfg_config) 122 if (!pci_mmcfg_config)
diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c
index 2c15500f871..998fd3ec0d6 100644
--- a/arch/i386/power/cpu.c
+++ b/arch/i386/power/cpu.c
@@ -21,6 +21,7 @@ unsigned long saved_context_eflags;
21 21
22void __save_processor_state(struct saved_context *ctxt) 22void __save_processor_state(struct saved_context *ctxt)
23{ 23{
24 mtrr_save_fixed_ranges(NULL);
24 kernel_fpu_begin(); 25 kernel_fpu_begin();
25 26
26 /* 27 /*
diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c
index db5e98d2eb7..a0020b913f3 100644
--- a/arch/i386/power/suspend.c
+++ b/arch/i386/power/suspend.c
@@ -16,6 +16,9 @@
16/* Defined in arch/i386/power/swsusp.S */ 16/* Defined in arch/i386/power/swsusp.S */
17extern int restore_image(void); 17extern int restore_image(void);
18 18
19/* References to section boundaries */
20extern const void __nosave_begin, __nosave_end;
21
19/* Pointer to the temporary resume page tables */ 22/* Pointer to the temporary resume page tables */
20pgd_t *resume_pg_dir; 23pgd_t *resume_pg_dir;
21 24
@@ -156,3 +159,14 @@ int swsusp_arch_resume(void)
156 restore_image(); 159 restore_image();
157 return 0; 160 return 0;
158} 161}
162
163/*
164 * pfn_is_nosave - check if given pfn is in the 'nosave' section
165 */
166
167int pfn_is_nosave(unsigned long pfn)
168{
169 unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
170 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
171 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
172}
diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S
index 439cc257cd1..6c73bca3f47 100644
--- a/arch/m32r/kernel/vmlinux.lds.S
+++ b/arch/m32r/kernel/vmlinux.lds.S
@@ -110,7 +110,7 @@ SECTIONS
110 __initramfs_end = .; 110 __initramfs_end = .;
111#endif 111#endif
112 112
113 . = ALIGN(32); 113 . = ALIGN(4096);
114 __per_cpu_start = .; 114 __per_cpu_start = .;
115 .data.percpu : { *(.data.percpu) } 115 .data.percpu : { *(.data.percpu) }
116 __per_cpu_end = .; 116 __per_cpu_end = .;
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index c76b793310c..043f637e3d1 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -119,7 +119,7 @@ SECTIONS
119 .init.ramfs : { *(.init.ramfs) } 119 .init.ramfs : { *(.init.ramfs) }
120 __initramfs_end = .; 120 __initramfs_end = .;
121#endif 121#endif
122 . = ALIGN(32); 122 . = ALIGN(_PAGE_SIZE);
123 __per_cpu_start = .; 123 __per_cpu_start = .;
124 .data.percpu : { *(.data.percpu) } 124 .data.percpu : { *(.data.percpu) }
125 __per_cpu_end = .; 125 __per_cpu_end = .;
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 2a8253358c6..c7458599059 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -181,7 +181,7 @@ SECTIONS
181 .init.ramfs : { *(.init.ramfs) } 181 .init.ramfs : { *(.init.ramfs) }
182 __initramfs_end = .; 182 __initramfs_end = .;
183#endif 183#endif
184 . = ALIGN(32); 184 . = ALIGN(ASM_PAGE_SIZE);
185 __per_cpu_start = .; 185 __per_cpu_start = .;
186 .data.percpu : { *(.data.percpu) } 186 .data.percpu : { *(.data.percpu) }
187 __per_cpu_end = .; 187 __per_cpu_end = .;
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index e0fa80eca36..aa693d0f151 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
37obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o 37obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o
38obj-$(CONFIG_TAU) += tau_6xx.o 38obj-$(CONFIG_TAU) += tau_6xx.o
39obj32-$(CONFIG_SOFTWARE_SUSPEND) += swsusp_32.o 39obj32-$(CONFIG_SOFTWARE_SUSPEND) += swsusp_32.o
40obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
40obj32-$(CONFIG_MODULES) += module_32.o 41obj32-$(CONFIG_MODULES) += module_32.o
41 42
42ifeq ($(CONFIG_PPC_MERGE),y) 43ifeq ($(CONFIG_PPC_MERGE),y)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 22083ce3cc3..6018178708a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -582,14 +582,14 @@ void __init setup_per_cpu_areas(void)
582 char *ptr; 582 char *ptr;
583 583
584 /* Copy section for each CPU (we discard the original) */ 584 /* Copy section for each CPU (we discard the original) */
585 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); 585 size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
586#ifdef CONFIG_MODULES 586#ifdef CONFIG_MODULES
587 if (size < PERCPU_ENOUGH_ROOM) 587 if (size < PERCPU_ENOUGH_ROOM)
588 size = PERCPU_ENOUGH_ROOM; 588 size = PERCPU_ENOUGH_ROOM;
589#endif 589#endif
590 590
591 for_each_possible_cpu(i) { 591 for_each_possible_cpu(i) {
592 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); 592 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
593 if (!ptr) 593 if (!ptr)
594 panic("Cannot allocate cpu data for CPU %d\n", i); 594 panic("Cannot allocate cpu data for CPU %d\n", i);
595 595
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
new file mode 100644
index 00000000000..8cee5710754
--- /dev/null
+++ b/arch/powerpc/kernel/suspend.c
@@ -0,0 +1,24 @@
1/*
2 * Suspend support specific for power.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <asm/page.h>
11
12/* References to section boundaries */
13extern const void __nosave_begin, __nosave_end;
14
15/*
16 * pfn_is_nosave - check if given pfn is in the 'nosave' section
17 */
18
19int pfn_is_nosave(unsigned long pfn)
20{
21 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
22 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
23 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
24}
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 7eefeb4a30e..13206731314 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -139,11 +139,7 @@ SECTIONS
139 __initramfs_end = .; 139 __initramfs_end = .;
140 } 140 }
141#endif 141#endif
142#ifdef CONFIG_PPC32 142 . = ALIGN(PAGE_SIZE);
143 . = ALIGN(32);
144#else
145 . = ALIGN(128);
146#endif
147 .data.percpu : { 143 .data.percpu : {
148 __per_cpu_start = .; 144 __per_cpu_start = .;
149 *(.data.percpu) 145 *(.data.percpu)
diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S
index a0625562a44..44cd128fb71 100644
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -130,7 +130,7 @@ SECTIONS
130 __ftr_fixup : { *(__ftr_fixup) } 130 __ftr_fixup : { *(__ftr_fixup) }
131 __stop___ftr_fixup = .; 131 __stop___ftr_fixup = .;
132 132
133 . = ALIGN(32); 133 . = ALIGN(4096);
134 __per_cpu_start = .; 134 __per_cpu_start = .;
135 .data.percpu : { *(.data.percpu) } 135 .data.percpu : { *(.data.percpu) }
136 __per_cpu_end = .; 136 __per_cpu_end = .;
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 418f6426a94..e9d3432aba6 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -107,7 +107,7 @@ SECTIONS
107 . = ALIGN(2); 107 . = ALIGN(2);
108 __initramfs_end = .; 108 __initramfs_end = .;
109#endif 109#endif
110 . = ALIGN(256); 110 . = ALIGN(4096);
111 __per_cpu_start = .; 111 __per_cpu_start = .;
112 .data.percpu : { *(.data.percpu) } 112 .data.percpu : { *(.data.percpu) }
113 __per_cpu_end = .; 113 __per_cpu_end = .;
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 78a6c09875b..2f606d0ce1f 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -54,7 +54,7 @@ SECTIONS
54 . = ALIGN(PAGE_SIZE); 54 . = ALIGN(PAGE_SIZE);
55 .data.page_aligned : { *(.data.page_aligned) } 55 .data.page_aligned : { *(.data.page_aligned) }
56 56
57 . = ALIGN(L1_CACHE_BYTES); 57 . = ALIGN(PAGE_SIZE);
58 __per_cpu_start = .; 58 __per_cpu_start = .;
59 .data.percpu : { *(.data.percpu) } 59 .data.percpu : { *(.data.percpu) }
60 __per_cpu_end = .; 60 __per_cpu_end = .;
diff --git a/arch/sh64/kernel/vmlinux.lds.S b/arch/sh64/kernel/vmlinux.lds.S
index a59c5e99813..4f9616f3983 100644
--- a/arch/sh64/kernel/vmlinux.lds.S
+++ b/arch/sh64/kernel/vmlinux.lds.S
@@ -85,7 +85,7 @@ SECTIONS
85 . = ALIGN(PAGE_SIZE); 85 . = ALIGN(PAGE_SIZE);
86 .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) } 86 .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) }
87 87
88 . = ALIGN(L1_CACHE_BYTES); 88 . = ALIGN(PAGE_SIZE);
89 __per_cpu_start = .; 89 __per_cpu_start = .;
90 .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) } 90 .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
91 __per_cpu_end = . ; 91 __per_cpu_end = . ;
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index e5c24e0521d..f0bb6e60e62 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -65,7 +65,7 @@ SECTIONS
65 __initramfs_end = .; 65 __initramfs_end = .;
66#endif 66#endif
67 67
68 . = ALIGN(32); 68 . = ALIGN(4096);
69 __per_cpu_start = .; 69 __per_cpu_start = .;
70 .data.percpu : { *(.data.percpu) } 70 .data.percpu : { *(.data.percpu) }
71 __per_cpu_end = .; 71 __per_cpu_end = .;
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index d4f0a70f484..1fac215252e 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1343,11 +1343,11 @@ void __init setup_per_cpu_areas(void)
1343 /* Copy section for each CPU (we discard the original) */ 1343 /* Copy section for each CPU (we discard the original) */
1344 goal = PERCPU_ENOUGH_ROOM; 1344 goal = PERCPU_ENOUGH_ROOM;
1345 1345
1346 __per_cpu_shift = 0; 1346 __per_cpu_shift = PAGE_SHIFT;
1347 for (size = 1UL; size < goal; size <<= 1UL) 1347 for (size = PAGE_SIZE; size < goal; size <<= 1UL)
1348 __per_cpu_shift++; 1348 __per_cpu_shift++;
1349 1349
1350 ptr = alloc_bootmem(size * NR_CPUS); 1350 ptr = alloc_bootmem_pages(size * NR_CPUS);
1351 1351
1352 __per_cpu_base = ptr - __per_cpu_start; 1352 __per_cpu_base = ptr - __per_cpu_start;
1353 1353
diff --git a/arch/um/defconfig b/arch/um/defconfig
index 780cc0a4a12..f938fa82214 100644
--- a/arch/um/defconfig
+++ b/arch/um/defconfig
@@ -41,6 +41,7 @@ CONFIG_M686=y
41# CONFIG_MGEODE_LX is not set 41# CONFIG_MGEODE_LX is not set
42# CONFIG_MCYRIXIII is not set 42# CONFIG_MCYRIXIII is not set
43# CONFIG_MVIAC3_2 is not set 43# CONFIG_MVIAC3_2 is not set
44# CONFIG_MVIAC7 is not set
44# CONFIG_X86_GENERIC is not set 45# CONFIG_X86_GENERIC is not set
45CONFIG_X86_CMPXCHG=y 46CONFIG_X86_CMPXCHG=y
46CONFIG_X86_XADD=y 47CONFIG_X86_XADD=y
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index e9b4f058a49..145bb824b2a 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -415,13 +415,13 @@ config OUT_OF_LINE_PFN_TO_PAGE
415 depends on DISCONTIGMEM 415 depends on DISCONTIGMEM
416 416
417config NR_CPUS 417config NR_CPUS
418 int "Maximum number of CPUs (2-256)" 418 int "Maximum number of CPUs (2-255)"
419 range 2 255 419 range 2 255
420 depends on SMP 420 depends on SMP
421 default "8" 421 default "8"
422 help 422 help
423 This allows you to specify the maximum number of CPUs which this 423 This allows you to specify the maximum number of CPUs which this
424 kernel will support. Current maximum is 256 CPUs due to 424 kernel will support. Current maximum is 255 CPUs due to
425 APIC addressing limits. Less depending on the hardware. 425 APIC addressing limits. Less depending on the hardware.
426 426
427 This is purely to save memory - each supported CPU requires 427 This is purely to save memory - each supported CPU requires
@@ -565,23 +565,56 @@ config CRASH_DUMP
565 PHYSICAL_START. 565 PHYSICAL_START.
566 For more details see Documentation/kdump/kdump.txt 566 For more details see Documentation/kdump/kdump.txt
567 567
568config RELOCATABLE
569 bool "Build a relocatable kernel(EXPERIMENTAL)"
570 depends on EXPERIMENTAL
571 help
572 Builds a relocatable kernel. This enables loading and running
573 a kernel binary from a different physical address than it has
574 been compiled for.
575
576 One use is for the kexec on panic case where the recovery kernel
577 must live at a different physical address than the primary
578 kernel.
579
580 Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
581 it has been loaded at and the compile time physical address
582 (CONFIG_PHYSICAL_START) is ignored.
583
568config PHYSICAL_START 584config PHYSICAL_START
569 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 585 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
570 default "0x1000000" if CRASH_DUMP
571 default "0x200000" 586 default "0x200000"
572 help 587 help
573 This gives the physical address where the kernel is loaded. Normally 588 This gives the physical address where the kernel is loaded. It
574 for regular kernels this value is 0x200000 (2MB). But in the case 589 should be aligned to 2MB boundary.
575 of kexec on panic the fail safe kernel needs to run at a different 590
576 address than the panic-ed kernel. This option is used to set the load 591 If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then
577 address for kernels used to capture crash dump on being kexec'ed 592 bzImage will decompress itself to above physical address and
578 after panic. The default value for crash dump kernels is 593 run from there. Otherwise, bzImage will run from the address where
579 0x1000000 (16MB). This can also be set based on the "X" value as 594 it has been loaded by the boot loader and will ignore above physical
595 address.
596
597 In normal kdump cases one does not have to set/change this option
598 as now bzImage can be compiled as a completely relocatable image
599 (CONFIG_RELOCATABLE=y) and be used to load and run from a different
600 address. This option is mainly useful for the folks who don't want
601 to use a bzImage for capturing the crash dump and want to use a
602 vmlinux instead.
603
604 So if you are using bzImage for capturing the crash dump, leave
605 the value here unchanged to 0x200000 and set CONFIG_RELOCATABLE=y.
606 Otherwise if you plan to use vmlinux for capturing the crash dump
607 change this value to start of the reserved region (Typically 16MB
608 0x1000000). In other words, it can be set based on the "X" value as
580 specified in the "crashkernel=YM@XM" command line boot parameter 609 specified in the "crashkernel=YM@XM" command line boot parameter
581 passed to the panic-ed kernel. Typically this parameter is set as 610 passed to the panic-ed kernel. Typically this parameter is set as
582 crashkernel=64M@16M. Please take a look at 611 crashkernel=64M@16M. Please take a look at
583 Documentation/kdump/kdump.txt for more details about crash dumps. 612 Documentation/kdump/kdump.txt for more details about crash dumps.
584 613
614 Usage of bzImage for capturing the crash dump is advantageous as
615 one does not have to build two kernels. Same kernel can be used
616 as production kernel and capture kernel.
617
585 Don't change this unless you know what you are doing. 618 Don't change this unless you know what you are doing.
586 619
587config SECCOMP 620config SECCOMP
@@ -627,14 +660,6 @@ config CC_STACKPROTECTOR_ALL
627 660
628source kernel/Kconfig.hz 661source kernel/Kconfig.hz
629 662
630config REORDER
631 bool "Function reordering"
632 default n
633 help
634 This option enables the toolchain to reorder functions for a more
635 optimal TLB usage. If you have pretty much any version of binutils,
636 this can increase your kernel build time by roughly one minute.
637
638config K8_NB 663config K8_NB
639 def_bool y 664 def_bool y
640 depends on AGP_AMD64 || IOMMU || (PCI && NUMA) 665 depends on AGP_AMD64 || IOMMU || (PCI && NUMA)
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 2941a915d4e..29617ae3926 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -40,10 +40,6 @@ cflags-y += -m64
40cflags-y += -mno-red-zone 40cflags-y += -mno-red-zone
41cflags-y += -mcmodel=kernel 41cflags-y += -mcmodel=kernel
42cflags-y += -pipe 42cflags-y += -pipe
43cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections
44# this makes reading assembly source easier, but produces worse code
45# actually it makes the kernel smaller too.
46cflags-y += -fno-reorder-blocks
47cflags-y += -Wno-sign-compare 43cflags-y += -Wno-sign-compare
48cflags-y += -fno-asynchronous-unwind-tables 44cflags-y += -fno-asynchronous-unwind-tables
49ifneq ($(CONFIG_DEBUG_INFO),y) 45ifneq ($(CONFIG_DEBUG_INFO),y)
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
index deb063e7762..ee6f6505f95 100644
--- a/arch/x86_64/boot/Makefile
+++ b/arch/x86_64/boot/Makefile
@@ -36,7 +36,7 @@ subdir- := compressed/ #Let make clean descend in compressed/
36# --------------------------------------------------------------------------- 36# ---------------------------------------------------------------------------
37 37
38$(obj)/bzImage: IMAGE_OFFSET := 0x100000 38$(obj)/bzImage: IMAGE_OFFSET := 0x100000
39$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 39$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
40$(obj)/bzImage: BUILDFLAGS := -b 40$(obj)/bzImage: BUILDFLAGS := -b
41 41
42quiet_cmd_image = BUILD $@ 42quiet_cmd_image = BUILD $@
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
index e70fa6e1da0..705a3e33d7e 100644
--- a/arch/x86_64/boot/compressed/Makefile
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -8,16 +8,14 @@
8 8
9targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o 9targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
10EXTRA_AFLAGS := -traditional 10EXTRA_AFLAGS := -traditional
11AFLAGS := $(subst -m64,-m32,$(AFLAGS))
12 11
13# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with 12# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
14# -m32 13# -m32
15CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing 14CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin
16LDFLAGS := -m elf_i386 15LDFLAGS := -m elf_x86_64
17 16
18LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 17LDFLAGS_vmlinux := -T
19 18$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
20$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
21 $(call if_changed,ld) 19 $(call if_changed,ld)
22 @: 20 @:
23 21
@@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
27$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 25$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
28 $(call if_changed,gzip) 26 $(call if_changed,gzip)
29 27
30LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T 28LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
31 29
32$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 30$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
33 $(call if_changed,ld) 31 $(call if_changed,ld)
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 6f55565e4d4..f9d5692a010 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -26,116 +26,279 @@
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/segment.h> 28#include <asm/segment.h>
29#include <asm/pgtable.h>
29#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/msr.h>
30 32
33.section ".text.head"
31 .code32 34 .code32
32 .globl startup_32 35 .globl startup_32
33 36
34startup_32: 37startup_32:
35 cld 38 cld
36 cli 39 cli
37 movl $(__KERNEL_DS),%eax 40 movl $(__KERNEL_DS), %eax
38 movl %eax,%ds 41 movl %eax, %ds
39 movl %eax,%es 42 movl %eax, %es
40 movl %eax,%fs 43 movl %eax, %ss
41 movl %eax,%gs 44
42 45/* Calculate the delta between where we were compiled to run
43 lss stack_start,%esp 46 * at and where we were actually loaded at. This can only be done
44 xorl %eax,%eax 47 * with a short local call on x86. Nothing else will tell us what
451: incl %eax # check that A20 really IS enabled 48 * address we are running at. The reserved chunk of the real-mode
46 movl %eax,0x000000 # loop forever if it isn't 49 * data at 0x34-0x3f are used as the stack for this calculation.
47 cmpl %eax,0x100000 50 * Only 4 bytes are needed.
48 je 1b 51 */
52 leal 0x40(%esi), %esp
53 call 1f
541: popl %ebp
55 subl $1b, %ebp
56
57/* setup a stack and make sure cpu supports long mode. */
58 movl $user_stack_end, %eax
59 addl %ebp, %eax
60 movl %eax, %esp
61
62 call verify_cpu
63 testl %eax, %eax
64 jnz no_longmode
65
66/* Compute the delta between where we were compiled to run at
67 * and where the code will actually run at.
68 */
69/* %ebp contains the address we are loaded at by the boot loader and %ebx
70 * contains the address where we should move the kernel image temporarily
71 * for safe in-place decompression.
72 */
73
74#ifdef CONFIG_RELOCATABLE
75 movl %ebp, %ebx
76 addl $(LARGE_PAGE_SIZE -1), %ebx
77 andl $LARGE_PAGE_MASK, %ebx
78#else
79 movl $CONFIG_PHYSICAL_START, %ebx
80#endif
81
82 /* Replace the compressed data size with the uncompressed size */
83 subl input_len(%ebp), %ebx
84 movl output_len(%ebp), %eax
85 addl %eax, %ebx
86 /* Add 8 bytes for every 32K input block */
87 shrl $12, %eax
88 addl %eax, %ebx
89 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
90 addl $(32768 + 18 + 4095), %ebx
91 andl $~4095, %ebx
49 92
50/* 93/*
51 * Initialize eflags. Some BIOS's leave bits like NT set. This would 94 * Prepare for entering 64 bit mode
52 * confuse the debugger if this code is traced.
53 * XXX - best to initialize before switching to protected mode.
54 */ 95 */
55 pushl $0 96
56 popfl 97 /* Load new GDT with the 64bit segments using 32bit descriptor */
98 leal gdt(%ebp), %eax
99 movl %eax, gdt+2(%ebp)
100 lgdt gdt(%ebp)
101
102 /* Enable PAE mode */
103 xorl %eax, %eax
104 orl $(1 << 5), %eax
105 movl %eax, %cr4
106
107 /*
108 * Build early 4G boot pagetable
109 */
110 /* Initialize Page tables to 0*/
111 leal pgtable(%ebx), %edi
112 xorl %eax, %eax
113 movl $((4096*6)/4), %ecx
114 rep stosl
115
116 /* Build Level 4 */
117 leal pgtable + 0(%ebx), %edi
118 leal 0x1007 (%edi), %eax
119 movl %eax, 0(%edi)
120
121 /* Build Level 3 */
122 leal pgtable + 0x1000(%ebx), %edi
123 leal 0x1007(%edi), %eax
124 movl $4, %ecx
1251: movl %eax, 0x00(%edi)
126 addl $0x00001000, %eax
127 addl $8, %edi
128 decl %ecx
129 jnz 1b
130
131 /* Build Level 2 */
132 leal pgtable + 0x2000(%ebx), %edi
133 movl $0x00000183, %eax
134 movl $2048, %ecx
1351: movl %eax, 0(%edi)
136 addl $0x00200000, %eax
137 addl $8, %edi
138 decl %ecx
139 jnz 1b
140
141 /* Enable the boot page tables */
142 leal pgtable(%ebx), %eax
143 movl %eax, %cr3
144
145 /* Enable Long mode in EFER (Extended Feature Enable Register) */
146 movl $MSR_EFER, %ecx
147 rdmsr
148 btsl $_EFER_LME, %eax
149 wrmsr
150
151 /* Setup for the jump to 64bit mode
152 *
153 * When the jump is performed we will be in long mode but
154 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
155 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use
156 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
157 * We place all of the values on our mini stack so lret can
158 * used to perform that far jump.
159 */
160 pushl $__KERNEL_CS
161 leal startup_64(%ebp), %eax
162 pushl %eax
163
164 /* Enter paged protected Mode, activating Long Mode */
165 movl $0x80000001, %eax /* Enable Paging and Protected mode */
166 movl %eax, %cr0
167
168 /* Jump from 32bit compatibility mode into 64bit mode. */
169 lret
170
171no_longmode:
172 /* This isn't an x86-64 CPU so hang */
1731:
174 hlt
175 jmp 1b
176
177#include "../../kernel/verify_cpu.S"
178
179 /* Be careful here startup_64 needs to be at a predictable
180 * address so I can export it in an ELF header. Bootloaders
181 * should look at the ELF header to find this address, as
182 * it may change in the future.
183 */
184 .code64
185 .org 0x200
186ENTRY(startup_64)
187 /* We come here either from startup_32 or directly from a
188 * 64bit bootloader. If we come here from a bootloader we depend on
189 * an identity mapped page table being provied that maps our
190 * entire text+data+bss and hopefully all of memory.
191 */
192
193 /* Setup data segments. */
194 xorl %eax, %eax
195 movl %eax, %ds
196 movl %eax, %es
197 movl %eax, %ss
198
199 /* Compute the decompressed kernel start address. It is where
200 * we were loaded at aligned to a 2M boundary. %rbp contains the
201 * decompressed kernel start address.
202 *
203 * If it is a relocatable kernel then decompress and run the kernel
204 * from load address aligned to 2MB addr, otherwise decompress and
205 * run the kernel from CONFIG_PHYSICAL_START
206 */
207
208 /* Start with the delta to where the kernel will run at. */
209#ifdef CONFIG_RELOCATABLE
210 leaq startup_32(%rip) /* - $startup_32 */, %rbp
211 addq $(LARGE_PAGE_SIZE - 1), %rbp
212 andq $LARGE_PAGE_MASK, %rbp
213 movq %rbp, %rbx
214#else
215 movq $CONFIG_PHYSICAL_START, %rbp
216 movq %rbp, %rbx
217#endif
218
219 /* Replace the compressed data size with the uncompressed size */
220 movl input_len(%rip), %eax
221 subq %rax, %rbx
222 movl output_len(%rip), %eax
223 addq %rax, %rbx
224 /* Add 8 bytes for every 32K input block */
225 shrq $12, %rax
226 addq %rax, %rbx
227 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
228 addq $(32768 + 18 + 4095), %rbx
229 andq $~4095, %rbx
230
231/* Copy the compressed kernel to the end of our buffer
232 * where decompression in place becomes safe.
233 */
234 leaq _end(%rip), %r8
235 leaq _end(%rbx), %r9
236 movq $_end /* - $startup_32 */, %rcx
2371: subq $8, %r8
238 subq $8, %r9
239 movq 0(%r8), %rax
240 movq %rax, 0(%r9)
241 subq $8, %rcx
242 jnz 1b
243
244/*
245 * Jump to the relocated address.
246 */
247 leaq relocated(%rbx), %rax
248 jmp *%rax
249
250.section ".text"
251relocated:
252
57/* 253/*
58 * Clear BSS 254 * Clear BSS
59 */ 255 */
60 xorl %eax,%eax 256 xorq %rax, %rax
61 movl $_edata,%edi 257 leaq _edata(%rbx), %rdi
62 movl $_end,%ecx 258 leaq _end(%rbx), %rcx
63 subl %edi,%ecx 259 subq %rdi, %rcx
64 cld 260 cld
65 rep 261 rep
66 stosb 262 stosb
263
264 /* Setup the stack */
265 leaq user_stack_end(%rip), %rsp
266
267 /* zero EFLAGS after setting rsp */
268 pushq $0
269 popfq
270
67/* 271/*
68 * Do the decompression, and jump to the new kernel.. 272 * Do the decompression, and jump to the new kernel..
69 */ 273 */
70 subl $16,%esp # place for structure on the stack 274 pushq %rsi # Save the real mode argument
71 movl %esp,%eax 275 movq %rsi, %rdi # real mode address
72 pushl %esi # real mode pointer as second arg 276 leaq _heap(%rip), %rsi # _heap
73 pushl %eax # address of structure as first arg 277 leaq input_data(%rip), %rdx # input_data
74 call decompress_kernel 278 movl input_len(%rip), %eax
75 orl %eax,%eax 279 movq %rax, %rcx # input_len
76 jnz 3f 280 movq %rbp, %r8 # output
77 addl $8,%esp 281 call decompress_kernel
78 xorl %ebx,%ebx 282 popq %rsi
79 ljmp $(__KERNEL_CS), $__PHYSICAL_START
80 283
81/*
82 * We come here, if we were loaded high.
83 * We need to move the move-in-place routine down to 0x1000
84 * and then start it with the buffer addresses in registers,
85 * which we got from the stack.
86 */
873:
88 movl %esi,%ebx
89 movl $move_routine_start,%esi
90 movl $0x1000,%edi
91 movl $move_routine_end,%ecx
92 subl %esi,%ecx
93 addl $3,%ecx
94 shrl $2,%ecx
95 cld
96 rep
97 movsl
98
99 popl %esi # discard the address
100 addl $4,%esp # real mode pointer
101 popl %esi # low_buffer_start
102 popl %ecx # lcount
103 popl %edx # high_buffer_start
104 popl %eax # hcount
105 movl $__PHYSICAL_START,%edi
106 cli # make sure we don't get interrupted
107 ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
108 284
109/* 285/*
110 * Routine (template) for moving the decompressed kernel in place, 286 * Jump to the decompressed kernel.
111 * if we were high loaded. This _must_ PIC-code !
112 */ 287 */
113move_routine_start: 288 jmp *%rbp
114 movl %ecx,%ebp
115 shrl $2,%ecx
116 rep
117 movsl
118 movl %ebp,%ecx
119 andl $3,%ecx
120 rep
121 movsb
122 movl %edx,%esi
123 movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0
124 addl $3,%ecx
125 shrl $2,%ecx
126 rep
127 movsl
128 movl %ebx,%esi # Restore setup pointer
129 xorl %ebx,%ebx
130 ljmp $(__KERNEL_CS), $__PHYSICAL_START
131move_routine_end:
132 289
133 290 .data
134/* Stack for uncompression */ 291gdt:
135 .align 32 292 .word gdt_end - gdt
136user_stack: 293 .long gdt
294 .word 0
295 .quad 0x0000000000000000 /* NULL descriptor */
296 .quad 0x00af9a000000ffff /* __KERNEL_CS */
297 .quad 0x00cf92000000ffff /* __KERNEL_DS */
298gdt_end:
299 .bss
300/* Stack for uncompression */
301 .balign 4
302user_stack:
137 .fill 4096,4,0 303 .fill 4096,4,0
138stack_start: 304user_stack_end:
139 .long user_stack+4096
140 .word __KERNEL_DS
141
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index 3755b2e394d..f932b0e8909 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -9,10 +9,95 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12#define _LINUX_STRING_H_ 1
13#define __LINUX_BITMAP_H 1
14
15#include <linux/linkage.h>
12#include <linux/screen_info.h> 16#include <linux/screen_info.h>
13#include <asm/io.h> 17#include <asm/io.h>
14#include <asm/page.h> 18#include <asm/page.h>
15 19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
27 * Getting to provable safe in place decompression is hard.
28 * Worst case behaviours need to be analyzed.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
43 * Files divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
47 * 1 block occurs every 32K -1 bytes or when 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
78 * The worst case can at least be bounded by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
84 * per 32767 bytes of data is sufficient. To avoid problems internal to a block
85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
86 * sufficient, to ensure that in the worst case the decompressed data for
87 * a block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
95 * Adding the decompressor_size is necessary as it must live after all
96 * of the data as well. Last I measured the decompressor is about 14K.
97 * 10K of actual data and 4K of bss.
98 *
99 */
100
16/* 101/*
17 * gzip declarations 102 * gzip declarations
18 */ 103 */
@@ -28,15 +113,20 @@ typedef unsigned char uch;
28typedef unsigned short ush; 113typedef unsigned short ush;
29typedef unsigned long ulg; 114typedef unsigned long ulg;
30 115
31#define WSIZE 0x8000 /* Window size must be at least 32k, */ 116#define WSIZE 0x80000000 /* Window size must be at least 32k,
32 /* and a power of two */ 117 * and a power of two
118 * We don't actually have a window just
119 * a huge output buffer so I report
120 * a 2G windows size, as that should
121 * always be larger than our output buffer.
122 */
33 123
34static uch *inbuf; /* input buffer */ 124static uch *inbuf; /* input buffer */
35static uch window[WSIZE]; /* Sliding window buffer */ 125static uch *window; /* Sliding window buffer, (and final output buffer) */
36 126
37static unsigned insize = 0; /* valid bytes in inbuf */ 127static unsigned insize; /* valid bytes in inbuf */
38static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ 128static unsigned inptr; /* index of next byte to be processed in inbuf */
39static unsigned outcnt = 0; /* bytes in output buffer */ 129static unsigned outcnt; /* bytes in output buffer */
40 130
41/* gzip flag byte */ 131/* gzip flag byte */
42#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ 132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
@@ -87,8 +177,6 @@ extern unsigned char input_data[];
87extern int input_len; 177extern int input_len;
88 178
89static long bytes_out = 0; 179static long bytes_out = 0;
90static uch *output_data;
91static unsigned long output_ptr = 0;
92 180
93static void *malloc(int size); 181static void *malloc(int size);
94static void free(void *where); 182static void free(void *where);
@@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n);
98 186
99static void putstr(const char *); 187static void putstr(const char *);
100 188
101extern int end; 189static long free_mem_ptr;
102static long free_mem_ptr = (long)&end;
103static long free_mem_end_ptr; 190static long free_mem_end_ptr;
104 191
105#define INPLACE_MOVE_ROUTINE 0x1000 192#define HEAP_SIZE 0x7000
106#define LOW_BUFFER_START 0x2000
107#define LOW_BUFFER_MAX 0x90000
108#define HEAP_SIZE 0x3000
109static unsigned int low_buffer_end, low_buffer_size;
110static int high_loaded =0;
111static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
112 193
113static char *vidmem = (char *)0xb8000; 194static char *vidmem = (char *)0xb8000;
114static int vidport; 195static int vidport;
@@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
218 */ 299 */
219static int fill_inbuf(void) 300static int fill_inbuf(void)
220{ 301{
221 if (insize != 0) { 302 error("ran out of input data");
222 error("ran out of input data"); 303 return 0;
223 }
224
225 inbuf = input_data;
226 insize = input_len;
227 inptr = 1;
228 return inbuf[0];
229} 304}
230 305
231/* =========================================================================== 306/* ===========================================================================
232 * Write the output window window[0..outcnt-1] and update crc and bytes_out. 307 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
233 * (Used for the decompressed data only.) 308 * (Used for the decompressed data only.)
234 */ 309 */
235static void flush_window_low(void)
236{
237 ulg c = crc; /* temporary variable */
238 unsigned n;
239 uch *in, *out, ch;
240
241 in = window;
242 out = &output_data[output_ptr];
243 for (n = 0; n < outcnt; n++) {
244 ch = *out++ = *in++;
245 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
246 }
247 crc = c;
248 bytes_out += (ulg)outcnt;
249 output_ptr += (ulg)outcnt;
250 outcnt = 0;
251}
252
253static void flush_window_high(void)
254{
255 ulg c = crc; /* temporary variable */
256 unsigned n;
257 uch *in, ch;
258 in = window;
259 for (n = 0; n < outcnt; n++) {
260 ch = *output_data++ = *in++;
261 if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
262 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
263 }
264 crc = c;
265 bytes_out += (ulg)outcnt;
266 outcnt = 0;
267}
268
269static void flush_window(void) 310static void flush_window(void)
270{ 311{
271 if (high_loaded) flush_window_high(); 312 /* With my window equal to my output buffer
272 else flush_window_low(); 313 * I only need to compute the crc here.
314 */
315 ulg c = crc; /* temporary variable */
316 unsigned n;
317 uch *in, ch;
318
319 in = window;
320 for (n = 0; n < outcnt; n++) {
321 ch = *in++;
322 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
323 }
324 crc = c;
325 bytes_out += (ulg)outcnt;
326 outcnt = 0;
273} 327}
274 328
275static void error(char *x) 329static void error(char *x)
@@ -281,57 +335,8 @@ static void error(char *x)
281 while(1); /* Halt */ 335 while(1); /* Halt */
282} 336}
283 337
284static void setup_normal_output_buffer(void) 338asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
285{ 339 uch *input_data, unsigned long input_len, uch *output)
286#ifdef STANDARD_MEMORY_BIOS_CALL
287 if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
288#else
289 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
290#endif
291 output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
292 free_mem_end_ptr = (long)real_mode;
293}
294
295struct moveparams {
296 uch *low_buffer_start; int lcount;
297 uch *high_buffer_start; int hcount;
298};
299
300static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
301{
302 high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
303#ifdef STANDARD_MEMORY_BIOS_CALL
304 if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
305#else
306 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
307#endif
308 mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
309 low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
310 ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
311 low_buffer_size = low_buffer_end - LOW_BUFFER_START;
312 high_loaded = 1;
313 free_mem_end_ptr = (long)high_buffer_start;
314 if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
315 high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
316 mv->hcount = 0; /* say: we need not to move high_buffer */
317 }
318 else mv->hcount = -1;
319 mv->high_buffer_start = high_buffer_start;
320}
321
322static void close_output_buffer_if_we_run_high(struct moveparams *mv)
323{
324 if (bytes_out > low_buffer_size) {
325 mv->lcount = low_buffer_size;
326 if (mv->hcount)
327 mv->hcount = bytes_out - low_buffer_size;
328 } else {
329 mv->lcount = bytes_out;
330 mv->hcount = 0;
331 }
332}
333
334int decompress_kernel(struct moveparams *mv, void *rmode)
335{ 340{
336 real_mode = rmode; 341 real_mode = rmode;
337 342
@@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode)
346 lines = RM_SCREEN_INFO.orig_video_lines; 351 lines = RM_SCREEN_INFO.orig_video_lines;
347 cols = RM_SCREEN_INFO.orig_video_cols; 352 cols = RM_SCREEN_INFO.orig_video_cols;
348 353
349 if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); 354 window = output; /* Output buffer (Normally at 1M) */
350 else setup_output_buffer_if_we_run_high(mv); 355 free_mem_ptr = heap; /* Heap */
356 free_mem_end_ptr = heap + HEAP_SIZE;
357 inbuf = input_data; /* Input buffer */
358 insize = input_len;
359 inptr = 0;
360
361 if ((ulg)output & (__KERNEL_ALIGN - 1))
362 error("Destination address not 2M aligned");
363 if ((ulg)output >= 0xffffffffffUL)
364 error("Destination address too large");
351 365
352 makecrc(); 366 makecrc();
353 putstr(".\nDecompressing Linux..."); 367 putstr(".\nDecompressing Linux...");
354 gunzip(); 368 gunzip();
355 putstr("done.\nBooting the kernel.\n"); 369 putstr("done.\nBooting the kernel.\n");
356 if (high_loaded) close_output_buffer_if_we_run_high(mv); 370 return;
357 return high_loaded;
358} 371}
diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds
new file mode 100644
index 00000000000..94c13e557fb
--- /dev/null
+++ b/arch/x86_64/boot/compressed/vmlinux.lds
@@ -0,0 +1,44 @@
1OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
2OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64)
4SECTIONS
5{
6 /* Be careful parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0;
10 .text : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 *(.text.compressed)
15 _text = .; /* Text */
16 *(.text)
17 *(.text.*)
18 _etext = . ;
19 }
20 .rodata : {
21 _rodata = . ;
22 *(.rodata) /* read-only data */
23 *(.rodata.*)
24 _erodata = . ;
25 }
26 .data : {
27 _data = . ;
28 *(.data)
29 *(.data.*)
30 _edata = . ;
31 }
32 .bss : {
33 _bss = . ;
34 *(.bss)
35 *(.bss.*)
36 *(COMMON)
37 . = ALIGN(8);
38 _end = . ;
39 . = ALIGN(4096);
40 pgtable = . ;
41 . = . + 4096 * 6;
42 _heap = .;
43 }
44}
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr
index 1ed9d791f86..bd1429ce193 100644
--- a/arch/x86_64/boot/compressed/vmlinux.scr
+++ b/arch/x86_64/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
1SECTIONS 1SECTIONS
2{ 2{
3 .data : { 3 .text.compressed : {
4 input_len = .; 4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .; 5 LONG(input_data_end - input_data) input_data = .;
6 *(.data) 6 *(.data)
7 input_data_end = .; 7 output_len = . - 4;
8 input_data_end = .;
8 } 9 }
9} 10}
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
index 770940cc010..e9e33f94969 100644
--- a/arch/x86_64/boot/setup.S
+++ b/arch/x86_64/boot/setup.S
@@ -51,6 +51,7 @@
51#include <asm/boot.h> 51#include <asm/boot.h>
52#include <asm/e820.h> 52#include <asm/e820.h>
53#include <asm/page.h> 53#include <asm/page.h>
54#include <asm/setup.h>
54 55
55/* Signature words to ensure LILO loaded us right */ 56/* Signature words to ensure LILO loaded us right */
56#define SIG1 0xAA55 57#define SIG1 0xAA55
@@ -80,7 +81,7 @@ start:
80# This is the setup header, and it must start at %cs:2 (old 0x9020:2) 81# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
81 82
82 .ascii "HdrS" # header signature 83 .ascii "HdrS" # header signature
83 .word 0x0204 # header version number (>= 0x0105) 84 .word 0x0206 # header version number (>= 0x0105)
84 # or else old loadlin-1.5 will fail) 85 # or else old loadlin-1.5 will fail)
85realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
86start_sys_seg: .word SYSSEG 87start_sys_seg: .word SYSSEG
@@ -155,7 +156,20 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
155 # low memory 0x10000 or higher. 156 # low memory 0x10000 or higher.
156 157
157ramdisk_max: .long 0xffffffff 158ramdisk_max: .long 0xffffffff
158 159kernel_alignment: .long 0x200000 # physical addr alignment required for
160 # protected mode relocatable kernel
161#ifdef CONFIG_RELOCATABLE
162relocatable_kernel: .byte 1
163#else
164relocatable_kernel: .byte 0
165#endif
166pad2: .byte 0
167pad3: .word 0
168
169cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
170 #added with boot protocol
171 #version 2.06
172
159trampoline: call start_of_setup 173trampoline: call start_of_setup
160 .align 16 174 .align 16
161 # The offset at this point is 0x240 175 # The offset at this point is 0x240
@@ -290,64 +304,10 @@ loader_ok:
290 movw %cs,%ax 304 movw %cs,%ax
291 movw %ax,%ds 305 movw %ax,%ds
292 306
293 /* minimum CPUID flags for x86-64 */ 307 call verify_cpu
294 /* see http://www.x86-64.org/lists/discuss/msg02971.html */ 308 testl %eax,%eax
295#define SSE_MASK ((1<<25)|(1<<26)) 309 jz sse_ok
296#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ 310
297 (1<<13)|(1<<15)|(1<<24))
298#define REQUIRED_MASK2 (1<<29)
299
300 pushfl /* standard way to check for cpuid */
301 popl %eax
302 movl %eax,%ebx
303 xorl $0x200000,%eax
304 pushl %eax
305 popfl
306 pushfl
307 popl %eax
308 cmpl %eax,%ebx
309 jz no_longmode /* cpu has no cpuid */
310 movl $0x0,%eax
311 cpuid
312 cmpl $0x1,%eax
313 jb no_longmode /* no cpuid 1 */
314 xor %di,%di
315 cmpl $0x68747541,%ebx /* AuthenticAMD */
316 jnz noamd
317 cmpl $0x69746e65,%edx
318 jnz noamd
319 cmpl $0x444d4163,%ecx
320 jnz noamd
321 mov $1,%di /* cpu is from AMD */
322noamd:
323 movl $0x1,%eax
324 cpuid
325 andl $REQUIRED_MASK1,%edx
326 xorl $REQUIRED_MASK1,%edx
327 jnz no_longmode
328 movl $0x80000000,%eax
329 cpuid
330 cmpl $0x80000001,%eax
331 jb no_longmode /* no extended cpuid */
332 movl $0x80000001,%eax
333 cpuid
334 andl $REQUIRED_MASK2,%edx
335 xorl $REQUIRED_MASK2,%edx
336 jnz no_longmode
337sse_test:
338 movl $1,%eax
339 cpuid
340 andl $SSE_MASK,%edx
341 cmpl $SSE_MASK,%edx
342 je sse_ok
343 test %di,%di
344 jz no_longmode /* only try to force SSE on AMD */
345 movl $0xc0010015,%ecx /* HWCR */
346 rdmsr
347 btr $15,%eax /* enable SSE */
348 wrmsr
349 xor %di,%di /* don't loop */
350 jmp sse_test /* try again */
351no_longmode: 311no_longmode:
352 call beep 312 call beep
353 lea long_mode_panic,%si 313 lea long_mode_panic,%si
@@ -357,7 +317,8 @@ no_longmode_loop:
357long_mode_panic: 317long_mode_panic:
358 .string "Your CPU does not support long mode. Use a 32bit distribution." 318 .string "Your CPU does not support long mode. Use a 32bit distribution."
359 .byte 0 319 .byte 0
360 320
321#include "../kernel/verify_cpu.S"
361sse_ok: 322sse_ok:
362 popw %ds 323 popw %ds
363 324
@@ -846,7 +807,7 @@ gdt_48:
846 807
847# Include video setup & detection code 808# Include video setup & detection code
848 809
849#include "video.S" 810#include "../../i386/boot/video.S"
850 811
851# Setup signature -- must be last 812# Setup signature -- must be last
852setup_sig1: .word SIG1 813setup_sig1: .word SIG1
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
deleted file mode 100644
index 6090516c9c7..00000000000
--- a/arch/x86_64/boot/video.S
+++ /dev/null
@@ -1,2043 +0,0 @@
1/* video.S
2 *
3 * Display adapter & video mode setup, version 2.13 (14-May-99)
4 *
5 * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz>
6 * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson
7 *
8 * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999
9 *
10 * For further information, look at Documentation/svga.txt.
11 *
12 */
13
14/* Enable autodetection of SVGA adapters and modes. */
15#undef CONFIG_VIDEO_SVGA
16
17/* Enable autodetection of VESA modes */
18#define CONFIG_VIDEO_VESA
19
20/* Enable compacting of mode table */
21#define CONFIG_VIDEO_COMPACT
22
23/* Retain screen contents when switching modes */
24#define CONFIG_VIDEO_RETAIN
25
26/* Enable local mode list */
27#undef CONFIG_VIDEO_LOCAL
28
29/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */
30#undef CONFIG_VIDEO_400_HACK
31
32/* Hack that lets you force specific BIOS mode ID and specific dimensions */
33#undef CONFIG_VIDEO_GFX_HACK
34#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */
35#define VIDEO_GFX_BIOS_BX 0x0102
36#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */
37
38/* This code uses an extended set of video mode numbers. These include:
39 * Aliases for standard modes
40 * NORMAL_VGA (-1)
41 * EXTENDED_VGA (-2)
42 * ASK_VGA (-3)
43 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
44 * of compatibility when extending the table. These are between 0x00 and 0xff.
45 */
46#define VIDEO_FIRST_MENU 0x0000
47
48/* Standard BIOS video modes (BIOS number + 0x0100) */
49#define VIDEO_FIRST_BIOS 0x0100
50
51/* VESA BIOS video modes (VESA number + 0x0200) */
52#define VIDEO_FIRST_VESA 0x0200
53
54/* Video7 special modes (BIOS number + 0x0900) */
55#define VIDEO_FIRST_V7 0x0900
56
57/* Special video modes */
58#define VIDEO_FIRST_SPECIAL 0x0f00
59#define VIDEO_80x25 0x0f00
60#define VIDEO_8POINT 0x0f01
61#define VIDEO_80x43 0x0f02
62#define VIDEO_80x28 0x0f03
63#define VIDEO_CURRENT_MODE 0x0f04
64#define VIDEO_80x30 0x0f05
65#define VIDEO_80x34 0x0f06
66#define VIDEO_80x60 0x0f07
67#define VIDEO_GFX_HACK 0x0f08
68#define VIDEO_LAST_SPECIAL 0x0f09
69
70/* Video modes given by resolution */
71#define VIDEO_FIRST_RESOLUTION 0x1000
72
73/* The "recalculate timings" flag */
74#define VIDEO_RECALC 0x8000
75
76/* Positions of various video parameters passed to the kernel */
77/* (see also include/linux/tty.h) */
78#define PARAM_CURSOR_POS 0x00
79#define PARAM_VIDEO_PAGE 0x04
80#define PARAM_VIDEO_MODE 0x06
81#define PARAM_VIDEO_COLS 0x07
82#define PARAM_VIDEO_EGA_BX 0x0a
83#define PARAM_VIDEO_LINES 0x0e
84#define PARAM_HAVE_VGA 0x0f
85#define PARAM_FONT_POINTS 0x10
86
87#define PARAM_LFB_WIDTH 0x12
88#define PARAM_LFB_HEIGHT 0x14
89#define PARAM_LFB_DEPTH 0x16
90#define PARAM_LFB_BASE 0x18
91#define PARAM_LFB_SIZE 0x1c
92#define PARAM_LFB_LINELENGTH 0x24
93#define PARAM_LFB_COLORS 0x26
94#define PARAM_VESAPM_SEG 0x2e
95#define PARAM_VESAPM_OFF 0x30
96#define PARAM_LFB_PAGES 0x32
97#define PARAM_VESA_ATTRIB 0x34
98#define PARAM_CAPABILITIES 0x36
99
100/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
101#ifdef CONFIG_VIDEO_RETAIN
102#define DO_STORE call store_screen
103#else
104#define DO_STORE
105#endif /* CONFIG_VIDEO_RETAIN */
106
107# This is the main entry point called by setup.S
108# %ds *must* be pointing to the bootsector
109video: pushw %ds # We use different segments
110 pushw %ds # FS contains original DS
111 popw %fs
112 pushw %cs # DS is equal to CS
113 popw %ds
114 pushw %cs # ES is equal to CS
115 popw %es
116 xorw %ax, %ax
117 movw %ax, %gs # GS is zero
118 cld
119 call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA)
120#ifdef CONFIG_VIDEO_SELECT
121 movw %fs:(0x01fa), %ax # User selected video mode
122 cmpw $ASK_VGA, %ax # Bring up the menu
123 jz vid2
124
125 call mode_set # Set the mode
126 jc vid1
127
128 leaw badmdt, %si # Invalid mode ID
129 call prtstr
130vid2: call mode_menu
131vid1:
132#ifdef CONFIG_VIDEO_RETAIN
133 call restore_screen # Restore screen contents
134#endif /* CONFIG_VIDEO_RETAIN */
135 call store_edid
136#endif /* CONFIG_VIDEO_SELECT */
137 call mode_params # Store mode parameters
138 popw %ds # Restore original DS
139 ret
140
141# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel.
142basic_detect:
143 movb $0, %fs:(PARAM_HAVE_VGA)
144 movb $0x12, %ah # Check EGA/VGA
145 movb $0x10, %bl
146 int $0x10
147 movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel
148 cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card.
149 je basret
150
151 incb adapter
152 movw $0x1a00, %ax # Check EGA or VGA?
153 int $0x10
154 cmpb $0x1a, %al # 1a means VGA...
155 jne basret # anything else is EGA.
156
157 incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA
158 incb adapter
159basret: ret
160
161# Store the video mode parameters for later usage by the kernel.
162# This is done by asking the BIOS except for the rows/columns
163# parameters in the default 80x25 mode -- these are set directly,
164# because some very obscure BIOSes supply insane values.
165mode_params:
166#ifdef CONFIG_VIDEO_SELECT
167 cmpb $0, graphic_mode
168 jnz mopar_gr
169#endif
170 movb $0x03, %ah # Read cursor position
171 xorb %bh, %bh
172 int $0x10
173 movw %dx, %fs:(PARAM_CURSOR_POS)
174 movb $0x0f, %ah # Read page/mode/width
175 int $0x10
176 movw %bx, %fs:(PARAM_VIDEO_PAGE)
177 movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width
178 cmpb $0x7, %al # MDA/HGA => segment differs
179 jnz mopar0
180
181 movw $0xb000, video_segment
182mopar0: movw %gs:(0x485), %ax # Font size
183 movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA)
184 movw force_size, %ax # Forced size?
185 orw %ax, %ax
186 jz mopar1
187
188 movb %ah, %fs:(PARAM_VIDEO_COLS)
189 movb %al, %fs:(PARAM_VIDEO_LINES)
190 ret
191
192mopar1: movb $25, %al
193 cmpb $0, adapter # If we are on CGA/MDA/HGA, the
194 jz mopar2 # screen must have 25 lines.
195
196 movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS
197 incb %al # location of max lines.
198mopar2: movb %al, %fs:(PARAM_VIDEO_LINES)
199 ret
200
201#ifdef CONFIG_VIDEO_SELECT
202# Fetching of VESA frame buffer parameters
203mopar_gr:
204 leaw modelist+1024, %di
205 movb $0x23, %fs:(PARAM_HAVE_VGA)
206 movw 16(%di), %ax
207 movw %ax, %fs:(PARAM_LFB_LINELENGTH)
208 movw 18(%di), %ax
209 movw %ax, %fs:(PARAM_LFB_WIDTH)
210 movw 20(%di), %ax
211 movw %ax, %fs:(PARAM_LFB_HEIGHT)
212 movb 25(%di), %al
213 movb $0, %ah
214 movw %ax, %fs:(PARAM_LFB_DEPTH)
215 movb 29(%di), %al
216 movb $0, %ah
217 movw %ax, %fs:(PARAM_LFB_PAGES)
218 movl 40(%di), %eax
219 movl %eax, %fs:(PARAM_LFB_BASE)
220 movl 31(%di), %eax
221 movl %eax, %fs:(PARAM_LFB_COLORS)
222 movl 35(%di), %eax
223 movl %eax, %fs:(PARAM_LFB_COLORS+4)
224 movw 0(%di), %ax
225 movw %ax, %fs:(PARAM_VESA_ATTRIB)
226
227# get video mem size
228 leaw modelist+1024, %di
229 movw $0x4f00, %ax
230 int $0x10
231 xorl %eax, %eax
232 movw 18(%di), %ax
233 movl %eax, %fs:(PARAM_LFB_SIZE)
234
235# store mode capabilities
236 movl 10(%di), %eax
237 movl %eax, %fs:(PARAM_CAPABILITIES)
238
239# switching the DAC to 8-bit is for <= 8 bpp only
240 movw %fs:(PARAM_LFB_DEPTH), %ax
241 cmpw $8, %ax
242 jg dac_done
243
244# get DAC switching capability
245 xorl %eax, %eax
246 movb 10(%di), %al
247 testb $1, %al
248 jz dac_set
249
250# attempt to switch DAC to 8-bit
251 movw $0x4f08, %ax
252 movw $0x0800, %bx
253 int $0x10
254 cmpw $0x004f, %ax
255 jne dac_set
256 movb %bh, dac_size # store actual DAC size
257
258dac_set:
259# set color size to DAC size
260 movb dac_size, %al
261 movb %al, %fs:(PARAM_LFB_COLORS+0)
262 movb %al, %fs:(PARAM_LFB_COLORS+2)
263 movb %al, %fs:(PARAM_LFB_COLORS+4)
264 movb %al, %fs:(PARAM_LFB_COLORS+6)
265
266# set color offsets to 0
267 movb $0, %fs:(PARAM_LFB_COLORS+1)
268 movb $0, %fs:(PARAM_LFB_COLORS+3)
269 movb $0, %fs:(PARAM_LFB_COLORS+5)
270 movb $0, %fs:(PARAM_LFB_COLORS+7)
271
272dac_done:
273# get protected mode interface informations
274 movw $0x4f0a, %ax
275 xorw %bx, %bx
276 xorw %di, %di
277 int $0x10
278 cmp $0x004f, %ax
279 jnz no_pm
280
281 movw %es, %fs:(PARAM_VESAPM_SEG)
282 movw %di, %fs:(PARAM_VESAPM_OFF)
283no_pm: ret
284
285# The video mode menu
286mode_menu:
287 leaw keymsg, %si # "Return/Space/Timeout" message
288 call prtstr
289 call flush
290nokey: call getkt
291
292 cmpb $0x0d, %al # ENTER ?
293 je listm # yes - manual mode selection
294
295 cmpb $0x20, %al # SPACE ?
296 je defmd1 # no - repeat
297
298 call beep
299 jmp nokey
300
301defmd1: ret # No mode chosen? Default 80x25
302
303listm: call mode_table # List mode table
304listm0: leaw name_bann, %si # Print adapter name
305 call prtstr
306 movw card_name, %si
307 orw %si, %si
308 jnz an2
309
310 movb adapter, %al
311 leaw old_name, %si
312 orb %al, %al
313 jz an1
314
315 leaw ega_name, %si
316 decb %al
317 jz an1
318
319 leaw vga_name, %si
320 jmp an1
321
322an2: call prtstr
323 leaw svga_name, %si
324an1: call prtstr
325 leaw listhdr, %si # Table header
326 call prtstr
327 movb $0x30, %dl # DL holds mode number
328 leaw modelist, %si
329lm1: cmpw $ASK_VGA, (%si) # End?
330 jz lm2
331
332 movb %dl, %al # Menu selection number
333 call prtchr
334 call prtsp2
335 lodsw
336 call prthw # Mode ID
337 call prtsp2
338 movb 0x1(%si), %al
339 call prtdec # Rows
340 movb $0x78, %al # the letter 'x'
341 call prtchr
342 lodsw
343 call prtdec # Columns
344 movb $0x0d, %al # New line
345 call prtchr
346 movb $0x0a, %al
347 call prtchr
348 incb %dl # Next character
349 cmpb $0x3a, %dl
350 jnz lm1
351
352 movb $0x61, %dl
353 jmp lm1
354
355lm2: leaw prompt, %si # Mode prompt
356 call prtstr
357 leaw edit_buf, %di # Editor buffer
358lm3: call getkey
359 cmpb $0x0d, %al # Enter?
360 jz lment
361
362 cmpb $0x08, %al # Backspace?
363 jz lmbs
364
365 cmpb $0x20, %al # Printable?
366 jc lm3
367
368 cmpw $edit_buf+4, %di # Enough space?
369 jz lm3
370
371 stosb
372 call prtchr
373 jmp lm3
374
375lmbs: cmpw $edit_buf, %di # Backspace
376 jz lm3
377
378 decw %di
379 movb $0x08, %al
380 call prtchr
381 call prtspc
382 movb $0x08, %al
383 call prtchr
384 jmp lm3
385
386lment: movb $0, (%di)
387 leaw crlft, %si
388 call prtstr
389 leaw edit_buf, %si
390 cmpb $0, (%si) # Empty string = default mode
391 jz lmdef
392
393 cmpb $0, 1(%si) # One character = menu selection
394 jz mnusel
395
396 cmpw $0x6373, (%si) # "scan" => mode scanning
397 jnz lmhx
398
399 cmpw $0x6e61, 2(%si)
400 jz lmscan
401
402lmhx: xorw %bx, %bx # Else => mode ID in hex
403lmhex: lodsb
404 orb %al, %al
405 jz lmuse1
406
407 subb $0x30, %al
408 jc lmbad
409
410 cmpb $10, %al
411 jc lmhx1
412
413 subb $7, %al
414 andb $0xdf, %al
415 cmpb $10, %al
416 jc lmbad
417
418 cmpb $16, %al
419 jnc lmbad
420
421lmhx1: shlw $4, %bx
422 orb %al, %bl
423 jmp lmhex
424
425lmuse1: movw %bx, %ax
426 jmp lmuse
427
428mnusel: lodsb # Menu selection
429 xorb %ah, %ah
430 subb $0x30, %al
431 jc lmbad
432
433 cmpb $10, %al
434 jc lmuse
435
436 cmpb $0x61-0x30, %al
437 jc lmbad
438
439 subb $0x61-0x30-10, %al
440 cmpb $36, %al
441 jnc lmbad
442
443lmuse: call mode_set
444 jc lmdef
445
446lmbad: leaw unknt, %si
447 call prtstr
448 jmp lm2
449lmscan: cmpb $0, adapter # Scanning only on EGA/VGA
450 jz lmbad
451
452 movw $0, mt_end # Scanning of modes is
453 movb $1, scanning # done as new autodetection.
454 call mode_table
455 jmp listm0
456lmdef: ret
457
458# Additional parts of mode_set... (relative jumps, you know)
459setv7: # Video7 extended modes
460 DO_STORE
461 subb $VIDEO_FIRST_V7>>8, %bh
462 movw $0x6f05, %ax
463 int $0x10
464 stc
465 ret
466
467_setrec: jmp setrec # Ugly...
468_set_80x25: jmp set_80x25
469
470# Aliases for backward compatibility.
471setalias:
472 movw $VIDEO_80x25, %ax
473 incw %bx
474 jz mode_set
475
476 movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al
477 incw %bx
478 jnz setbad # Fall-through!
479
480# Setting of user mode (AX=mode ID) => CF=success
481mode_set:
482 movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S
483 movw %ax, %bx
484 cmpb $0xff, %ah
485 jz setalias
486
487 testb $VIDEO_RECALC>>8, %ah
488 jnz _setrec
489
490 cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
491 jnc setres
492
493 cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
494 jz setspc
495
496 cmpb $VIDEO_FIRST_V7>>8, %ah
497 jz setv7
498
499 cmpb $VIDEO_FIRST_VESA>>8, %ah
500 jnc check_vesa
501
502 orb %ah, %ah
503 jz setmenu
504
505 decb %ah
506 jz setbios
507
508setbad: clc
509 movb $0, do_restore # The screen needn't be restored
510 ret
511
512setvesa:
513 DO_STORE
514 subb $VIDEO_FIRST_VESA>>8, %bh
515 movw $0x4f02, %ax # VESA BIOS mode set call
516 int $0x10
517 cmpw $0x004f, %ax # AL=4f if implemented
518 jnz setbad # AH=0 if OK
519
520 stc
521 ret
522
523setbios:
524 DO_STORE
525 int $0x10 # Standard BIOS mode set call
526 pushw %bx
527 movb $0x0f, %ah # Check if really set
528 int $0x10
529 popw %bx
530 cmpb %bl, %al
531 jnz setbad
532
533 stc
534 ret
535
536setspc: xorb %bh, %bh # Set special mode
537 cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl
538 jnc setbad
539
540 addw %bx, %bx
541 jmp *spec_inits(%bx)
542
543setmenu:
544 orb %al, %al # 80x25 is an exception
545 jz _set_80x25
546
547 pushw %bx # Set mode chosen from menu
548 call mode_table # Build the mode table
549 popw %ax
550 shlw $2, %ax
551 addw %ax, %si
552 cmpw %di, %si
553 jnc setbad
554
555 movw (%si), %ax # Fetch mode ID
556_m_s: jmp mode_set
557
558setres: pushw %bx # Set mode chosen by resolution
559 call mode_table
560 popw %bx
561 xchgb %bl, %bh
562setr1: lodsw
563 cmpw $ASK_VGA, %ax # End of the list?
564 jz setbad
565
566 lodsw
567 cmpw %bx, %ax
568 jnz setr1
569
570 movw -4(%si), %ax # Fetch mode ID
571 jmp _m_s
572
# Validate a VESA mode and set it. BX carries the mode ID including the
# VIDEO_FIRST_VESA prefix. The mode information block is fetched into
# the scratch buffer at modelist+1024 and its first byte decides the
# path taken:
#   (byte & 0x19) == 0x09  -> text mode, handed over to setvesa
#   (byte & 0x99) == 0x99  -> set here with the 0x4000 flag in BX
#   anything else          -> rejected through _setbad
# The second path returns with carry set, graphic_mode=1 and
# do_restore=0.
573check_vesa:
574#ifdef CONFIG_FIRMWARE_EDID
575 leaw modelist+1024, %di
576 movw $0x4f00, %ax # VESA Get card info call
577 int $0x10
578 cmpw $0x004f, %ax
579 jnz setbad
580
581 movw 4(%di), %ax # Word at offset 4 of the reply is kept
582 movw %ax, vbe_version # as vbe_version (tested in store_edid)
583#endif
584 leaw modelist+1024, %di
585 subb $VIDEO_FIRST_VESA>>8, %bh
586 movw %bx, %cx # Get mode information structure
587 movw $0x4f01, %ax
588 int $0x10
589 addb $VIDEO_FIRST_VESA>>8, %bh # Put the prefix back (setvesa strips it again)
590 cmpw $0x004f, %ax
591 jnz setbad
592
593 movb (%di), %al # Check capabilities.
594 andb $0x19, %al
595 cmpb $0x09, %al
596 jz setvesa # This is a text mode
597
598 movb (%di), %al # Check capabilities.
599 andb $0x99, %al
600 cmpb $0x99, %al
601 jnz _setbad # Doh! No linear frame buffer.
602
603 subb $VIDEO_FIRST_VESA>>8, %bh
604 orw $0x4000, %bx # Use linear frame buffer
605 movw $0x4f02, %ax # VESA BIOS mode set call
606 int $0x10
607 cmpw $0x004f, %ax # AL=4f if implemented
608 jnz _setbad # AH=0 if OK
609
610 movb $1, graphic_mode # flag graphic mode
611 movb $0, do_restore # no screen restore
612 stc
613 ret
614
615_setbad: jmp setbad # Ugly...
616
617# Recalculate vertical display end registers -- this fixes various
618# inconsistencies of extended modes on many adapters. Called when
619# the VIDEO_RECALC flag is set in the mode ID.
# The base mode (flag removed from AH) is set first through mode_set;
# when that comes back with carry clear nothing else is done. Otherwise
# the value (font size * (rows + 1)) - 1 is computed from the bytes at
# GS:0x485 and GS:0x484 and written to the CRTC at port 0x3d4: the low
# 8 bits to index 0x12, bits 8 and 9 to bits 1 and 6 of index 0x07.
620
621setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode
622 call mode_set
623 jnc rct3
624
625 movw %gs:(0x485), %ax # Font size in pixels
626 movb %gs:(0x484), %bl # Number of rows
627 incb %bl
628 mulb %bl # Number of visible
629 decw %ax # scan lines - 1
630 movw $0x3d4, %dx
631 movw %ax, %bx
632 movb $0x12, %al # Lower 8 bits
633 movb %bl, %ah
634 outw %ax, %dx
635 movb $0x07, %al # Bits 8 and 9 in the overflow register
636 call inidx
637 xchgb %al, %ah
638 andb $0xbd, %ah # Clear bits 1 and 6 first
639 shrb %bh # Bit 8 of the value -> carry
640 jnc rct1
641 orb $0x02, %ah
642rct1: shrb %bh # Bit 9 of the value -> carry
643 jnc rct2
644 orb $0x40, %ah
645rct2: movb $0x07, %al
646 outw %ax, %dx
647 stc
648rct3: ret
649
650# Table of routines for setting of the special modes.
651spec_inits:
652 .word set_80x25
653 .word set_8pixel
654 .word set_80x43
655 .word set_80x28
656 .word set_current
657 .word set_80x30
658 .word set_80x34
659 .word set_80x60
660 .word set_gfx
661
662# Set the 80x25 mode. If already set, do nothing.
# Entry set_80x25 also stores 0x5019 (80 columns, 25 rows) in force_size;
# entry use_80x25 skips that and is used by other handlers as their
# base mode. Without CONFIG_VIDEO_400_HACK the mode is left alone when
# the BIOS reports mode 3 or 7 with 80 columns and either the adapter
# byte is 0 or the row byte at GS:0x484 is 0 or 24. In every other case
# mode 3 is set again. Always returns with carry set.
663set_80x25:
664 movw $0x5019, force_size # Override possibly broken BIOS
665use_80x25:
666#ifdef CONFIG_VIDEO_400_HACK
667 movw $0x1202, %ax # Force 400 scan lines
668 movb $0x30, %bl
669 int $0x10
670#else
671 movb $0x0f, %ah # Get current mode ID
672 int $0x10
673 cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available
674 jz st80 # on CGA/MDA/HGA and is also available on EGAM
675
676 cmpw $0x5003, %ax # Unknown mode, force 80x25 color
677 jnz force3
678
679st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25
680 jz set80
681
682 movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc.
683 orb %al, %al # Some buggy BIOS'es set 0 rows
684 jz set80
685
686 cmpb $24, %al # It's hopefully correct
687 jz set80
688#endif /* CONFIG_VIDEO_400_HACK */
689force3: DO_STORE
690 movw $0x0003, %ax # Forced set
691 int $0x10
692set80: stc
693 ret
694
695# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls.
# Entry set_8pt is shared with set_80x43 and set_80x60: it only switches
# the font and cursor on top of whatever base mode is active. Entry
# set_current does nothing except report success (carry set).
696set_8pixel:
697 DO_STORE
698 call use_80x25 # The base is 80x25
699set_8pt:
700 movw $0x1112, %ax # Use 8x8 font
701 xorb %bl, %bl
702 int $0x10
703 movw $0x1200, %ax # Use alternate print screen
704 movb $0x20, %bl
705 int $0x10
706 movw $0x1201, %ax # Turn off cursor emulation
707 movb $0x34, %bl
708 int $0x10
709 movb $0x01, %ah # Define cursor scan lines 6-7
710 movw $0x0607, %cx
711 int $0x10
712set_current:
713 stc
714 ret
715
716# Set the 80x28 mode. This mode works on all VGA's, because it's a standard
717# 80x25 mode with 14-point fonts instead of 16-point.
# Entry set14 (font and cursor only) is reused by set_80x34. Returns
# with carry set.
718set_80x28:
719 DO_STORE
720 call use_80x25 # The base is 80x25
721set14: movw $0x1111, %ax # Use 9x14 font
722 xorb %bl, %bl
723 int $0x10
724 movb $0x01, %ah # Define cursor scan lines 11-12
725 movw $0x0b0c, %cx
726 int $0x10
727 stc
728 ret
729
730# Set the 80x43 mode. This mode works on all VGA's.
731# It's a 350-scanline mode with 8-pixel font.
# The routine finishes inside set_8pt, which returns with carry set.
732set_80x43:
733 DO_STORE
734 movw $0x1201, %ax # Set 350 scans
735 movb $0x30, %bl
736 int $0x10
737 movw $0x0003, %ax # Reset video mode
738 int $0x10
739 jmp set_8pt # Use 8-pixel font
740
741# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font.
# Starts from 80x25 and reprograms the vertical CRTC timing registers
# directly. The CRTC port is chosen from bit 0 of the byte read at
# port 0x3cc: 0x3d4 when set, 0x3b4 when clear. On return DX holds that
# CRTC port again, force_size is 0x501e (80 columns, 30 rows) and carry
# is set. set_80x34 and set_80x60 call this routine first.
742set_80x30:
743 call use_80x25 # Start with real 80x25
744 DO_STORE
745 movw $0x3cc, %dx # Get CRTC port
746 inb %dx, %al
747 movb $0xd4, %dl
748 rorb %al # Mono or color?
749 jc set48a
750
751 movb $0xb4, %dl
752set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7)
753 call outidx
754 movw $0x0b06, %ax # Vertical total
755 call outidx
756 movw $0x3e07, %ax # (Vertical) overflow
757 call outidx
758 movw $0xea10, %ax # Vertical sync start
759 call outidx
760 movw $0xdf12, %ax # Vertical display end
761 call outidx
762 movw $0xe715, %ax # Vertical blank start
763 call outidx
764 movw $0x0416, %ax # Vertical blank end
765 call outidx
766 pushw %dx # Keep the CRTC port for the caller
767 movb $0xcc, %dl # Misc output register (read)
768 inb %dx, %al
769 movb $0xc2, %dl # (write)
770 andb $0x0d, %al # Preserve clock select bits and color bit
771 orb $0xe2, %al # Set correct sync polarity
772 outb %al, %dx
773 popw %dx
774 movw $0x501e, force_size
775 stc # That's all.
776 ret
777
778# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font.
# Entry setvde is the shared tail used by set_80x60 as well: it writes
# the register/value pair in AX through outidx and returns with carry
# set. DX is not loaded here -- presumably it still holds the CRTC port
# left by set_80x30 (confirm that the BIOS font calls preserve DX).
779set_80x34:
780 call set_80x30 # Set 480 scans
781 call set14 # And 14-pt font
782 movw $0xdb12, %ax # VGA vertical display end
783 movw $0x5022, force_size # 80 columns, 34 rows
784setvde: call outidx
785 stc
786 ret
787
788# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font.
# Finishes in setvde, which writes AX through outidx and sets carry.
789set_80x60:
790 call set_80x30 # Set 480 scans
791 call set_8pt # And 8-pt font
792 movw $0xdf12, %ax # VGA vertical display end
793 movw $0x503c, force_size # 80 columns, 60 rows
794 jmp setvde
795
796# Special hack for ThinkPad graphics
# With CONFIG_VIDEO_GFX_HACK the configured AX/BX pair is passed to
# int 0x10, force_size is set to the dummy resolution and carry is set.
# NOTE(review): without the option the routine is a bare ret and does
# not touch the carry flag, unlike the other handlers in spec_inits --
# confirm what callers see in that configuration.
797set_gfx:
798#ifdef CONFIG_VIDEO_GFX_HACK
799 movw $VIDEO_GFX_BIOS_AX, %ax
800 movw $VIDEO_GFX_BIOS_BX, %bx
801 int $0x10
802 movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size
803 stc
804#endif
805 ret
806
807#ifdef CONFIG_VIDEO_RETAIN
808
809# Store screen contents to temporary buffer.
# Nothing is done when do_restore is already non-zero, when loadflags
# lacks CAN_USE_HEAP, or when the image would not end below
# heap_end_ptr. Layout of the buffer at modelist+1024:
#   word  cursor position (copied from FS:PARAM_CURSOR_POS)
#   word  dimensions (low byte columns, high byte lines)
#   CX words copied from offset 0 of video_segment
# On success do_restore is incremented. AX and BX are preserved.
# The copy goes through stosw/movsw, so it lands in ES -- presumably ES
# addresses the same segment as modelist (confirm against the caller).
810store_screen:
811 cmpb $0, do_restore # Already stored?
812 jnz stsr
813
814 testb $CAN_USE_HEAP, loadflags # Have we space for storing?
815 jz stsr
816
817 pushw %ax
818 pushw %bx
819 pushw force_size # Don't force specific size
820 movw $0, force_size
821 call mode_params # Obtain params of current mode
822 popw force_size
823 movb %fs:(PARAM_VIDEO_LINES), %ah
824 movb %fs:(PARAM_VIDEO_COLS), %al
825 movw %ax, %bx # BX=dimensions
826 mulb %ah
827 movw %ax, %cx # CX=number of characters
828 addw %ax, %ax # Calculate image size
829 addw $modelist+1024+4, %ax # AX=address just past the stored image
830 cmpw heap_end_ptr, %ax
831 jnc sts1 # Unfortunately, out of memory
832
833 movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params
834 leaw modelist+1024, %di
835 stosw
836 movw %bx, %ax
837 stosw
838 pushw %ds # Store the screen
839 movw video_segment, %ds
840 xorw %si, %si
841 rep
842 movsw
843 popw %ds
844 incb do_restore # Screen will be restored later
845sts1: popw %bx
846 popw %ax
847stsr: ret
848
849# Restore screen contents from temporary buffer.
# Does nothing when do_restore is 0. Otherwise the image saved by
# store_screen at modelist+1024 is fitted to the current mode:
#   - the cursor is moved to the saved position, clamped to the last
#     line / last column of the current mode;
#   - when the saved image has at least as many lines as the screen,
#     the first (saved - current) lines are skipped;
#   - when the saved lines are at least as wide as the screen, each one
#     is cut to the screen width.
# The remaining part is copied line by line to offset 0 of
# video_segment. ES is preserved; BP and the general registers are not.
850restore_screen:
851 cmpb $0, do_restore # Has the screen been stored?
852 jz res1
853
854 call mode_params # Get parameters of current mode
855 movb %fs:(PARAM_VIDEO_LINES), %cl
856 movb %fs:(PARAM_VIDEO_COLS), %ch
857 leaw modelist+1024, %si # Screen buffer
858 lodsw # Set cursor position
859 movw %ax, %dx
860 cmpb %cl, %dh # Saved row still on the screen?
861 jc res2
862
863 movb %cl, %dh # No: use the last line
864 decb %dh
865res2: cmpb %ch, %dl # Saved column still on the screen?
866 jc res3
867
868 movb %ch, %dl # No: use the last column
869 decb %dl
870res3: movb $0x02, %ah
871 movb $0x00, %bh
872 int $0x10
873 lodsw # Display size
874 movb %ah, %dl # DL=number of lines
875 movb $0, %ah # BX=phys. length of orig. line
876 movw %ax, %bx
877 cmpb %cl, %dl # Too many?
878 jc res4
879
880 pushw %ax
881 movb %dl, %al
882 subb %cl, %al # AL=number of lines to skip
883 mulb %bl # AX=number of characters to skip
884 addw %ax, %si # Added twice: two bytes per character
885 addw %ax, %si
886 popw %ax
887 movb %cl, %dl
888res4: cmpb %ch, %al # Too wide?
889 jc res5
890
891 movb %ch, %al # AX=width of src. line
892res5: movb $0, %cl
893 xchgb %ch, %cl
894 movw %cx, %bp # BP=width of dest. line
895 pushw %es
896 movw video_segment, %es
897 xorw %di, %di # Move the data
898 addw %bx, %bx # Convert BX and BP to _bytes_
899 addw %bp, %bp
900res6: pushw %si
901 pushw %di
902 movw %ax, %cx # Words to copy for this line
903 rep
904 movsw
905 popw %di
906 popw %si
907 addw %bp, %di # Next destination line
908 addw %bx, %si # Next source line
909 decb %dl
910 jnz res6
911
912 popw %es # Done
913res1: ret
914#endif /* CONFIG_VIDEO_RETAIN */
915
916# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port)
# The index goes to port DX and the data byte to port DX+1. AX and DX
# are unchanged on return.
917outidx: outb %al, %dx
918 pushw %ax
919 movb %ah, %al
920 incw %dx
921 outb %al, %dx
922 decw %dx
923 popw %ax
924 ret
925
926# Build the table of video modes (stored after the setup.S code at the
927# `modelist' label). Each video mode record looks like:
928# .word MODE-ID (our special mode ID (see above))
929# .byte rows (number of rows)
930# .byte columns (number of columns)
931# Returns address of the end of the table in DI, the end is marked
932# with an ASK_VGA ID.
# SI is set to modelist on return. The table is built only once: a
# non-zero mt_end means it already exists, and then DI is simply loaded
# from mt_end. Which entries are added depends on the adapter byte
# (0: 80x25 only, 1: plus the 80x43 EGA mode, other: the vga_modes list
# plus whatever the optional scan/local/VESA/SVGA probes contribute).
933mode_table:
934 movw mt_end, %di # Already filled?
935 orw %di, %di
936 jnz mtab1x
937
938 leaw modelist, %di # Store standard modes:
939 movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL)
940 stosl
941 movb adapter, %al # CGA/MDA/HGA -- no more modes
942 orb %al, %al
943 jz mtabe
944
945 decb %al # adapter == 1 (EGA)?
946 jnz mtabv
947
948 movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode
949 stosl
950 jmp mtabe
951
952mtab1x: jmp mtab1
953
954mtabv: leaw vga_modes, %si # All modes for std VGA
955 movw $vga_modes_end-vga_modes, %cx
956 rep # I'm unable to use movsw as I don't know how to store a half
957 movsb # of the expression above to cx without using explicit shr.
958
959 cmpb $0, scanning # Mode scan requested?
960 jz mscan1
961
962 call mode_scan
963mscan1:
964
965#ifdef CONFIG_VIDEO_LOCAL
966 call local_modes
967#endif /* CONFIG_VIDEO_LOCAL */
968
969#ifdef CONFIG_VIDEO_VESA
970 call vesa_modes # Detect VESA VGA modes
971#endif /* CONFIG_VIDEO_VESA */
972
973#ifdef CONFIG_VIDEO_SVGA
974 cmpb $0, scanning # Bypass when scanning
975 jnz mscan2
976
977 call svga_modes # Detect SVGA cards & modes
978mscan2:
979#endif /* CONFIG_VIDEO_SVGA */
980
981mtabe:
982
983#ifdef CONFIG_VIDEO_COMPACT
# Compact the table in place: an entry is dropped when an earlier entry
# has the same dimension word. SI reads, DI writes, DX marks the old end.
984 leaw modelist, %si
985 movw %di, %dx
986 movw %si, %di
987cmt1: cmpw %dx, %si # Scan all modes
988 jz cmt2
989
990 leaw modelist, %bx # Find in previous entries
991 movw 2(%si), %cx # CX=dimensions of the candidate
992cmt3: cmpw %bx, %si
993 jz cmt4
994
995 cmpw 2(%bx), %cx # Found => don't copy this entry
996 jz cmt5
997
998 addw $4, %bx
999 jmp cmt3
1000
1001cmt4: movsl # Copy entry
1002 jmp cmt1
1003
1004cmt5: addw $4, %si # Skip entry
1005 jmp cmt1
1006
1007cmt2:
1008#endif /* CONFIG_VIDEO_COMPACT */
1009
1010 movw $ASK_VGA, (%di) # End marker
1011 movw %di, mt_end
1012mtab1: leaw modelist, %si # SI=mode list, DI=list end
1013ret0: ret
1014
1015# Modes usable on all standard VGAs
1016vga_modes:
1017 .word VIDEO_8POINT
1018 .word 0x5032 # 80x50
1019 .word VIDEO_80x43
1020 .word 0x502b # 80x43
1021 .word VIDEO_80x28
1022 .word 0x501c # 80x28
1023 .word VIDEO_80x30
1024 .word 0x501e # 80x30
1025 .word VIDEO_80x34
1026 .word 0x5022 # 80x34
1027 .word VIDEO_80x60
1028 .word 0x503c # 80x60
1029#ifdef CONFIG_VIDEO_GFX_HACK
1030 .word VIDEO_GFX_HACK
1031 .word VIDEO_GFX_DUMMY_RESOLUTION
1032#endif
1033
1034vga_modes_end:
1035# Detect VESA modes.
1036
1037#ifdef CONFIG_VIDEO_VESA
# Append the usable VESA text modes to the mode table. DI points at the
# current end of the table on entry and at the new end on return. The
# routine returns at once unless adapter is 2, the card info call
# succeeds and the reply starts with the bytes "VESA". At most 128 list
# entries are examined; running out of iterations or meeting a mode
# number of 0x0800 or above prints vesaer and discards every mode
# appended here. Each accepted mode is stored as a 4-byte record with
# the VIDEO_FIRST_VESA prefix added to its number. GS is preserved.
1038vesa_modes:
1039 cmpb $2, adapter # VGA only
1040 jnz ret0
1041
1042 movw %di, %bp # BP=original mode table end
1043 addw $0x200, %di # Buffer space
1044 movw $0x4f00, %ax # VESA Get card info call
1045 int $0x10
1046 movw %bp, %di
1047 cmpw $0x004f, %ax # Successful?
1048 jnz ret0
1049
1050 cmpw $0x4556, 0x200(%di) # "VE"
1051 jnz ret0
1052
1053 cmpw $0x4153, 0x202(%di) # "SA"
1054 jnz ret0
1055
1056 movw $vesa_name, card_name # Set name to "VESA VGA"
1057 pushw %gs
1058 lgsw 0x20e(%di), %si # GS:SI=mode list
1059 movw $128, %cx # Iteration limit
1060vesa1:
1061# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst.
1062# XXX: lodsw %gs:(%si), %ax # Get next mode in the list
1063 gs; lodsw
1064 cmpw $0xffff, %ax # End of the table?
1065 jz vesar
1066
1067 cmpw $0x0080, %ax # Check validity of mode ID
1068 jc vesa2
1069
1070 orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff
1071 jz vesan # Certain BIOSes report 0x80-0xff!
1072
1073 cmpw $0x0800, %ax
1074 jnc vesae
1075
1076vesa2: pushw %cx
1077 movw %ax, %cx # Get mode information structure
1078 movw $0x4f01, %ax
1079 int $0x10
1080 movw %cx, %bx # BX=mode number
1081 addb $VIDEO_FIRST_VESA>>8, %bh
1082 popw %cx
1083 cmpw $0x004f, %ax
1084 jnz vesan # Don't report errors (buggy BIOSES)
1085
1086 movb (%di), %al # Check capabilities. We require
1087 andb $0x19, %al # a color text mode.
1088 cmpb $0x09, %al
1089 jnz vesan
1090
1091 cmpw $0xb800, 8(%di) # Standard video memory address required
1092 jnz vesan
1093
1094 testb $2, (%di) # Mode characteristics supplied?
1095 movw %bx, (%di) # Store mode number
1096 jz vesa3 # (movw leaves the flags from testb intact)
1097
1098 xorw %dx, %dx
1099 movw 0x12(%di), %bx # Width
1100 orb %bh, %bh # Reject widths above 255
1101 jnz vesan
1102
1103 movb %bl, 0x3(%di)
1104 movw 0x14(%di), %ax # Height
1105 orb %ah, %ah # Reject heights above 255
1106 jnz vesan
1107
1108 movb %al, 2(%di)
1109 mulb %bl
1110 cmpw $8193, %ax # Small enough for Linux console driver?
1111 jnc vesan
1112
1113 jmp vesaok
1114
# NOTE(review): BX still carries the VIDEO_FIRST_VESA prefix added above
# when 0x8108 is subtracted, and the remainder is used unscaled as an
# index into vesa_text_mode_table, whose entries are two bytes each --
# confirm that both the bias and the indexing are intended.
1115vesa3: subw $0x8108, %bx # This mode has no detailed info specified,
1116 jc vesan # so it must be a standard VESA mode.
1117
1118 cmpw $5, %bx
1119 jnc vesan
1120
1121 movw vesa_text_mode_table(%bx), %ax
1122 movw %ax, 2(%di)
1123vesaok: addw $4, %di # The mode is valid. Store it.
1124vesan: loop vesa1 # Next mode. Limit exceeded => error
1125vesae: leaw vesaer, %si
1126 call prtstr
1127 movw %bp, %di # Discard already found modes.
1128vesar: popw %gs
1129 ret
1130
1131# Dimensions of standard VESA text modes
1132vesa_text_mode_table:
1133 .byte 60, 80 # 0108
1134 .byte 25, 132 # 0109
1135 .byte 43, 132 # 010A
1136 .byte 50, 132 # 010B
1137 .byte 60, 132 # 010C
1138#endif /* CONFIG_VIDEO_VESA */
1139
1140# Scan for video modes. A bit dirty, but should work.
# BIOS modes 0x00-0x7f are tried one after another. A mode is recorded
# when the BIOS reports it as current afterwards and three register
# probes all read back as expected (see below). Each hit appends a
# 4-byte record at ES:DI: ID word 0x0100+mode, rows, columns. Mode 3 is
# restored before returning.
1141mode_scan:
1142 movw $0x0100, %cx # Start with mode 0
1143scm1: movb $0, %ah # Test the mode
1144 movb %cl, %al
1145 int $0x10
1146 movb $0x0f, %ah
1147 int $0x10
1148 cmpb %cl, %al
1149 jnz scm2 # Mode not set
1150
1151 movw $0x3c0, %dx # Test if it's a text mode
1152 movb $0x10, %al # Mode bits
1153 call inidx
1154 andb $0x03, %al
1155 jnz scm2
1156
1157 movb $0xce, %dl # Another set of mode bits
1158 movb $0x06, %al
1159 call inidx
1160 shrb %al
1161 jc scm2
1162
1163 movb $0xd4, %dl # Cursor location
1164 movb $0x0f, %al
1165 call inidx
1166 orb %al, %al
1167 jnz scm2
1168
1169 movw %cx, %ax # Ok, store the mode
1170 stosw
1171 movb %gs:(0x484), %al # Number of rows
1172 incb %al
1173 stosb
1174 movw %gs:(0x44a), %ax # Number of columns
1175 stosb
1176scm2: incb %cl
1177 jns scm1 # Loop until CL reaches 0x80
1178
1179 movw $0x0003, %ax # Return back to mode 3
1180 int $0x10
1181 ret
1182
# Indexed register helpers. inidx selects index AL at port DX and
# returns the byte read from port DX+1 in AL; DX is unchanged. tstidx
# first writes AX to port DX as one word and then falls into inidx, so
# the value just written can be compared with what reads back.
1183tstidx: outw %ax, %dx # OUT DX,AX and inidx
1184inidx: outb %al, %dx # Read from indexed VGA register
1185 incw %dx # AL=index, DX=index reg port -> AL=data
1186 inb %dx, %al
1187 decw %dx
1188 ret
1189
1190# Try to detect type of SVGA card and supply (usually approximate) video
1191# mode table for it.
1192
1193#ifdef CONFIG_VIDEO_SVGA
# Walk svga_table and run each test routine until one accepts the card.
# A test is called with BP=pointer to its mode table and ES=0xc000 and
# rejects the card by clearing BP; SI, DI and ES are saved around the
# call. For the first accepted card the 3-byte records of its table are
# expanded into 4-byte mode table records at ES:DI (ID = svga_prefix in
# the high byte, mode number in the low byte, then the two dimension
# bytes) up to the terminating zero byte, and card_name is pointed at
# the text that follows that byte.
1194svga_modes:
1195 leaw svga_table, %si # Test all known SVGA adapters
1196dosvga: lodsw
1197 movw %ax, %bp # Default mode table
1198 orw %ax, %ax # Zero word ends svga_table
1199 jz didsv1
1200
1201 lodsw # Pointer to test routine
1202 pushw %si
1203 pushw %di
1204 pushw %es
1205 movw $0xc000, %bx
1206 movw %bx, %es
1207 call *%ax # Call test routine
1208 popw %es
1209 popw %di
1210 popw %si
1211 orw %bp, %bp # BP=0: not this card, try the next one
1212 jz dosvga
1213
1214 movw %bp, %si # Found, copy the modes
1215 movb svga_prefix, %ah
1216cpsvga: lodsb
1217 orb %al, %al
1218 jz didsv
1219
1220 stosw
1221 movsw
1222 jmp cpsvga
1223
1224didsv: movw %si, card_name # Store pointer to card name
1225didsv1: ret
1226
1227# Table of all known SVGA cards. For each card, we store a pointer to
1228# a table of video modes supported by the card and a pointer to a routine
1229# used for testing of presence of the card. The video mode table is always
1230# followed by the name of the card or the chipset.
1231svga_table:
1232 .word ati_md, ati_test
1233 .word oak_md, oak_test
1234 .word paradise_md, paradise_test
1235 .word realtek_md, realtek_test
1236 .word s3_md, s3_test
1237 .word chips_md, chips_test
1238 .word video7_md, video7_test
1239 .word cirrus5_md, cirrus5_test
1240 .word cirrus6_md, cirrus6_test
1241 .word cirrus1_md, cirrus1_test
1242 .word ahead_md, ahead_test
1243 .word everex_md, everex_test
1244 .word genoa_md, genoa_test
1245 .word trident_md, trident_test
1246 .word tseng_md, tseng_test
1247 .word 0
1248
1249# Test routines and mode tables:
1250
1251# S3 - The test algorithm was taken from the SuperProbe package
1252# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org
# Called from svga_modes with BP=s3_md; BP is cleared when the card is
# rejected. CRT registers 0x38 and 0x35 are saved in BH and BL and
# written back on the way out (0x35 only on the paths that changed it).
1253s3_test:
1254 movw $0x0f35, %cx # we store some constants in cl/ch
1255 movw $0x03d4, %dx
1256 movb $0x38, %al
1257 call inidx
1258 movb %al, %bh # store current CRT-register 0x38
1259 movw $0x0038, %ax
1260 call outidx # disable writing to special regs
1261 movb %cl, %al # check whether we can write special reg 0x35
1262 call inidx
1263 movb %al, %bl # save the current value of CRT reg 0x35
1264 andb $0xf0, %al # clear bits 0-3
1265 movb %al, %ah
1266 movb %cl, %al # and write it to CRT reg 0x35
1267 call outidx
1268 call inidx # now read it back
1269 andb %ch, %al # clear the upper 4 bits
1270 jz s3_2 # the first test failed. But we have a
1271
1272 movb %bl, %ah # second chance
1273 movb %cl, %al
1274 call outidx
1275 jmp s3_1 # do the other tests
1276
1277s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35
1278 orb %bl, %ah # set the upper 4 bits of ah with the orig value
1279 call outidx # write ...
1280 call inidx # ... and reread
1281 andb %cl, %al # turn off the upper 4 bits
1282 pushw %ax
1283 movb %bl, %ah # restore old value in register 0x35
1284 movb %cl, %al
1285 call outidx
1286 popw %ax
1287 cmpb %ch, %al # setting lower 4 bits was successful => bad
1288 je no_s3 # writing is allowed => this is not an S3
1289
1290s3_1: movw $0x4838, %ax # allow writing to special regs by putting
1291 call outidx # magic number into CRT-register 0x38
1292 movb %cl, %al # check whether we can write special reg 0x35
1293 call inidx
1294 movb %al, %bl
1295 andb $0xf0, %al
1296 movb %al, %ah
1297 movb %cl, %al
1298 call outidx
1299 call inidx
1300 andb %ch, %al
1301 jnz no_s3 # no, we can't write => no S3
1302
1303 movw %cx, %ax
1304 orb %bl, %ah
1305 call outidx
1306 call inidx
1307 andb %ch, %al
1308 pushw %ax
1309 movb %bl, %ah # restore old value in register 0x35
1310 movb %cl, %al
1311 call outidx
1312 popw %ax
1313 cmpb %ch, %al
1314 jne no_s31 # writing not possible => no S3
1315 movb $0x30, %al
1316 call inidx # now get the S3 id ...
# NOTE(review): scasb compares AL with ES:DI, and svga_modes calls this
# routine with ES=0xc000, so the 16 bytes searched are at offset idS3
# of that segment rather than the idS3 table below. A match also
# branches to no_s31 (detection failed). Confirm both are intended.
1317 leaw idS3, %di
1318 movw $0x10, %cx
1319 repne
1320 scasb
1321 je no_s31
1322
1323 movb %bh, %ah
1324 movb $0x38, %al
1325 jmp s3rest
1326
1327no_s3: movb $0x35, %al # restore CRT register 0x35
1328 movb %bl, %ah
1329 call outidx
1330no_s31: xorw %bp, %bp # Detection failed
1331s3rest: movb %bh, %ah
1332 movb $0x38, %al # restore old value of CRT register 0x38
1333 jmp outidx
1334
1335idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
1336 .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0
1337
1338s3_md: .byte 0x54, 0x2b, 0x84
1339 .byte 0x55, 0x19, 0x84
1340 .byte 0
1341 .ascii "S3"
1342 .byte 0
1343
1344# ATI cards.
# Compares the 9 bytes at idati with the 9 bytes at ES:0x31 (svga_modes
# calls the test with ES=0xc000). BP is cleared on a mismatch.
1345ati_test:
1346 leaw idati, %si
1347 movw $0x31, %di
1348 movw $0x09, %cx
1349 repe
1350 cmpsb
1351 je atiok
1352
1353 xorw %bp, %bp
1354atiok: ret
1355
1356idati: .ascii "761295520"
1357
1358ati_md: .byte 0x23, 0x19, 0x84
1359 .byte 0x33, 0x2c, 0x84
1360 .byte 0x22, 0x1e, 0x64
1361 .byte 0x21, 0x19, 0x64
1362 .byte 0x58, 0x21, 0x50
1363 .byte 0x5b, 0x1e, 0x50
1364 .byte 0
1365 .ascii "ATI"
1366 .byte 0
1367
1368# AHEAD
# Writes 0x20 to index 0x0f of the indexed register at port 0x3ce and
# reads the data port back; 0x20 or 0x21 accepts the card, anything
# else clears BP.
1369ahead_test:
1370 movw $0x200f, %ax
1371 movw $0x3ce, %dx
1372 outw %ax, %dx
1373 incw %dx
1374 inb %dx, %al
1375 cmpb $0x20, %al
1376 je isahed
1377
1378 cmpb $0x21, %al
1379 je isahed
1380
1381 xorw %bp, %bp
1382isahed: ret
1383
1384ahead_md:
1385 .byte 0x22, 0x2c, 0x84
1386 .byte 0x23, 0x19, 0x84
1387 .byte 0x24, 0x1c, 0x84
1388 .byte 0x2f, 0x32, 0xa0
1389 .byte 0x32, 0x22, 0x50
1390 .byte 0x34, 0x42, 0x50
1391 .byte 0
1392 .ascii "Ahead"
1393 .byte 0
1394
1395# Chips & Tech.
# Sets bit 4 of port 0x3c3, samples port 0x104 into BL, clears bit 4 of
# port 0x3c3 again and accepts the card only when the sample was 0xa5.
# BP is cleared otherwise.
1396chips_test:
1397 movw $0x3c3, %dx
1398 inb %dx, %al
1399 orb $0x10, %al
1400 outb %al, %dx
1401 movw $0x104, %dx
1402 inb %dx, %al
1403 movb %al, %bl
1404 movw $0x3c3, %dx
1405 inb %dx, %al
1406 andb $0xef, %al
1407 outb %al, %dx
1408 cmpb $0xa5, %bl
1409 je cantok
1410
1411 xorw %bp, %bp
1412cantok: ret
1413
1414chips_md:
1415 .byte 0x60, 0x19, 0x84
1416 .byte 0x61, 0x32, 0x84
1417 .byte 0
1418 .ascii "Chips & Technologies"
1419 .byte 0
1420
1421# Cirrus Logic 5X0
# CRTC register 0x0c (port 0x3d4) is saved in BL and zeroed, then CRTC
# register 0x1f is read into BH. Its nibble-swapped value is written to
# register 6 at port 0x3c4, which must then read back as 0; writing the
# unswapped value must make it read back as 1. BP is cleared when either
# check fails. CRTC register 0x0c is restored from BL on every path.
1422cirrus1_test:
1423 movw $0x3d4, %dx
1424 movb $0x0c, %al
1425 outb %al, %dx
1426 incw %dx
1427 inb %dx, %al
1428 movb %al, %bl # BL=saved CRTC register 0x0c
1429 xorb %al, %al
1430 outb %al, %dx
1431 decw %dx
1432 movb $0x1f, %al
1433 outb %al, %dx
1434 incw %dx
1435 inb %dx, %al
1436 movb %al, %bh # BH=CRTC register 0x1f
1437 xorb %ah, %ah
1438 shlb $4, %al # Low nibble moved up ...
1439 movw %ax, %cx
1440 movb %bh, %al
1441 shrb $4, %al # ... high nibble moved down
1442 addw %ax, %cx # CL=BH with its nibbles swapped
1443 shlw $8, %cx
1444 addw $6, %cx # CX=(swapped value << 8) | index 6
1445 movw %cx, %ax
1446 movw $0x3c4, %dx
1447 outw %ax, %dx
1448 incw %dx
1449 inb %dx, %al
1450 andb %al, %al
1451 jnz nocirr
1452
1453 movb %bh, %al
1454 outb %al, %dx
1455 inb %dx, %al
1456 cmpb $0x01, %al
1457 je iscirr
1458
1459nocirr: xorw %bp, %bp
1460iscirr: movw $0x3d4, %dx # Restore CRTC register 0x0c from BL
1461 movb %bl, %al
1462 xorb %ah, %ah
1463 shlw $8, %ax
1464 addw $0x0c, %ax
1465 outw %ax, %dx
1466 ret
1467
1468cirrus1_md:
1469 .byte 0x1f, 0x19, 0x84
1470 .byte 0x20, 0x2c, 0x84
1471 .byte 0x22, 0x1e, 0x84
1472 .byte 0x31, 0x25, 0x64
1473 .byte 0
1474 .ascii "Cirrus Logic 5X0"
1475 .byte 0
1476
1477# Cirrus Logic 54XX
# Works on the indexed registers at port 0x3c4. Register 6 must read
# back 0x0f after 0 is written and 0x12 after 0x12 is written. After
# that the low six bits of register 0x1e must be writable: cleared bits
# have to read back clear and set bits have to read back set. Registers
# 6 and 0x1e are restored from BL and BH. BP is cleared on failure.
1478cirrus5_test:
1479 movw $0x3c4, %dx
1480 movb $6, %al
1481 call inidx
1482 movb %al, %bl # BL=backup
1483 movw $6, %ax
1484 call tstidx
1485 cmpb $0x0f, %al
1486 jne c5fail
1487
1488 movw $0x1206, %ax
1489 call tstidx
1490 cmpb $0x12, %al
1491 jne c5fail
1492
1493 movb $0x1e, %al
1494 call inidx
1495 movb %al, %bh # BH=backup of register 0x1e
1496 movb %bh, %ah
1497 andb $0xc0, %ah
1498 movb $0x1e, %al
1499 call tstidx
1500 andb $0x3f, %al
1501 jne c5xx
1502
1503 movb $0x1e, %al
1504 movb %bh, %ah
1505 orb $0x3f, %ah
1506 call tstidx
1507 xorb $0x3f, %al
1508 andb $0x3f, %al
1509c5xx: pushf # Keep the verdict (ZF) across the restore
1510 movb $0x1e, %al
1511 movb %bh, %ah
1512 outw %ax, %dx
1513 popf
1514 je c5done
1515
1516c5fail: xorw %bp, %bp
1517c5done: movb $6, %al
1518 movb %bl, %ah
1519 outw %ax, %dx
1520 ret
1521
1522cirrus5_md:
1523 .byte 0x14, 0x19, 0x84
1524 .byte 0x54, 0x2b, 0x84
1525 .byte 0
1526 .ascii "Cirrus Logic 54XX"
1527 .byte 0
1528
1529# Cirrus Logic 64XX -- no known extra modes, but must be identified, because
1530# it's misidentified by the Ahead test.
# Works on the indexed registers at port 0x3ce. Register 0x0a must read
# back 0 after 0xce is written and 1 after 0xec is written; the high
# nibble of register 0xaa must then be 4, 5, 7 or 8. Register 0x0a is
# restored from BL on every path. BP is cleared on failure.
1531cirrus6_test:
1532 movw $0x3ce, %dx
1533 movb $0x0a, %al
1534 call inidx
1535 movb %al, %bl # BL=backup
1536 movw $0xce0a, %ax
1537 call tstidx
1538 orb %al, %al
1539 jne c2fail
1540
1541 movw $0xec0a, %ax
1542 call tstidx
1543 cmpb $0x01, %al
1544 jne c2fail
1545
1546 movb $0xaa, %al
1547 call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's.
1548 shrb $4, %al
1549 subb $4, %al # ID nibble 4?
1550 jz c6done
1551
1552 decb %al # 5?
1553 jz c6done
1554
1555 subb $2, %al # 7?
1556 jz c6done
1557
1558 decb %al # 8?
1559 jz c6done
1560
1561c2fail: xorw %bp, %bp
1562c6done: movb $0x0a, %al
1563 movb %bl, %ah
1564 outw %ax, %dx
1565 ret
1566
1567cirrus6_md:
1568 .byte 0
1569 .ascii "Cirrus Logic 64XX"
1570 .byte 0
1571
1572# Everex / Trident
# Issues int 0x10 with AX=0x7000, BX=0 and expects AL=0x70 in return;
# otherwise BP is cleared. The value returned in DX, shifted right by
# four, selects the table: 0x678 and 0x236 switch BP to trident_md, any
# other value keeps the table passed in by the caller.
1573everex_test:
1574 movw $0x7000, %ax
1575 xorw %bx, %bx
1576 int $0x10
1577 cmpb $0x70, %al
1578 jne noevrx
1579
1580 shrw $4, %dx
1581 cmpw $0x678, %dx
1582 je evtrid
1583
1584 cmpw $0x236, %dx
1585 jne evrxok
1586
1587evtrid: leaw trident_md, %bp
1588evrxok: ret
1589
1590noevrx: xorw %bp, %bp
1591 ret
1592
1593everex_md:
1594 .byte 0x03, 0x22, 0x50
1595 .byte 0x04, 0x3c, 0x50
1596 .byte 0x07, 0x2b, 0x64
1597 .byte 0x08, 0x4b, 0x64
1598 .byte 0x0a, 0x19, 0x84
1599 .byte 0x0b, 0x2c, 0x84
1600 .byte 0x16, 0x1e, 0x50
1601 .byte 0x18, 0x1b, 0x64
1602 .byte 0x21, 0x40, 0xa0
1603 .byte 0x40, 0x1e, 0x84
1604 .byte 0
1605 .ascii "Everex/Trident"
1606 .byte 0
1607
1608# Genoa.
# The byte at ES:0x37 is used as the offset of a 4-byte signature in
# the ES segment (0xc000 when called from svga_modes). It is compared
# with idgenoa, where a zero pattern byte matches anything. BP is
# cleared when the loop stops before CX reaches zero.
# NOTE(review): loope also leaves the loop once CX hits zero, whatever
# the last comparison produced, and the test below looks at CX only --
# so a mismatch in the fourth byte appears to be accepted. Confirm.
1609genoa_test:
1610 leaw idgenoa, %si # Check Genoa 'clues'
1611 xorw %ax, %ax
1612 movb %es:(0x37), %al
1613 movw %ax, %di
1614 movw $0x04, %cx
1615 decw %si
1616 decw %di
1617l1: incw %si
1618 incw %di
1619 movb (%si), %al
1620 testb %al, %al # Zero pattern byte: skip the compare (ZF stays set)
1621 jz l2
1622
1623 cmpb %es:(%di), %al
1624l2: loope l1
1625 orw %cx, %cx
1626 je isgen
1627
1628 xorw %bp, %bp
1629isgen: ret
1630
1631idgenoa: .byte 0x77, 0x00, 0x99, 0x66
1632
1633genoa_md:
1634 .byte 0x58, 0x20, 0x50
1635 .byte 0x5a, 0x2a, 0x64
1636 .byte 0x60, 0x19, 0x84
1637 .byte 0x61, 0x1d, 0x84
1638 .byte 0x62, 0x20, 0x84
1639 .byte 0x63, 0x2c, 0x84
1640 .byte 0x64, 0x3c, 0x84
1641 .byte 0x6b, 0x4f, 0x64
1642 .byte 0x72, 0x3c, 0x50
1643 .byte 0x74, 0x42, 0x50
1644 .byte 0x78, 0x4b, 0x64
1645 .byte 0
1646 .ascii "Genoa"
1647 .byte 0
1648
1649# OAK
# Compares the 8 bytes at idoakvga with the 8 bytes at ES:0x08
# (svga_modes calls the test with ES=0xc000). BP is cleared on a
# mismatch.
1650oak_test:
1651 leaw idoakvga, %si
1652 movw $0x08, %di
1653 movw $0x08, %cx
1654 repe
1655 cmpsb
1656 je isoak
1657
1658 xorw %bp, %bp
1659isoak: ret
1660
1661idoakvga: .ascii "OAK VGA "
1662
1663oak_md: .byte 0x4e, 0x3c, 0x50
1664 .byte 0x4f, 0x3c, 0x84
1665 .byte 0x50, 0x19, 0x84
1666 .byte 0x51, 0x2b, 0x84
1667 .byte 0
1668 .ascii "OAK"
1669 .byte 0
1670
1671# WD Paradise.
# Compares the 4 bytes at idparadise with the 4 bytes at ES:0x7d
# (svga_modes calls the test with ES=0xc000). BP is cleared on a
# mismatch.
1672paradise_test:
1673 leaw idparadise, %si
1674 movw $0x7d, %di
1675 movw $0x04, %cx
1676 repe
1677 cmpsb
1678 je ispara
1679
1680 xorw %bp, %bp
1681ispara: ret
1682
1683idparadise: .ascii "VGA="
1684
1685paradise_md:
1686 .byte 0x41, 0x22, 0x50
1687 .byte 0x47, 0x1c, 0x84
1688 .byte 0x55, 0x19, 0x84
1689 .byte 0x54, 0x2c, 0x84
1690 .byte 0
1691 .ascii "Paradise"
1692 .byte 0
1693
1694# Trident.
# Selects index 0x0e at port 0x3c4 and saves the data byte, writes 0 to
# the data port and reads it back. The saved byte is then written back
# with bit 1 inverted. The card is accepted when the low nibble of the
# byte read after the zero write equals 2; BP is cleared otherwise.
1695trident_test:
1696 movw $0x3c4, %dx
1697 movb $0x0e, %al
1698 outb %al, %dx
1699 incw %dx
1700 inb %dx, %al
1701 xchgb %al, %ah # AH=original value
1702 xorb %al, %al
1703 outb %al, %dx
1704 inb %dx, %al
1705 xchgb %ah, %al # AL=original value, AH=value read after the zero write
1706 movb %al, %bl # Strange thing ... in the book this wasn't
1707 andb $0x02, %bl # necessary but it worked on my card which
1708 jz setb2 # is a trident. Without it the screen goes
1709 # blurred ...
1710 andb $0xfd, %al
1711 jmp clrb2
1712
1713setb2: orb $0x02, %al
1714clrb2: outb %al, %dx
1715 andb $0x0f, %ah
1716 cmpb $0x02, %ah
1717 je istrid
1718
1719 xorw %bp, %bp
1720istrid: ret
1721
1722trident_md:
1723 .byte 0x50, 0x1e, 0x50
1724 .byte 0x51, 0x2b, 0x50
1725 .byte 0x52, 0x3c, 0x50
1726 .byte 0x57, 0x19, 0x84
1727 .byte 0x58, 0x1e, 0x84
1728 .byte 0x59, 0x2b, 0x84
1729 .byte 0x5a, 0x3c, 0x84
1730 .byte 0
1731 .ascii "Trident"
1732 .byte 0
1733
1734# Tseng.
# Port 0x3cd is saved in BL, 0x55 is written and read back, and the old
# value is restored. The card is accepted when 0x55 read back; BP is
# cleared otherwise. The isnot label is also the failure exit of
# video7_test.
1735tseng_test:
1736 movw $0x3cd, %dx
1737 inb %dx, %al # Could things be this simple ! :-)
1738 movb %al, %bl
1739 movb $0x55, %al
1740 outb %al, %dx
1741 inb %dx, %al
1742 movb %al, %ah
1743 movb %bl, %al
1744 outb %al, %dx
1745 cmpb $0x55, %ah
1746 je istsen
1747
1748isnot: xorw %bp, %bp
1749istsen: ret
1750
1751tseng_md:
1752 .byte 0x26, 0x3c, 0x50
1753 .byte 0x2a, 0x28, 0x64
1754 .byte 0x23, 0x19, 0x84
1755 .byte 0x24, 0x1c, 0x84
1756 .byte 0x22, 0x2c, 0x84
1757 .byte 0x21, 0x3c, 0x84
1758 .byte 0
1759 .ascii "Tseng"
1760 .byte 0
1761
1762# Video7.
# The CRTC port is 0x3d4 when bit 0 of port 0x3cc is set and 0x3b4 when
# it is clear. CRTC register 0x0c is saved in BL and loaded with 0x55,
# register 0x1f is read into BH, and register 0x0c is restored. The
# card is accepted when BH equals 0x55 xor 0xea; on success svga_prefix
# is switched to VIDEO_FIRST_V7>>8. Failure exits through isnot (in
# tseng_test), which clears BP.
1763video7_test:
1764 movw $0x3cc, %dx
1765 inb %dx, %al
1766 movw $0x3b4, %dx
1767 andb $0x01, %al
1768 jz even7
1769
1770 movw $0x3d4, %dx
1771even7: movb $0x0c, %al
1772 outb %al, %dx
1773 incw %dx
1774 inb %dx, %al
1775 movb %al, %bl # BL=saved CRTC register 0x0c
1776 movb $0x55, %al
1777 outb %al, %dx
1778 inb %dx, %al
1779 decw %dx
1780 movb $0x1f, %al
1781 outb %al, %dx
1782 incw %dx
1783 inb %dx, %al
1784 movb %al, %bh # BH=CRTC register 0x1f
1785 decw %dx
1786 movb $0x0c, %al
1787 outb %al, %dx
1788 incw %dx
1789 movb %bl, %al # Restore CRTC register 0x0c
1790 outb %al, %dx
1791 movb $0x55, %al
1792 xorb $0xea, %al
1793 cmpb %bh, %al
1794 jne isnot
1795
1796 movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching
1797 ret
1798
1799video7_md:
1800 .byte 0x40, 0x2b, 0x50
1801 .byte 0x43, 0x3c, 0x50
1802 .byte 0x44, 0x3c, 0x64
1803 .byte 0x41, 0x19, 0x84
1804 .byte 0x42, 0x2c, 0x84
1805 .byte 0x45, 0x1c, 0x84
1806 .byte 0
1807 .ascii "Video 7"
1808 .byte 0
1809
1810# Realtek VGA
# Compares the 11 bytes at idrtvga with the 11 bytes at ES:0x45
# (svga_modes calls the test with ES=0xc000). BP is cleared on a
# mismatch.
1811realtek_test:
1812 leaw idrtvga, %si
1813 movw $0x45, %di
1814 movw $0x0b, %cx
1815 repe
1816 cmpsb
1817 je isrt
1818
1819 xorw %bp, %bp
1820isrt: ret
1821
1822idrtvga: .ascii "REALTEK VGA"
1823
1824realtek_md:
1825 .byte 0x1a, 0x3c, 0x50
1826 .byte 0x1b, 0x19, 0x84
1827 .byte 0x1c, 0x1e, 0x84
1828 .byte 0x1d, 0x2b, 0x84
1829 .byte 0x1e, 0x3c, 0x84
1830 .byte 0
1831 .ascii "REALTEK"
1832 .byte 0
1833
1834#endif /* CONFIG_VIDEO_SVGA */
1835
1836# User-defined local mode table (VGA only)
1837#ifdef CONFIG_VIDEO_LOCAL
# Appends the entries of local_mode_table to the mode table at ES:DI.
# Every entry is copied as an ID word followed by a dimension word; a
# zero ID word ends the list and is not copied.
1838local_modes:
1839 leaw local_mode_table, %si
1840locm1: lodsw
1841 orw %ax, %ax
1842 jz locm2
1843
1844 stosw
1845 movsw
1846 jmp locm1
1847
1848locm2: ret
1849
1850# This is the table of local video modes which can be supplied manually
1851# by the user. Each entry consists of mode ID (word) and dimensions
1852# (byte for row count followed by a byte for column count). These modes
1853# are placed before all SVGA and VESA modes and override them if table
1854# compacting is enabled. The table must end with a zero word followed
1855# by NUL-terminated video adapter name.
1856local_mode_table:
1857 .word 0x0100 # Example: 40x25
1858 .byte 25,40
1859 .word 0
1860 .ascii "Local"
1861 .byte 0
1862#endif /* CONFIG_VIDEO_LOCAL */
1863
1864# Read a key and return the ASCII code in al, scan code in ah
# Uses int 0x16 with AH=0. getkt jumps here once a key is available.
1865getkey: xorb %ah, %ah
1866 int $0x16
1867 ret
1868
1869# Read a key with a timeout of 30 seconds.
1870# The hardware clock is used to get the time.
# The deadline is the value returned by gettime in AL plus 30, wrapped
# at 60, and is kept in CL. The keyboard is polled with int 0x16,
# AH=1, until a key is available (then getkey fetches it) or gettime
# returns exactly the deadline value, in which case AL=0x20 is
# returned. gettime is defined outside this section; it presumably
# returns the seconds of the current time in AL.
1871getkt: call gettime
1872 addb $30, %al # Wait 30 seconds
1873 cmpb $60, %al
1874 jl lminute
1875
1876 subb $60, %al
1877lminute:
1878 movb %al, %cl
1879again: movb $0x01, %ah
1880 int $0x16
1881 jnz getkey # key pressed, so get it
1882
1883 call gettime
1884 cmpb %cl, %al
1885 jne again
1886
1887 movb $0x20, %al # timeout, return `space'
1888 ret
1889
1890# Flush the keyboard buffer
# Keys are read and discarded for as long as int 0x16 with AH=1 reports
# one waiting.
1891flush: movb $0x01, %ah
1892 int $0x16
1893 jz empty
1894
1895 xorb %ah, %ah
1896 int $0x16
1897 jmp flush
1898
1899empty: ret
1900
1901# Print hexadecimal number.
# Three entry points that fall into each other:
#   prthw  prints AX as four hex digits (AH first),
#   prthb  prints AL as two hex digits,
#   prthn  prints the value 0-15 in AL as one digit (A-F in upper case).
# Output goes through prtchr. prthw and prthb save AX around their
# first half but return with AL holding the last character printed.
1902prthw: pushw %ax
1903 movb %ah, %al
1904 call prthb
1905 popw %ax
1906prthb: pushw %ax
1907 shrb $4, %al
1908 call prthn
1909 popw %ax
1910 andb $0x0f, %al
1911prthn: cmpb $0x0a, %al
1912 jc prth1
1913
1914 addb $0x07, %al # Skip from '9'+1 to 'A'
1915prth1: addb $0x30, %al
1916 jmp prtchr
1917
1918# Print decimal number in al
# AL is divided by 10. A quotient above 9 is printed by calling prtdec
# again, a smaller one is printed as a single digit (so values below 10
# come out with a leading zero); the remainder is printed last. AX and
# CX are preserved.
1919prtdec: pushw %ax
1920 pushw %cx
1921 xorb %ah, %ah
1922 movb $0x0a, %cl
1923 idivb %cl # AL=quotient, AH=remainder
1924 cmpb $0x09, %al
1925 jbe lt100
1926
1927 call prtdec
1928 jmp skip10
1929
1930lt100: addb $0x30, %al
1931 call prtchr
1932skip10: movb %ah, %al
1933 addb $0x30, %al
1934 call prtchr
1935 popw %cx
1936 popw %ax
1937 ret
1938
# Fetch the monitor EDID block through VBE function 0x4f15. With
# CONFIG_FIRMWARE_EDID the 128 bytes at offset 0x140 of the FS segment
# are first filled with 0x13. The BIOS is queried only when vbe_version
# is at least 0x0200 and the capability call (BL=0) returns AX=0x004f;
# the read call (BL=1) then targets the same 128-byte area. Every
# register pushed at the top is restored. Without the option the
# routine is a bare ret.
1939store_edid:
1940#ifdef CONFIG_FIRMWARE_EDID
1941 pushw %es # just save all registers
1942 pushw %ax
1943 pushw %bx
1944 pushw %cx
1945 pushw %dx
1946 pushw %di
1947
1948 pushw %fs
1949 popw %es # ES=FS for the fill and the BIOS buffer
1950
1951 movl $0x13131313, %eax # memset block with 0x13
1952 movw $32, %cx # 32 dwords = 128 bytes
1953 movw $0x140, %di
1954 cld
1955 rep
1956 stosl
1957
1958 cmpw $0x0200, vbe_version # only do EDID on >= VBE2.0
1959 jl no_edid
1960
1961 pushw %es # save ES
1962 xorw %di, %di # Report Capability
1963 pushw %di
1964 popw %es # ES:DI must be 0:0
1965 movw $0x4f15, %ax
1966 xorw %bx, %bx
1967 xorw %cx, %cx
1968 int $0x10
1969 popw %es # restore ES
1970
1971 cmpb $0x00, %ah # call successful
1972 jne no_edid
1973
1974 cmpb $0x4f, %al # function supported
1975 jne no_edid
1976
1977 movw $0x4f15, %ax # do VBE/DDC
1978 movw $0x01, %bx
1979 movw $0x00, %cx
1980 movw $0x01, %dx
1981 movw $0x140, %di
1982 int $0x10
1983
1984no_edid:
1985 popw %di # restore all registers
1986 popw %dx
1987 popw %cx
1988 popw %bx
1989 popw %ax
1990 popw %es
1991#endif
1992 ret
1993
1994# VIDEO_SELECT-only variables
1995mt_end: .word 0 # End of video mode table if built
1996edit_buf: .space 6 # Line editor buffer
1997card_name: .word 0 # Pointer to adapter name
1998scanning: .byte 0 # Performing mode scan
1999do_restore: .byte 0 # Screen contents altered during mode change
2000svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes
2001graphic_mode: .byte 0 # Graphic mode with a linear frame buffer
2002dac_size: .byte 6 # DAC bit depth
2003vbe_version: .word 0 # VBE bios version
2004
2005# Status messages
2006keymsg: .ascii "Press <RETURN> to see video modes available, "
2007 .ascii "<SPACE> to continue or wait 30 secs"
2008 .byte 0x0d, 0x0a, 0
2009
2010listhdr: .byte 0x0d, 0x0a
2011 .ascii "Mode: COLSxROWS:"
2012
2013crlft: .byte 0x0d, 0x0a, 0
2014
2015prompt: .byte 0x0d, 0x0a
2016 .asciz "Enter mode number or `scan': "
2017
2018unknt: .asciz "Unknown mode ID. Try again."
2019
2020badmdt: .ascii "You passed an undefined mode number."
2021 .byte 0x0d, 0x0a, 0
2022
2023vesaer: .ascii "Error: Scanning of VESA modes failed. Please "
2024 .ascii "report to <mj@ucw.cz>."
2025 .byte 0x0d, 0x0a, 0
2026
2027old_name: .asciz "CGA/MDA/HGA"
2028
2029ega_name: .asciz "EGA"
2030
2031svga_name: .ascii " "
2032
2033vga_name: .asciz "VGA"
2034
2035vesa_name: .asciz "VESA"
2036
2037name_bann: .asciz "Video adapter: "
2038#endif /* CONFIG_VIDEO_SELECT */
2039
2040# Other variables:
2041adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA
2042video_segment: .word 0xb800 # Video memory segment
2043force_size: .word 0 # Use this size instead of the one in BIOS vars
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index b26378815b9..941a7e3aa5f 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.21-rc3 3# Linux kernel version: 2.6.21-git3
4# Wed Mar 7 15:29:47 2007 4# Tue May 1 07:30:48 2007
5# 5#
6CONFIG_X86_64=y 6CONFIG_X86_64=y
7CONFIG_64BIT=y 7CONFIG_64BIT=y
@@ -118,11 +118,11 @@ CONFIG_X86_PC=y
118# CONFIG_X86_VSMP is not set 118# CONFIG_X86_VSMP is not set
119# CONFIG_MK8 is not set 119# CONFIG_MK8 is not set
120# CONFIG_MPSC is not set 120# CONFIG_MPSC is not set
121# CONFIG_MCORE2 is not set 121CONFIG_MCORE2=y
122CONFIG_GENERIC_CPU=y 122# CONFIG_GENERIC_CPU is not set
123CONFIG_X86_L1_CACHE_BYTES=128 123CONFIG_X86_L1_CACHE_BYTES=64
124CONFIG_X86_L1_CACHE_SHIFT=7 124CONFIG_X86_L1_CACHE_SHIFT=6
125CONFIG_X86_INTERNODE_CACHE_BYTES=128 125CONFIG_X86_INTERNODE_CACHE_BYTES=64
126CONFIG_X86_TSC=y 126CONFIG_X86_TSC=y
127CONFIG_X86_GOOD_APIC=y 127CONFIG_X86_GOOD_APIC=y
128# CONFIG_MICROCODE is not set 128# CONFIG_MICROCODE is not set
@@ -174,6 +174,7 @@ CONFIG_X86_MCE_INTEL=y
174CONFIG_X86_MCE_AMD=y 174CONFIG_X86_MCE_AMD=y
175# CONFIG_KEXEC is not set 175# CONFIG_KEXEC is not set
176# CONFIG_CRASH_DUMP is not set 176# CONFIG_CRASH_DUMP is not set
177# CONFIG_RELOCATABLE is not set
177CONFIG_PHYSICAL_START=0x200000 178CONFIG_PHYSICAL_START=0x200000
178CONFIG_SECCOMP=y 179CONFIG_SECCOMP=y
179# CONFIG_CC_STACKPROTECTOR is not set 180# CONFIG_CC_STACKPROTECTOR is not set
@@ -182,7 +183,6 @@ CONFIG_HZ_250=y
182# CONFIG_HZ_300 is not set 183# CONFIG_HZ_300 is not set
183# CONFIG_HZ_1000 is not set 184# CONFIG_HZ_1000 is not set
184CONFIG_HZ=250 185CONFIG_HZ=250
185# CONFIG_REORDER is not set
186CONFIG_K8_NB=y 186CONFIG_K8_NB=y
187CONFIG_GENERIC_HARDIRQS=y 187CONFIG_GENERIC_HARDIRQS=y
188CONFIG_GENERIC_IRQ_PROBE=y 188CONFIG_GENERIC_IRQ_PROBE=y
@@ -218,7 +218,6 @@ CONFIG_ACPI_HOTPLUG_CPU=y
218CONFIG_ACPI_THERMAL=y 218CONFIG_ACPI_THERMAL=y
219CONFIG_ACPI_NUMA=y 219CONFIG_ACPI_NUMA=y
220# CONFIG_ACPI_ASUS is not set 220# CONFIG_ACPI_ASUS is not set
221# CONFIG_ACPI_IBM is not set
222# CONFIG_ACPI_TOSHIBA is not set 221# CONFIG_ACPI_TOSHIBA is not set
223CONFIG_ACPI_BLACKLIST_YEAR=0 222CONFIG_ACPI_BLACKLIST_YEAR=0
224# CONFIG_ACPI_DEBUG is not set 223# CONFIG_ACPI_DEBUG is not set
@@ -243,7 +242,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
243# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set 242# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
244CONFIG_CPU_FREQ_GOV_USERSPACE=y 243CONFIG_CPU_FREQ_GOV_USERSPACE=y
245CONFIG_CPU_FREQ_GOV_ONDEMAND=y 244CONFIG_CPU_FREQ_GOV_ONDEMAND=y
246# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set 245CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
247 246
248# 247#
249# CPUFreq processor drivers 248# CPUFreq processor drivers
@@ -299,7 +298,6 @@ CONFIG_NET=y
299# 298#
300# Networking options 299# Networking options
301# 300#
302# CONFIG_NETDEBUG is not set
303CONFIG_PACKET=y 301CONFIG_PACKET=y
304# CONFIG_PACKET_MMAP is not set 302# CONFIG_PACKET_MMAP is not set
305CONFIG_UNIX=y 303CONFIG_UNIX=y
@@ -334,6 +332,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
334CONFIG_IPV6=y 332CONFIG_IPV6=y
335# CONFIG_IPV6_PRIVACY is not set 333# CONFIG_IPV6_PRIVACY is not set
336# CONFIG_IPV6_ROUTER_PREF is not set 334# CONFIG_IPV6_ROUTER_PREF is not set
335# CONFIG_IPV6_OPTIMISTIC_DAD is not set
337# CONFIG_INET6_AH is not set 336# CONFIG_INET6_AH is not set
338# CONFIG_INET6_ESP is not set 337# CONFIG_INET6_ESP is not set
339# CONFIG_INET6_IPCOMP is not set 338# CONFIG_INET6_IPCOMP is not set
@@ -389,6 +388,13 @@ CONFIG_IPV6_SIT=y
389# CONFIG_HAMRADIO is not set 388# CONFIG_HAMRADIO is not set
390# CONFIG_IRDA is not set 389# CONFIG_IRDA is not set
391# CONFIG_BT is not set 390# CONFIG_BT is not set
391# CONFIG_AF_RXRPC is not set
392
393#
394# Wireless
395#
396# CONFIG_CFG80211 is not set
397# CONFIG_WIRELESS_EXT is not set
392# CONFIG_IEEE80211 is not set 398# CONFIG_IEEE80211 is not set
393 399
394# 400#
@@ -409,10 +415,6 @@ CONFIG_FW_LOADER=y
409# Connector - unified userspace <-> kernelspace linker 415# Connector - unified userspace <-> kernelspace linker
410# 416#
411# CONFIG_CONNECTOR is not set 417# CONFIG_CONNECTOR is not set
412
413#
414# Memory Technology Devices (MTD)
415#
416# CONFIG_MTD is not set 418# CONFIG_MTD is not set
417 419
418# 420#
@@ -459,6 +461,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
459# CONFIG_SGI_IOC4 is not set 461# CONFIG_SGI_IOC4 is not set
460# CONFIG_TIFM_CORE is not set 462# CONFIG_TIFM_CORE is not set
461# CONFIG_SONY_LAPTOP is not set 463# CONFIG_SONY_LAPTOP is not set
464# CONFIG_THINKPAD_ACPI is not set
462 465
463# 466#
464# ATA/ATAPI/MFM/RLL support 467# ATA/ATAPI/MFM/RLL support
@@ -494,7 +497,6 @@ CONFIG_BLK_DEV_IDEPCI=y
494# CONFIG_BLK_DEV_RZ1000 is not set 497# CONFIG_BLK_DEV_RZ1000 is not set
495CONFIG_BLK_DEV_IDEDMA_PCI=y 498CONFIG_BLK_DEV_IDEDMA_PCI=y
496# CONFIG_BLK_DEV_IDEDMA_FORCED is not set 499# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
497CONFIG_IDEDMA_PCI_AUTO=y
498# CONFIG_IDEDMA_ONLYDISK is not set 500# CONFIG_IDEDMA_ONLYDISK is not set
499# CONFIG_BLK_DEV_AEC62XX is not set 501# CONFIG_BLK_DEV_AEC62XX is not set
500# CONFIG_BLK_DEV_ALI15X3 is not set 502# CONFIG_BLK_DEV_ALI15X3 is not set
@@ -525,7 +527,6 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y
525# CONFIG_IDE_ARM is not set 527# CONFIG_IDE_ARM is not set
526CONFIG_BLK_DEV_IDEDMA=y 528CONFIG_BLK_DEV_IDEDMA=y
527# CONFIG_IDEDMA_IVB is not set 529# CONFIG_IDEDMA_IVB is not set
528CONFIG_IDEDMA_AUTO=y
529# CONFIG_BLK_DEV_HD is not set 530# CONFIG_BLK_DEV_HD is not set
530 531
531# 532#
@@ -584,11 +585,9 @@ CONFIG_AIC79XX_DEBUG_MASK=0
584# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set 585# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
585# CONFIG_SCSI_AIC94XX is not set 586# CONFIG_SCSI_AIC94XX is not set
586# CONFIG_SCSI_ARCMSR is not set 587# CONFIG_SCSI_ARCMSR is not set
587CONFIG_MEGARAID_NEWGEN=y 588# CONFIG_MEGARAID_NEWGEN is not set
588CONFIG_MEGARAID_MM=y
589CONFIG_MEGARAID_MAILBOX=y
590# CONFIG_MEGARAID_LEGACY is not set 589# CONFIG_MEGARAID_LEGACY is not set
591CONFIG_MEGARAID_SAS=y 590# CONFIG_MEGARAID_SAS is not set
592# CONFIG_SCSI_HPTIOP is not set 591# CONFIG_SCSI_HPTIOP is not set
593# CONFIG_SCSI_BUSLOGIC is not set 592# CONFIG_SCSI_BUSLOGIC is not set
594# CONFIG_SCSI_DMX3191D is not set 593# CONFIG_SCSI_DMX3191D is not set
@@ -608,6 +607,7 @@ CONFIG_MEGARAID_SAS=y
608# CONFIG_SCSI_DC395x is not set 607# CONFIG_SCSI_DC395x is not set
609# CONFIG_SCSI_DC390T is not set 608# CONFIG_SCSI_DC390T is not set
610# CONFIG_SCSI_DEBUG is not set 609# CONFIG_SCSI_DEBUG is not set
610# CONFIG_SCSI_ESP_CORE is not set
611# CONFIG_SCSI_SRP is not set 611# CONFIG_SCSI_SRP is not set
612 612
613# 613#
@@ -636,6 +636,7 @@ CONFIG_SATA_ACPI=y
636# CONFIG_PATA_AMD is not set 636# CONFIG_PATA_AMD is not set
637# CONFIG_PATA_ARTOP is not set 637# CONFIG_PATA_ARTOP is not set
638# CONFIG_PATA_ATIIXP is not set 638# CONFIG_PATA_ATIIXP is not set
639# CONFIG_PATA_CMD640_PCI is not set
639# CONFIG_PATA_CMD64X is not set 640# CONFIG_PATA_CMD64X is not set
640# CONFIG_PATA_CS5520 is not set 641# CONFIG_PATA_CS5520 is not set
641# CONFIG_PATA_CS5530 is not set 642# CONFIG_PATA_CS5530 is not set
@@ -687,7 +688,7 @@ CONFIG_BLK_DEV_DM=y
687CONFIG_FUSION=y 688CONFIG_FUSION=y
688CONFIG_FUSION_SPI=y 689CONFIG_FUSION_SPI=y
689# CONFIG_FUSION_FC is not set 690# CONFIG_FUSION_FC is not set
690CONFIG_FUSION_SAS=y 691# CONFIG_FUSION_SAS is not set
691CONFIG_FUSION_MAX_SGE=128 692CONFIG_FUSION_MAX_SGE=128
692# CONFIG_FUSION_CTL is not set 693# CONFIG_FUSION_CTL is not set
693 694
@@ -700,19 +701,22 @@ CONFIG_IEEE1394=y
700# Subsystem Options 701# Subsystem Options
701# 702#
702# CONFIG_IEEE1394_VERBOSEDEBUG is not set 703# CONFIG_IEEE1394_VERBOSEDEBUG is not set
703# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
704 704
705# 705#
706# Device Drivers 706# Controllers
707#
708
709#
710# Texas Instruments PCILynx requires I2C
707# 711#
708# CONFIG_IEEE1394_PCILYNX is not set
709CONFIG_IEEE1394_OHCI1394=y 712CONFIG_IEEE1394_OHCI1394=y
710 713
711# 714#
712# Protocol Drivers 715# Protocols
713# 716#
714# CONFIG_IEEE1394_VIDEO1394 is not set 717# CONFIG_IEEE1394_VIDEO1394 is not set
715# CONFIG_IEEE1394_SBP2 is not set 718# CONFIG_IEEE1394_SBP2 is not set
719# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set
716# CONFIG_IEEE1394_ETH1394 is not set 720# CONFIG_IEEE1394_ETH1394 is not set
717# CONFIG_IEEE1394_DV1394 is not set 721# CONFIG_IEEE1394_DV1394 is not set
718CONFIG_IEEE1394_RAWIO=y 722CONFIG_IEEE1394_RAWIO=y
@@ -775,7 +779,8 @@ CONFIG_TULIP=y
775# CONFIG_HP100 is not set 779# CONFIG_HP100 is not set
776CONFIG_NET_PCI=y 780CONFIG_NET_PCI=y
777# CONFIG_PCNET32 is not set 781# CONFIG_PCNET32 is not set
778# CONFIG_AMD8111_ETH is not set 782CONFIG_AMD8111_ETH=y
783# CONFIG_AMD8111E_NAPI is not set
779# CONFIG_ADAPTEC_STARFIRE is not set 784# CONFIG_ADAPTEC_STARFIRE is not set
780CONFIG_B44=y 785CONFIG_B44=y
781CONFIG_FORCEDETH=y 786CONFIG_FORCEDETH=y
@@ -837,9 +842,10 @@ CONFIG_S2IO=m
837# CONFIG_TR is not set 842# CONFIG_TR is not set
838 843
839# 844#
840# Wireless LAN (non-hamradio) 845# Wireless LAN
841# 846#
842# CONFIG_NET_RADIO is not set 847# CONFIG_WLAN_PRE80211 is not set
848# CONFIG_WLAN_80211 is not set
843 849
844# 850#
845# Wan interfaces 851# Wan interfaces
@@ -853,7 +859,6 @@ CONFIG_S2IO=m
853# CONFIG_SHAPER is not set 859# CONFIG_SHAPER is not set
854CONFIG_NETCONSOLE=y 860CONFIG_NETCONSOLE=y
855CONFIG_NETPOLL=y 861CONFIG_NETPOLL=y
856# CONFIG_NETPOLL_RX is not set
857# CONFIG_NETPOLL_TRAP is not set 862# CONFIG_NETPOLL_TRAP is not set
858CONFIG_NET_POLL_CONTROLLER=y 863CONFIG_NET_POLL_CONTROLLER=y
859 864
@@ -987,57 +992,7 @@ CONFIG_HPET_MMAP=y
987# 992#
988# I2C support 993# I2C support
989# 994#
990CONFIG_I2C=m 995# CONFIG_I2C is not set
991CONFIG_I2C_CHARDEV=m
992
993#
994# I2C Algorithms
995#
996# CONFIG_I2C_ALGOBIT is not set
997# CONFIG_I2C_ALGOPCF is not set
998# CONFIG_I2C_ALGOPCA is not set
999
1000#
1001# I2C Hardware Bus support
1002#
1003# CONFIG_I2C_ALI1535 is not set
1004# CONFIG_I2C_ALI1563 is not set
1005# CONFIG_I2C_ALI15X3 is not set
1006# CONFIG_I2C_AMD756 is not set
1007# CONFIG_I2C_AMD8111 is not set
1008# CONFIG_I2C_I801 is not set
1009# CONFIG_I2C_I810 is not set
1010# CONFIG_I2C_PIIX4 is not set
1011CONFIG_I2C_ISA=m
1012# CONFIG_I2C_NFORCE2 is not set
1013# CONFIG_I2C_OCORES is not set
1014# CONFIG_I2C_PARPORT_LIGHT is not set
1015# CONFIG_I2C_PASEMI is not set
1016# CONFIG_I2C_PROSAVAGE is not set
1017# CONFIG_I2C_SAVAGE4 is not set
1018# CONFIG_I2C_SIS5595 is not set
1019# CONFIG_I2C_SIS630 is not set
1020# CONFIG_I2C_SIS96X is not set
1021# CONFIG_I2C_STUB is not set
1022# CONFIG_I2C_VIA is not set
1023# CONFIG_I2C_VIAPRO is not set
1024# CONFIG_I2C_VOODOO3 is not set
1025# CONFIG_I2C_PCA_ISA is not set
1026
1027#
1028# Miscellaneous I2C Chip support
1029#
1030# CONFIG_SENSORS_DS1337 is not set
1031# CONFIG_SENSORS_DS1374 is not set
1032# CONFIG_SENSORS_EEPROM is not set
1033# CONFIG_SENSORS_PCF8574 is not set
1034# CONFIG_SENSORS_PCA9539 is not set
1035# CONFIG_SENSORS_PCF8591 is not set
1036# CONFIG_SENSORS_MAX6875 is not set
1037# CONFIG_I2C_DEBUG_CORE is not set
1038# CONFIG_I2C_DEBUG_ALGO is not set
1039# CONFIG_I2C_DEBUG_BUS is not set
1040# CONFIG_I2C_DEBUG_CHIP is not set
1041 996
1042# 997#
1043# SPI support 998# SPI support
@@ -1053,54 +1008,8 @@ CONFIG_I2C_ISA=m
1053# 1008#
1054# Hardware Monitoring support 1009# Hardware Monitoring support
1055# 1010#
1056CONFIG_HWMON=y 1011# CONFIG_HWMON is not set
1057# CONFIG_HWMON_VID is not set 1012# CONFIG_HWMON_VID is not set
1058# CONFIG_SENSORS_ABITUGURU is not set
1059# CONFIG_SENSORS_ADM1021 is not set
1060# CONFIG_SENSORS_ADM1025 is not set
1061# CONFIG_SENSORS_ADM1026 is not set
1062# CONFIG_SENSORS_ADM1029 is not set
1063# CONFIG_SENSORS_ADM1031 is not set
1064# CONFIG_SENSORS_ADM9240 is not set
1065# CONFIG_SENSORS_K8TEMP is not set
1066# CONFIG_SENSORS_ASB100 is not set
1067# CONFIG_SENSORS_ATXP1 is not set
1068# CONFIG_SENSORS_DS1621 is not set
1069# CONFIG_SENSORS_F71805F is not set
1070# CONFIG_SENSORS_FSCHER is not set
1071# CONFIG_SENSORS_FSCPOS is not set
1072# CONFIG_SENSORS_GL518SM is not set
1073# CONFIG_SENSORS_GL520SM is not set
1074# CONFIG_SENSORS_IT87 is not set
1075# CONFIG_SENSORS_LM63 is not set
1076# CONFIG_SENSORS_LM75 is not set
1077# CONFIG_SENSORS_LM77 is not set
1078# CONFIG_SENSORS_LM78 is not set
1079# CONFIG_SENSORS_LM80 is not set
1080# CONFIG_SENSORS_LM83 is not set
1081# CONFIG_SENSORS_LM85 is not set
1082# CONFIG_SENSORS_LM87 is not set
1083# CONFIG_SENSORS_LM90 is not set
1084# CONFIG_SENSORS_LM92 is not set
1085# CONFIG_SENSORS_MAX1619 is not set
1086# CONFIG_SENSORS_PC87360 is not set
1087# CONFIG_SENSORS_PC87427 is not set
1088# CONFIG_SENSORS_SIS5595 is not set
1089# CONFIG_SENSORS_SMSC47M1 is not set
1090# CONFIG_SENSORS_SMSC47M192 is not set
1091CONFIG_SENSORS_SMSC47B397=m
1092# CONFIG_SENSORS_VIA686A is not set
1093# CONFIG_SENSORS_VT1211 is not set
1094# CONFIG_SENSORS_VT8231 is not set
1095# CONFIG_SENSORS_W83781D is not set
1096# CONFIG_SENSORS_W83791D is not set
1097# CONFIG_SENSORS_W83792D is not set
1098# CONFIG_SENSORS_W83793 is not set
1099# CONFIG_SENSORS_W83L785TS is not set
1100# CONFIG_SENSORS_W83627HF is not set
1101# CONFIG_SENSORS_W83627EHF is not set
1102# CONFIG_SENSORS_HDAPS is not set
1103# CONFIG_HWMON_DEBUG_CHIP is not set
1104 1013
1105# 1014#
1106# Multifunction device drivers 1015# Multifunction device drivers
@@ -1147,8 +1056,9 @@ CONFIG_SOUND=y
1147# Open Sound System 1056# Open Sound System
1148# 1057#
1149CONFIG_SOUND_PRIME=y 1058CONFIG_SOUND_PRIME=y
1150# CONFIG_OBSOLETE_OSS is not set 1059CONFIG_OBSOLETE_OSS=y
1151# CONFIG_SOUND_BT878 is not set 1060# CONFIG_SOUND_BT878 is not set
1061# CONFIG_SOUND_ES1371 is not set
1152CONFIG_SOUND_ICH=y 1062CONFIG_SOUND_ICH=y
1153# CONFIG_SOUND_TRIDENT is not set 1063# CONFIG_SOUND_TRIDENT is not set
1154# CONFIG_SOUND_MSNDCLAS is not set 1064# CONFIG_SOUND_MSNDCLAS is not set
@@ -1163,6 +1073,14 @@ CONFIG_HID=y
1163# CONFIG_HID_DEBUG is not set 1073# CONFIG_HID_DEBUG is not set
1164 1074
1165# 1075#
1076# USB Input Devices
1077#
1078CONFIG_USB_HID=y
1079# CONFIG_USB_HIDINPUT_POWERBOOK is not set
1080# CONFIG_HID_FF is not set
1081# CONFIG_USB_HIDDEV is not set
1082
1083#
1166# USB support 1084# USB support
1167# 1085#
1168CONFIG_USB_ARCH_HAS_HCD=y 1086CONFIG_USB_ARCH_HAS_HCD=y
@@ -1175,6 +1093,7 @@ CONFIG_USB=y
1175# Miscellaneous USB options 1093# Miscellaneous USB options
1176# 1094#
1177CONFIG_USB_DEVICEFS=y 1095CONFIG_USB_DEVICEFS=y
1096# CONFIG_USB_DEVICE_CLASS is not set
1178# CONFIG_USB_DYNAMIC_MINORS is not set 1097# CONFIG_USB_DYNAMIC_MINORS is not set
1179# CONFIG_USB_SUSPEND is not set 1098# CONFIG_USB_SUSPEND is not set
1180# CONFIG_USB_OTG is not set 1099# CONFIG_USB_OTG is not set
@@ -1225,10 +1144,6 @@ CONFIG_USB_STORAGE=y
1225# 1144#
1226# USB Input Devices 1145# USB Input Devices
1227# 1146#
1228CONFIG_USB_HID=y
1229# CONFIG_USB_HIDINPUT_POWERBOOK is not set
1230# CONFIG_HID_FF is not set
1231# CONFIG_USB_HIDDEV is not set
1232# CONFIG_USB_AIPTEK is not set 1147# CONFIG_USB_AIPTEK is not set
1233# CONFIG_USB_WACOM is not set 1148# CONFIG_USB_WACOM is not set
1234# CONFIG_USB_ACECAD is not set 1149# CONFIG_USB_ACECAD is not set
@@ -1556,7 +1471,7 @@ CONFIG_DEBUG_KERNEL=y
1556CONFIG_LOG_BUF_SHIFT=18 1471CONFIG_LOG_BUF_SHIFT=18
1557CONFIG_DETECT_SOFTLOCKUP=y 1472CONFIG_DETECT_SOFTLOCKUP=y
1558# CONFIG_SCHEDSTATS is not set 1473# CONFIG_SCHEDSTATS is not set
1559# CONFIG_TIMER_STATS is not set 1474CONFIG_TIMER_STATS=y
1560# CONFIG_DEBUG_SLAB is not set 1475# CONFIG_DEBUG_SLAB is not set
1561# CONFIG_DEBUG_RT_MUTEXES is not set 1476# CONFIG_DEBUG_RT_MUTEXES is not set
1562# CONFIG_RT_MUTEX_TESTER is not set 1477# CONFIG_RT_MUTEX_TESTER is not set
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 071100ea125..185399baaf6 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -5,6 +5,11 @@
5 * This tricks binfmt_elf.c into loading 32bit binaries using lots 5 * This tricks binfmt_elf.c into loading 32bit binaries using lots
6 * of ugly preprocessor tricks. Talk about very very poor man's inheritance. 6 * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
7 */ 7 */
8#define __ASM_X86_64_ELF_H 1
9
10#undef ELF_CLASS
11#define ELF_CLASS ELFCLASS32
12
8#include <linux/types.h> 13#include <linux/types.h>
9#include <linux/stddef.h> 14#include <linux/stddef.h>
10#include <linux/rwsem.h> 15#include <linux/rwsem.h>
@@ -50,9 +55,6 @@ struct elf_phdr;
50#undef ELF_ARCH 55#undef ELF_ARCH
51#define ELF_ARCH EM_386 56#define ELF_ARCH EM_386
52 57
53#undef ELF_CLASS
54#define ELF_CLASS ELFCLASS32
55
56#define ELF_DATA ELFDATA2LSB 58#define ELF_DATA ELFDATA2LSB
57 59
58#define USE_ELF_CORE_DUMP 1 60#define USE_ELF_CORE_DUMP 1
@@ -136,7 +138,7 @@ struct elf_prpsinfo
136 138
137#define user user32 139#define user user32
138 140
139#define __ASM_X86_64_ELF_H 1 141#undef elf_read_implies_exec
140#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) 142#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X)
141//#include <asm/ia32.h> 143//#include <asm/ia32.h>
142#include <linux/elf.h> 144#include <linux/elf.h>
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 796df6992f6..c48087db6f7 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -481,11 +481,7 @@ ia32_sys_call_table:
481 .quad sys_symlink 481 .quad sys_symlink
482 .quad sys_lstat 482 .quad sys_lstat
483 .quad sys_readlink /* 85 */ 483 .quad sys_readlink /* 85 */
484#ifdef CONFIG_IA32_AOUT
485 .quad sys_uselib 484 .quad sys_uselib
486#else
487 .quad quiet_ni_syscall
488#endif
489 .quad sys_swapon 485 .quad sys_swapon
490 .quad sys_reboot 486 .quad sys_reboot
491 .quad compat_sys_old_readdir 487 .quad compat_sys_old_readdir
diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c
index 568ff0df89e..fc4419ff035 100644
--- a/arch/x86_64/ia32/syscall32.c
+++ b/arch/x86_64/ia32/syscall32.c
@@ -13,6 +13,7 @@
13#include <asm/proto.h> 13#include <asm/proto.h>
14#include <asm/tlbflush.h> 14#include <asm/tlbflush.h>
15#include <asm/ia32_unistd.h> 15#include <asm/ia32_unistd.h>
16#include <asm/vsyscall32.h>
16 17
17extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; 18extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
18extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; 19extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index bb47e86f3d0..4d94c51803d 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,8 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
9 x8664_ksyms.o i387.o syscall.o vsyscall.o \ 9 x8664_ksyms.o i387.o syscall.o vsyscall.o \
10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ 10 setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
11 pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o 11 pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \
12 perfctr-watchdog.o
12 13
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 14obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o 15obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
@@ -21,8 +22,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o 22obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o 23obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o
23obj-y += apic.o nmi.o 24obj-y += apic.o nmi.o
24obj-y += io_apic.o mpparse.o \ 25obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o
25 genapic.o genapic_cluster.o genapic_flat.o
26obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o 26obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
27obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 27obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
28obj-$(CONFIG_PM) += suspend.o 28obj-$(CONFIG_PM) += suspend.o
@@ -58,3 +58,4 @@ i8237-y += ../../i386/kernel/i8237.o
58msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o 58msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
59alternative-y += ../../i386/kernel/alternative.o 59alternative-y += ../../i386/kernel/alternative.o
60pcspeaker-y += ../../i386/kernel/pcspeaker.o 60pcspeaker-y += ../../i386/kernel/pcspeaker.o
61perfctr-watchdog-y += ../../i386/kernel/cpu/perfctr-watchdog.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index e1548fbe95a..195b7034a14 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -60,19 +60,6 @@ extern char wakeup_start, wakeup_end;
60 60
61extern unsigned long acpi_copy_wakeup_routine(unsigned long); 61extern unsigned long acpi_copy_wakeup_routine(unsigned long);
62 62
63static pgd_t low_ptr;
64
65static void init_low_mapping(void)
66{
67 pgd_t *slot0 = pgd_offset(current->mm, 0UL);
68 low_ptr = *slot0;
69 /* FIXME: We're playing with the current task's page tables here, which
70 * is potentially dangerous on SMP systems.
71 */
72 set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
73 local_flush_tlb();
74}
75
76/** 63/**
77 * acpi_save_state_mem - save kernel state 64 * acpi_save_state_mem - save kernel state
78 * 65 *
@@ -81,8 +68,6 @@ static void init_low_mapping(void)
81 */ 68 */
82int acpi_save_state_mem(void) 69int acpi_save_state_mem(void)
83{ 70{
84 init_low_mapping();
85
86 memcpy((void *)acpi_wakeup_address, &wakeup_start, 71 memcpy((void *)acpi_wakeup_address, &wakeup_start,
87 &wakeup_end - &wakeup_start); 72 &wakeup_end - &wakeup_start);
88 acpi_copy_wakeup_routine(acpi_wakeup_address); 73 acpi_copy_wakeup_routine(acpi_wakeup_address);
@@ -95,8 +80,6 @@ int acpi_save_state_mem(void)
95 */ 80 */
96void acpi_restore_state_mem(void) 81void acpi_restore_state_mem(void)
97{ 82{
98 set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
99 local_flush_tlb();
100} 83}
101 84
102/** 85/**
@@ -109,10 +92,11 @@ void acpi_restore_state_mem(void)
109 */ 92 */
110void __init acpi_reserve_bootmem(void) 93void __init acpi_reserve_bootmem(void)
111{ 94{
112 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); 95 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
113 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) 96 if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
114 printk(KERN_CRIT 97 printk(KERN_CRIT
115 "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); 98 "ACPI: Wakeup code way too big, will crash on attempt"
99 " to suspend\n");
116} 100}
117 101
118static int __init acpi_sleep_setup(char *str) 102static int __init acpi_sleep_setup(char *str)
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index 185faa911db..8550a6ffa27 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -1,6 +1,7 @@
1.text 1.text
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/pgtable.h>
4#include <asm/page.h> 5#include <asm/page.h>
5#include <asm/msr.h> 6#include <asm/msr.h>
6 7
@@ -30,22 +31,28 @@ wakeup_code:
30 cld 31 cld
31 # setup data segment 32 # setup data segment
32 movw %cs, %ax 33 movw %cs, %ax
33 movw %ax, %ds # Make ds:0 point to wakeup_start 34 movw %ax, %ds # Make ds:0 point to wakeup_start
34 movw %ax, %ss 35 movw %ax, %ss
35 mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board 36 # Private stack is needed for ASUS board
37 mov $(wakeup_stack - wakeup_code), %sp
36 38
37 pushl $0 # Kill any dangerous flags 39 pushl $0 # Kill any dangerous flags
38 popfl 40 popfl
39 41
40 movl real_magic - wakeup_code, %eax 42 movl real_magic - wakeup_code, %eax
41 cmpl $0x12345678, %eax 43 cmpl $0x12345678, %eax
42 jne bogus_real_magic 44 jne bogus_real_magic
43 45
46 call verify_cpu # Verify the cpu supports long
47 # mode
48 testl %eax, %eax
49 jnz no_longmode
50
44 testl $1, video_flags - wakeup_code 51 testl $1, video_flags - wakeup_code
45 jz 1f 52 jz 1f
46 lcall $0xc000,$3 53 lcall $0xc000,$3
47 movw %cs, %ax 54 movw %cs, %ax
48 movw %ax, %ds # Bios might have played with that 55 movw %ax, %ds # Bios might have played with that
49 movw %ax, %ss 56 movw %ax, %ss
501: 571:
51 58
@@ -61,12 +68,15 @@ wakeup_code:
61 68
62 movb $0xa2, %al ; outb %al, $0x80 69 movb $0xa2, %al ; outb %al, $0x80
63 70
64 lidt %ds:idt_48a - wakeup_code 71 mov %ds, %ax # Find 32bit wakeup_code addr
65 xorl %eax, %eax 72 movzx %ax, %esi # (Convert %ds:gdt to a linear ptr)
66 movw %ds, %ax # (Convert %ds:gdt to a linear ptr) 73 shll $4, %esi
67 shll $4, %eax 74 # Fix up the vectors
68 addl $(gdta - wakeup_code), %eax 75 addl %esi, wakeup_32_vector - wakeup_code
69 movl %eax, gdt_48a +2 - wakeup_code 76 addl %esi, wakeup_long64_vector - wakeup_code
77 addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer
78
79 lidtl %ds:idt_48a - wakeup_code
70 lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is 80 lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
71 # appropriate 81 # appropriate
72 82
@@ -75,86 +85,63 @@ wakeup_code:
75 jmp 1f 85 jmp 1f
761: 861:
77 87
78 .byte 0x66, 0xea # prefix + jmpi-opcode 88 ljmpl *(wakeup_32_vector - wakeup_code)
79 .long wakeup_32 - __START_KERNEL_map 89
80 .word __KERNEL_CS 90 .balign 4
91wakeup_32_vector:
92 .long wakeup_32 - wakeup_code
93 .word __KERNEL32_CS, 0
81 94
82 .code32 95 .code32
83wakeup_32: 96wakeup_32:
84# Running in this code, but at low address; paging is not yet turned on. 97# Running in this code, but at low address; paging is not yet turned on.
85 movb $0xa5, %al ; outb %al, $0x80 98 movb $0xa5, %al ; outb %al, $0x80
86 99
87 /* Check if extended functions are implemented */ 100 movl $__KERNEL_DS, %eax
88 movl $0x80000000, %eax 101 movl %eax, %ds
89 cpuid
90 cmpl $0x80000000, %eax
91 jbe bogus_cpu
92 wbinvd
93 mov $0x80000001, %eax
94 cpuid
95 btl $29, %edx
96 jnc bogus_cpu
97 movl %edx,%edi
98
99 movw $__KERNEL_DS, %ax
100 movw %ax, %ds
101 movw %ax, %es
102 movw %ax, %fs
103 movw %ax, %gs
104
105 movw $__KERNEL_DS, %ax
106 movw %ax, %ss
107 102
108 mov $(wakeup_stack - __START_KERNEL_map), %esp 103 movw $0x0e00 + 'i', %ds:(0xb8012)
109 movl saved_magic - __START_KERNEL_map, %eax 104 movb $0xa8, %al ; outb %al, $0x80;
110 cmpl $0x9abcdef0, %eax
111 jne bogus_32_magic
112 105
113 /* 106 /*
114 * Prepare for entering 64bits mode 107 * Prepare for entering 64bits mode
115 */ 108 */
116 109
117 /* Enable PAE mode and PGE */ 110 /* Enable PAE */
118 xorl %eax, %eax 111 xorl %eax, %eax
119 btsl $5, %eax 112 btsl $5, %eax
120 btsl $7, %eax
121 movl %eax, %cr4 113 movl %eax, %cr4
122 114
123 /* Setup early boot stage 4 level pagetables */ 115 /* Setup early boot stage 4 level pagetables */
124 movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax 116 leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax
125 movl %eax, %cr3 117 movl %eax, %cr3
126 118
127 /* Setup EFER (Extended Feature Enable Register) */ 119 /* Check if nx is implemented */
128 movl $MSR_EFER, %ecx 120 movl $0x80000001, %eax
129 rdmsr 121 cpuid
130 /* Fool rdmsr and reset %eax to avoid dependences */ 122 movl %edx,%edi
131 xorl %eax, %eax 123
132 /* Enable Long Mode */ 124 /* Enable Long Mode */
125 xorl %eax, %eax
133 btsl $_EFER_LME, %eax 126 btsl $_EFER_LME, %eax
134 /* Enable System Call */
135 btsl $_EFER_SCE, %eax
136 127
137 /* No Execute supported? */ 128 /* No Execute supported? */
138 btl $20,%edi 129 btl $20,%edi
139 jnc 1f 130 jnc 1f
140 btsl $_EFER_NX, %eax 131 btsl $_EFER_NX, %eax
1411:
142 132
143 /* Make changes effective */ 133 /* Make changes effective */
1341: movl $MSR_EFER, %ecx
135 xorl %edx, %edx
144 wrmsr 136 wrmsr
145 wbinvd
146 137
147 xorl %eax, %eax 138 xorl %eax, %eax
148 btsl $31, %eax /* Enable paging and in turn activate Long Mode */ 139 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
149 btsl $0, %eax /* Enable protected mode */ 140 btsl $0, %eax /* Enable protected mode */
150 btsl $1, %eax /* Enable MP */
151 btsl $4, %eax /* Enable ET */
152 btsl $5, %eax /* Enable NE */
153 btsl $16, %eax /* Enable WP */
154 btsl $18, %eax /* Enable AM */
155 141
156 /* Make changes effective */ 142 /* Make changes effective */
157 movl %eax, %cr0 143 movl %eax, %cr0
144
158 /* At this point: 145 /* At this point:
159 CR4.PAE must be 1 146 CR4.PAE must be 1
160 CS.L must be 0 147 CS.L must be 0
@@ -162,11 +149,6 @@ wakeup_32:
162 Next instruction must be a branch 149 Next instruction must be a branch
163 This must be on identity-mapped page 150 This must be on identity-mapped page
164 */ 151 */
165 jmp reach_compatibility_mode
166reach_compatibility_mode:
167 movw $0x0e00 + 'i', %ds:(0xb8012)
168 movb $0xa8, %al ; outb %al, $0x80;
169
170 /* 152 /*
171 * At this point we're in long mode but in 32bit compatibility mode 153 * At this point we're in long mode but in 32bit compatibility mode
172 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn 154 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
@@ -174,24 +156,19 @@ reach_compatibility_mode:
174 * the new gdt/idt that has __KERNEL_CS with CS.L = 1. 156 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
175 */ 157 */
176 158
177 movw $0x0e00 + 'n', %ds:(0xb8014)
178 movb $0xa9, %al ; outb %al, $0x80
179
180 /* Load new GDT with the 64bit segment using 32bit descriptor */
181 movl $(pGDT32 - __START_KERNEL_map), %eax
182 lgdt (%eax)
183
184 movl $(wakeup_jumpvector - __START_KERNEL_map), %eax
185 /* Finally jump in 64bit mode */ 159 /* Finally jump in 64bit mode */
186 ljmp *(%eax) 160 ljmp *(wakeup_long64_vector - wakeup_code)(%esi)
187 161
188wakeup_jumpvector: 162 .balign 4
189 .long wakeup_long64 - __START_KERNEL_map 163wakeup_long64_vector:
190 .word __KERNEL_CS 164 .long wakeup_long64 - wakeup_code
165 .word __KERNEL_CS, 0
191 166
192.code64 167.code64
193 168
194 /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ 169 /* Hooray, we are in Long 64-bit mode (but still running in
170 * low memory)
171 */
195wakeup_long64: 172wakeup_long64:
196 /* 173 /*
197 * We must switch to a new descriptor in kernel space for the GDT 174 * We must switch to a new descriptor in kernel space for the GDT
@@ -199,7 +176,15 @@ wakeup_long64:
199 * addresses where we're currently running on. We have to do that here 176 * addresses where we're currently running on. We have to do that here
200 * because in 32bit we couldn't load a 64bit linear address. 177 * because in 32bit we couldn't load a 64bit linear address.
201 */ 178 */
202 lgdt cpu_gdt_descr - __START_KERNEL_map 179 lgdt cpu_gdt_descr
180
181 movw $0x0e00 + 'n', %ds:(0xb8014)
182 movb $0xa9, %al ; outb %al, $0x80
183
184 movq saved_magic, %rax
185 movq $0x123456789abcdef0, %rdx
186 cmpq %rdx, %rax
187 jne bogus_64_magic
203 188
204 movw $0x0e00 + 'u', %ds:(0xb8016) 189 movw $0x0e00 + 'u', %ds:(0xb8016)
205 190
@@ -211,75 +196,58 @@ wakeup_long64:
211 movw %ax, %es 196 movw %ax, %es
212 movw %ax, %fs 197 movw %ax, %fs
213 movw %ax, %gs 198 movw %ax, %gs
214 movq saved_esp, %rsp 199 movq saved_rsp, %rsp
215 200
216 movw $0x0e00 + 'x', %ds:(0xb8018) 201 movw $0x0e00 + 'x', %ds:(0xb8018)
217 movq saved_ebx, %rbx 202 movq saved_rbx, %rbx
218 movq saved_edi, %rdi 203 movq saved_rdi, %rdi
219 movq saved_esi, %rsi 204 movq saved_rsi, %rsi
220 movq saved_ebp, %rbp 205 movq saved_rbp, %rbp
221 206
222 movw $0x0e00 + '!', %ds:(0xb801a) 207 movw $0x0e00 + '!', %ds:(0xb801a)
223 movq saved_eip, %rax 208 movq saved_rip, %rax
224 jmp *%rax 209 jmp *%rax
225 210
226.code32 211.code32
227 212
228 .align 64 213 .align 64
229gdta: 214gdta:
215 /* Its good to keep gdt in sync with one in trampoline.S */
230 .word 0, 0, 0, 0 # dummy 216 .word 0, 0, 0, 0 # dummy
231 217 /* ??? Why I need the accessed bit set in order for this to work? */
232 .word 0, 0, 0, 0 # unused 218 .quad 0x00cf9b000000ffff # __KERNEL32_CS
233 219 .quad 0x00af9b000000ffff # __KERNEL_CS
234 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) 220 .quad 0x00cf93000000ffff # __KERNEL_DS
235 .word 0 # base address = 0
236 .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?)
237 .word 0x00CF # granularity = 4096, 386
238 # (+5th nibble of limit)
239
240 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
241 .word 0 # base address = 0
242 .word 0x9200 # data read/write
243 .word 0x00CF # granularity = 4096, 386
244 # (+5th nibble of limit)
245# this is 64bit descriptor for code
246 .word 0xFFFF
247 .word 0
248 .word 0x9A00 # code read/exec
249 .word 0x00AF # as above, but it is long mode and with D=0
250 221
251idt_48a: 222idt_48a:
252 .word 0 # idt limit = 0 223 .word 0 # idt limit = 0
253 .word 0, 0 # idt base = 0L 224 .word 0, 0 # idt base = 0L
254 225
255gdt_48a: 226gdt_48a:
256 .word 0x8000 # gdt limit=2048, 227 .word 0x800 # gdt limit=2048,
257 # 256 GDT entries 228 # 256 GDT entries
258 .word 0, 0 # gdt base (filled in later) 229 .long gdta - wakeup_code # gdt base (relocated in later)
259
260 230
261real_save_gdt: .word 0
262 .quad 0
263real_magic: .quad 0 231real_magic: .quad 0
264video_mode: .quad 0 232video_mode: .quad 0
265video_flags: .quad 0 233video_flags: .quad 0
266 234
235.code16
267bogus_real_magic: 236bogus_real_magic:
268 movb $0xba,%al ; outb %al,$0x80 237 movb $0xba,%al ; outb %al,$0x80
269 jmp bogus_real_magic 238 jmp bogus_real_magic
270 239
271bogus_32_magic: 240.code64
241bogus_64_magic:
272 movb $0xb3,%al ; outb %al,$0x80 242 movb $0xb3,%al ; outb %al,$0x80
273 jmp bogus_32_magic 243 jmp bogus_64_magic
274 244
275bogus_31_magic: 245.code16
276 movb $0xb1,%al ; outb %al,$0x80 246no_longmode:
277 jmp bogus_31_magic 247 movb $0xbc,%al ; outb %al,$0x80
278 248 jmp no_longmode
279bogus_cpu:
280 movb $0xbc,%al ; outb %al,$0x80
281 jmp bogus_cpu
282 249
250#include "../verify_cpu.S"
283 251
284/* This code uses an extended set of video mode numbers. These include: 252/* This code uses an extended set of video mode numbers. These include:
285 * Aliases for standard modes 253 * Aliases for standard modes
@@ -301,6 +269,7 @@ bogus_cpu:
301#define VIDEO_FIRST_V7 0x0900 269#define VIDEO_FIRST_V7 0x0900
302 270
303# Setting of user mode (AX=mode ID) => CF=success 271# Setting of user mode (AX=mode ID) => CF=success
272.code16
304mode_seta: 273mode_seta:
305 movw %ax, %bx 274 movw %ax, %bx
306#if 0 275#if 0
@@ -346,21 +315,18 @@ check_vesaa:
346 315
347_setbada: jmp setbada 316_setbada: jmp setbada
348 317
349 .code64
350bogus_magic:
351 movw $0x0e00 + 'B', %ds:(0xb8018)
352 jmp bogus_magic
353
354bogus_magic2:
355 movw $0x0e00 + '2', %ds:(0xb8018)
356 jmp bogus_magic2
357
358
359wakeup_stack_begin: # Stack grows down 318wakeup_stack_begin: # Stack grows down
360 319
361.org 0xff0 320.org 0xff0
362wakeup_stack: # Just below end of page 321wakeup_stack: # Just below end of page
363 322
323.org 0x1000
324ENTRY(wakeup_level4_pgt)
325 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
326 .fill 510,8,0
327 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
328 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
329
364ENTRY(wakeup_end) 330ENTRY(wakeup_end)
365 331
366## 332##
@@ -373,28 +339,11 @@ ENTRY(wakeup_end)
373# 339#
374# Returned address is location of code in low memory (past data and stack) 340# Returned address is location of code in low memory (past data and stack)
375# 341#
342 .code64
376ENTRY(acpi_copy_wakeup_routine) 343ENTRY(acpi_copy_wakeup_routine)
377 pushq %rax 344 pushq %rax
378 pushq %rcx
379 pushq %rdx 345 pushq %rdx
380 346
381 sgdt saved_gdt
382 sidt saved_idt
383 sldt saved_ldt
384 str saved_tss
385
386 movq %cr3, %rdx
387 movq %rdx, saved_cr3
388 movq %cr4, %rdx
389 movq %rdx, saved_cr4
390 movq %cr0, %rdx
391 movq %rdx, saved_cr0
392 sgdt real_save_gdt - wakeup_start (,%rdi)
393 movl $MSR_EFER, %ecx
394 rdmsr
395 movl %eax, saved_efer
396 movl %edx, saved_efer2
397
398 movl saved_video_mode, %edx 347 movl saved_video_mode, %edx
399 movl %edx, video_mode - wakeup_start (,%rdi) 348 movl %edx, video_mode - wakeup_start (,%rdi)
400 movl acpi_video_flags, %edx 349 movl acpi_video_flags, %edx
@@ -403,21 +352,13 @@ ENTRY(acpi_copy_wakeup_routine)
403 movq $0x123456789abcdef0, %rdx 352 movq $0x123456789abcdef0, %rdx
404 movq %rdx, saved_magic 353 movq %rdx, saved_magic
405 354
406 movl saved_magic - __START_KERNEL_map, %eax 355 movq saved_magic, %rax
407 cmpl $0x9abcdef0, %eax 356 movq $0x123456789abcdef0, %rdx
408 jne bogus_32_magic 357 cmpq %rdx, %rax
409 358 jne bogus_64_magic
410 # make sure %cr4 is set correctly (features, etc)
411 movl saved_cr4 - __START_KERNEL_map, %eax
412 movq %rax, %cr4
413 359
414 movl saved_cr0 - __START_KERNEL_map, %eax
415 movq %rax, %cr0
416 jmp 1f # Flush pipelines
4171:
418 # restore the regs we used 360 # restore the regs we used
419 popq %rdx 361 popq %rdx
420 popq %rcx
421 popq %rax 362 popq %rax
422ENTRY(do_suspend_lowlevel_s4bios) 363ENTRY(do_suspend_lowlevel_s4bios)
423 ret 364 ret
@@ -450,13 +391,13 @@ do_suspend_lowlevel:
450 movq %r15, saved_context_r15(%rip) 391 movq %r15, saved_context_r15(%rip)
451 pushfq ; popq saved_context_eflags(%rip) 392 pushfq ; popq saved_context_eflags(%rip)
452 393
453 movq $.L97, saved_eip(%rip) 394 movq $.L97, saved_rip(%rip)
454 395
455 movq %rsp,saved_esp 396 movq %rsp,saved_rsp
456 movq %rbp,saved_ebp 397 movq %rbp,saved_rbp
457 movq %rbx,saved_ebx 398 movq %rbx,saved_rbx
458 movq %rdi,saved_edi 399 movq %rdi,saved_rdi
459 movq %rsi,saved_esi 400 movq %rsi,saved_rsi
460 401
461 addq $8, %rsp 402 addq $8, %rsp
462 movl $3, %edi 403 movl $3, %edi
@@ -503,25 +444,12 @@ do_suspend_lowlevel:
503 444
504.data 445.data
505ALIGN 446ALIGN
506ENTRY(saved_ebp) .quad 0 447ENTRY(saved_rbp) .quad 0
507ENTRY(saved_esi) .quad 0 448ENTRY(saved_rsi) .quad 0
508ENTRY(saved_edi) .quad 0 449ENTRY(saved_rdi) .quad 0
509ENTRY(saved_ebx) .quad 0 450ENTRY(saved_rbx) .quad 0
510 451
511ENTRY(saved_eip) .quad 0 452ENTRY(saved_rip) .quad 0
512ENTRY(saved_esp) .quad 0 453ENTRY(saved_rsp) .quad 0
513 454
514ENTRY(saved_magic) .quad 0 455ENTRY(saved_magic) .quad 0
515
516ALIGN
517# saved registers
518saved_gdt: .quad 0,0
519saved_idt: .quad 0,0
520saved_ldt: .quad 0
521saved_tss: .quad 0
522
523saved_cr0: .quad 0
524saved_cr3: .quad 0
525saved_cr4: .quad 0
526saved_efer: .quad 0
527saved_efer2: .quad 0
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index b487396c4c5..a52af582059 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -51,7 +51,6 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
51 51
52static u32 __init allocate_aperture(void) 52static u32 __init allocate_aperture(void)
53{ 53{
54 pg_data_t *nd0 = NODE_DATA(0);
55 u32 aper_size; 54 u32 aper_size;
56 void *p; 55 void *p;
57 56
@@ -65,12 +64,12 @@ static u32 __init allocate_aperture(void)
65 * Unfortunately we cannot move it up because that would make the 64 * Unfortunately we cannot move it up because that would make the
66 * IOMMU useless. 65 * IOMMU useless.
67 */ 66 */
68 p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); 67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
69 if (!p || __pa(p)+aper_size > 0xffffffff) { 68 if (!p || __pa(p)+aper_size > 0xffffffff) {
70 printk("Cannot allocate aperture memory hole (%p,%uK)\n", 69 printk("Cannot allocate aperture memory hole (%p,%uK)\n",
71 p, aper_size>>10); 70 p, aper_size>>10);
72 if (p) 71 if (p)
73 free_bootmem_node(nd0, __pa(p), aper_size); 72 free_bootmem(__pa(p), aper_size);
74 return 0; 73 return 0;
75 } 74 }
76 printk("Mapping aperture over %d KB of RAM @ %lx\n", 75 printk("Mapping aperture over %d KB of RAM @ %lx\n",
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index bd3e45d47c3..d198f7d82e5 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -68,6 +68,28 @@ int using_apic_timer __read_mostly = 0;
68 68
69static void apic_pm_activate(void); 69static void apic_pm_activate(void);
70 70
71void apic_wait_icr_idle(void)
72{
73 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
74 cpu_relax();
75}
76
77unsigned int safe_apic_wait_icr_idle(void)
78{
79 unsigned int send_status;
80 int timeout;
81
82 timeout = 0;
83 do {
84 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
85 if (!send_status)
86 break;
87 udelay(100);
88 } while (timeout++ < 1000);
89
90 return send_status;
91}
92
71void enable_NMI_through_LVT0 (void * dummy) 93void enable_NMI_through_LVT0 (void * dummy)
72{ 94{
73 unsigned int v; 95 unsigned int v;
@@ -817,14 +839,15 @@ static void setup_APIC_timer(unsigned int clocks)
817 839
818static int __init calibrate_APIC_clock(void) 840static int __init calibrate_APIC_clock(void)
819{ 841{
820 int apic, apic_start, tsc, tsc_start; 842 unsigned apic, apic_start;
843 unsigned long tsc, tsc_start;
821 int result; 844 int result;
822 /* 845 /*
823 * Put whatever arbitrary (but long enough) timeout 846 * Put whatever arbitrary (but long enough) timeout
824 * value into the APIC clock, we just want to get the 847 * value into the APIC clock, we just want to get the
825 * counter running for calibration. 848 * counter running for calibration.
826 */ 849 */
827 __setup_APIC_LVTT(1000000000); 850 __setup_APIC_LVTT(4000000000);
828 851
829 apic_start = apic_read(APIC_TMCCT); 852 apic_start = apic_read(APIC_TMCCT);
830#ifdef CONFIG_X86_PM_TIMER 853#ifdef CONFIG_X86_PM_TIMER
@@ -835,15 +858,15 @@ static int __init calibrate_APIC_clock(void)
835 } else 858 } else
836#endif 859#endif
837 { 860 {
838 rdtscl(tsc_start); 861 rdtscll(tsc_start);
839 862
840 do { 863 do {
841 apic = apic_read(APIC_TMCCT); 864 apic = apic_read(APIC_TMCCT);
842 rdtscl(tsc); 865 rdtscll(tsc);
843 } while ((tsc - tsc_start) < TICK_COUNT && 866 } while ((tsc - tsc_start) < TICK_COUNT &&
844 (apic - apic_start) < TICK_COUNT); 867 (apic_start - apic) < TICK_COUNT);
845 868
846 result = (apic_start - apic) * 1000L * cpu_khz / 869 result = (apic_start - apic) * 1000L * tsc_khz /
847 (tsc - tsc_start); 870 (tsc - tsc_start);
848 } 871 }
849 printk("result %d\n", result); 872 printk("result %d\n", result);
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index 96687e2beb2..778953bc636 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -21,6 +21,14 @@
21 21
22#define BLANK() asm volatile("\n->" : : ) 22#define BLANK() asm volatile("\n->" : : )
23 23
24#define __NO_STUBS 1
25#undef __SYSCALL
26#undef _ASM_X86_64_UNISTD_H_
27#define __SYSCALL(nr, sym) [nr] = 1,
28static char syscalls[] = {
29#include <asm/unistd.h>
30};
31
24int main(void) 32int main(void)
25{ 33{
26#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) 34#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
@@ -71,5 +79,7 @@ int main(void)
71 DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); 79 DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
72 BLANK(); 80 BLANK();
73 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); 81 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
82 BLANK();
83 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
74 return 0; 84 return 0;
75} 85}
diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c
new file mode 100644
index 00000000000..12b585b5345
--- /dev/null
+++ b/arch/x86_64/kernel/bugs.c
@@ -0,0 +1,21 @@
1/*
2 * arch/x86_64/kernel/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2000 SuSE
6 */
7
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <asm/alternative.h>
11#include <asm/processor.h>
12
13void __init check_bugs(void)
14{
15 identify_cpu(&boot_cpu_data);
16#if !defined(CONFIG_SMP)
17 printk("CPU: ");
18 print_cpu_info(&boot_cpu_data);
19#endif
20 alternative_instructions();
21}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index a490fabfcf4..be8965427a9 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -25,7 +25,7 @@
25#include <asm/bootsetup.h> 25#include <asm/bootsetup.h>
26#include <asm/sections.h> 26#include <asm/sections.h>
27 27
28struct e820map e820 __initdata; 28struct e820map e820;
29 29
30/* 30/*
31 * PFN of last memory page. 31 * PFN of last memory page.
@@ -98,7 +98,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
98 * This function checks if any part of the range <start,end> is mapped 98 * This function checks if any part of the range <start,end> is mapped
99 * with type. 99 * with type.
100 */ 100 */
101int __meminit 101int
102e820_any_mapped(unsigned long start, unsigned long end, unsigned type) 102e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
103{ 103{
104 int i; 104 int i;
@@ -112,6 +112,7 @@ e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
112 } 112 }
113 return 0; 113 return 0;
114} 114}
115EXPORT_SYMBOL_GPL(e820_any_mapped);
115 116
116/* 117/*
117 * This function checks if the entire range <start,end> is mapped with type. 118 * This function checks if the entire range <start,end> is mapped with type.
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index fede55a5399..990d9c218a5 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -71,18 +71,6 @@ static void __init ati_bugs(void)
71 } 71 }
72} 72}
73 73
74static void intel_bugs(void)
75{
76 u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
77
78#ifdef CONFIG_SMP
79 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
80 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
81 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
82 quirk_intel_irqbalance();
83#endif
84}
85
86struct chipset { 74struct chipset {
87 u16 vendor; 75 u16 vendor;
88 void (*f)(void); 76 void (*f)(void);
@@ -92,7 +80,6 @@ static struct chipset early_qrk[] __initdata = {
92 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, 80 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
93 { PCI_VENDOR_ID_VIA, via_bugs }, 81 { PCI_VENDOR_ID_VIA, via_bugs },
94 { PCI_VENDOR_ID_ATI, ati_bugs }, 82 { PCI_VENDOR_ID_ATI, ati_bugs },
95 { PCI_VENDOR_ID_INTEL, intel_bugs},
96 {} 83 {}
97}; 84};
98 85
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 47b6d90349d..92213d2b7c1 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -11,11 +11,10 @@
11 11
12#ifdef __i386__ 12#ifdef __i386__
13#include <asm/setup.h> 13#include <asm/setup.h>
14#define VGABASE (__ISA_IO_base + 0xb8000)
15#else 14#else
16#include <asm/bootsetup.h> 15#include <asm/bootsetup.h>
17#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
18#endif 16#endif
17#define VGABASE (__ISA_IO_base + 0xb8000)
19 18
20static int max_ypos = 25, max_xpos = 80; 19static int max_ypos = 25, max_xpos = 80;
21static int current_ypos = 25, current_xpos = 0; 20static int current_ypos = 25, current_xpos = 0;
@@ -176,7 +175,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
176 return ret; 175 return ret;
177} 176}
178 177
179void __init simnow_init(char *str) 178static void __init simnow_init(char *str)
180{ 179{
181 char *fn = "klog"; 180 char *fn = "klog";
182 if (*str == '=') 181 if (*str == '=')
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index ed4350ced3d..fa984b53e7e 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -701,6 +701,7 @@ END(spurious_interrupt)
701 CFI_ADJUST_CFA_OFFSET 8 701 CFI_ADJUST_CFA_OFFSET 8
702 pushq %rax /* push real oldrax to the rdi slot */ 702 pushq %rax /* push real oldrax to the rdi slot */
703 CFI_ADJUST_CFA_OFFSET 8 703 CFI_ADJUST_CFA_OFFSET 8
704 CFI_REL_OFFSET rax,0
704 leaq \sym(%rip),%rax 705 leaq \sym(%rip),%rax
705 jmp error_entry 706 jmp error_entry
706 CFI_ENDPROC 707 CFI_ENDPROC
@@ -710,6 +711,7 @@ END(spurious_interrupt)
710 XCPT_FRAME 711 XCPT_FRAME
711 pushq %rax 712 pushq %rax
712 CFI_ADJUST_CFA_OFFSET 8 713 CFI_ADJUST_CFA_OFFSET 8
714 CFI_REL_OFFSET rax,0
713 leaq \sym(%rip),%rax 715 leaq \sym(%rip),%rax
714 jmp error_entry 716 jmp error_entry
715 CFI_ENDPROC 717 CFI_ENDPROC
@@ -817,6 +819,7 @@ paranoid_schedule\trace:
817 */ 819 */
818KPROBE_ENTRY(error_entry) 820KPROBE_ENTRY(error_entry)
819 _frame RDI 821 _frame RDI
822 CFI_REL_OFFSET rax,0
820 /* rdi slot contains rax, oldrax contains error code */ 823 /* rdi slot contains rax, oldrax contains error code */
821 cld 824 cld
822 subq $14*8,%rsp 825 subq $14*8,%rsp
@@ -824,6 +827,7 @@ KPROBE_ENTRY(error_entry)
824 movq %rsi,13*8(%rsp) 827 movq %rsi,13*8(%rsp)
825 CFI_REL_OFFSET rsi,RSI 828 CFI_REL_OFFSET rsi,RSI
826 movq 14*8(%rsp),%rsi /* load rax from rdi slot */ 829 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
830 CFI_REGISTER rax,rsi
827 movq %rdx,12*8(%rsp) 831 movq %rdx,12*8(%rsp)
828 CFI_REL_OFFSET rdx,RDX 832 CFI_REL_OFFSET rdx,RDX
829 movq %rcx,11*8(%rsp) 833 movq %rcx,11*8(%rsp)
@@ -857,6 +861,7 @@ error_swapgs:
857 swapgs 861 swapgs
858error_sti: 862error_sti:
859 movq %rdi,RDI(%rsp) 863 movq %rdi,RDI(%rsp)
864 CFI_REL_OFFSET rdi,RDI
860 movq %rsp,%rdi 865 movq %rsp,%rdi
861 movq ORIG_RAX(%rsp),%rsi /* get error code */ 866 movq ORIG_RAX(%rsp),%rsi /* get error code */
862 movq $-1,ORIG_RAX(%rsp) 867 movq $-1,ORIG_RAX(%rsp)
diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist
deleted file mode 100644
index 7ae18ec1245..00000000000
--- a/arch/x86_64/kernel/functionlist
+++ /dev/null
@@ -1,1284 +0,0 @@
1*(.text.flush_thread)
2*(.text.check_poison_obj)
3*(.text.copy_page)
4*(.text.__set_personality)
5*(.text.gart_map_sg)
6*(.text.kmem_cache_free)
7*(.text.find_get_page)
8*(.text._raw_spin_lock)
9*(.text.ide_outb)
10*(.text.unmap_vmas)
11*(.text.copy_page_range)
12*(.text.kprobe_handler)
13*(.text.__handle_mm_fault)
14*(.text.__d_lookup)
15*(.text.copy_user_generic)
16*(.text.__link_path_walk)
17*(.text.get_page_from_freelist)
18*(.text.kmem_cache_alloc)
19*(.text.drive_cmd_intr)
20*(.text.ia32_setup_sigcontext)
21*(.text.huge_pte_offset)
22*(.text.do_page_fault)
23*(.text.page_remove_rmap)
24*(.text.release_pages)
25*(.text.ide_end_request)
26*(.text.__mutex_lock_slowpath)
27*(.text.__find_get_block)
28*(.text.kfree)
29*(.text.vfs_read)
30*(.text._raw_spin_unlock)
31*(.text.free_hot_cold_page)
32*(.text.fget_light)
33*(.text.schedule)
34*(.text.memcmp)
35*(.text.touch_atime)
36*(.text.__might_sleep)
37*(.text.__down_read_trylock)
38*(.text.arch_pick_mmap_layout)
39*(.text.find_vma)
40*(.text.__make_request)
41*(.text.do_generic_mapping_read)
42*(.text.mutex_lock_interruptible)
43*(.text.__generic_file_aio_read)
44*(.text._atomic_dec_and_lock)
45*(.text.__wake_up_bit)
46*(.text.add_to_page_cache)
47*(.text.cache_alloc_debugcheck_after)
48*(.text.vm_normal_page)
49*(.text.mutex_debug_check_no_locks_freed)
50*(.text.net_rx_action)
51*(.text.__find_first_zero_bit)
52*(.text.put_page)
53*(.text._raw_read_lock)
54*(.text.__delay)
55*(.text.dnotify_parent)
56*(.text.do_path_lookup)
57*(.text.do_sync_read)
58*(.text.do_lookup)
59*(.text.bit_waitqueue)
60*(.text.file_read_actor)
61*(.text.strncpy_from_user)
62*(.text.__pagevec_lru_add_active)
63*(.text.fget)
64*(.text.dput)
65*(.text.__strnlen_user)
66*(.text.inotify_inode_queue_event)
67*(.text.rw_verify_area)
68*(.text.ide_intr)
69*(.text.inotify_dentry_parent_queue_event)
70*(.text.permission)
71*(.text.memscan)
72*(.text.hpet_rtc_interrupt)
73*(.text.do_mmap_pgoff)
74*(.text.current_fs_time)
75*(.text.vfs_getattr)
76*(.text.kmem_flagcheck)
77*(.text.mark_page_accessed)
78*(.text.free_pages_and_swap_cache)
79*(.text.generic_fillattr)
80*(.text.__block_prepare_write)
81*(.text.__set_page_dirty_nobuffers)
82*(.text.link_path_walk)
83*(.text.find_get_pages_tag)
84*(.text.ide_do_request)
85*(.text.__alloc_pages)
86*(.text.generic_permission)
87*(.text.mod_page_state_offset)
88*(.text.free_pgd_range)
89*(.text.generic_file_buffered_write)
90*(.text.number)
91*(.text.ide_do_rw_disk)
92*(.text.__brelse)
93*(.text.__mod_page_state_offset)
94*(.text.rotate_reclaimable_page)
95*(.text.find_vma_prepare)
96*(.text.find_vma_prev)
97*(.text.lru_cache_add_active)
98*(.text.__kmalloc_track_caller)
99*(.text.smp_invalidate_interrupt)
100*(.text.handle_IRQ_event)
101*(.text.__find_get_block_slow)
102*(.text.do_wp_page)
103*(.text.do_select)
104*(.text.set_user_nice)
105*(.text.sys_read)
106*(.text.do_munmap)
107*(.text.csum_partial)
108*(.text.__do_softirq)
109*(.text.may_open)
110*(.text.getname)
111*(.text.get_empty_filp)
112*(.text.__fput)
113*(.text.remove_mapping)
114*(.text.filp_ctor)
115*(.text.poison_obj)
116*(.text.unmap_region)
117*(.text.test_set_page_writeback)
118*(.text.__do_page_cache_readahead)
119*(.text.sock_def_readable)
120*(.text.ide_outl)
121*(.text.shrink_zone)
122*(.text.rb_insert_color)
123*(.text.get_request)
124*(.text.sys_pread64)
125*(.text.spin_bug)
126*(.text.ide_outsl)
127*(.text.mask_and_ack_8259A)
128*(.text.filemap_nopage)
129*(.text.page_add_file_rmap)
130*(.text.find_lock_page)
131*(.text.tcp_poll)
132*(.text.__mark_inode_dirty)
133*(.text.file_ra_state_init)
134*(.text.generic_file_llseek)
135*(.text.__pagevec_lru_add)
136*(.text.page_cache_readahead)
137*(.text.n_tty_receive_buf)
138*(.text.zonelist_policy)
139*(.text.vma_adjust)
140*(.text.test_clear_page_dirty)
141*(.text.sync_buffer)
142*(.text.do_exit)
143*(.text.__bitmap_weight)
144*(.text.alloc_pages_current)
145*(.text.get_unused_fd)
146*(.text.zone_watermark_ok)
147*(.text.cpuset_update_task_memory_state)
148*(.text.__bitmap_empty)
149*(.text.sys_munmap)
150*(.text.__inode_dir_notify)
151*(.text.__generic_file_aio_write_nolock)
152*(.text.__pte_alloc)
153*(.text.sys_select)
154*(.text.vm_acct_memory)
155*(.text.vfs_write)
156*(.text.__lru_add_drain)
157*(.text.prio_tree_insert)
158*(.text.generic_file_aio_read)
159*(.text.vma_merge)
160*(.text.block_write_full_page)
161*(.text.__page_set_anon_rmap)
162*(.text.apic_timer_interrupt)
163*(.text.release_console_sem)
164*(.text.sys_write)
165*(.text.sys_brk)
166*(.text.dup_mm)
167*(.text.read_current_timer)
168*(.text.ll_rw_block)
169*(.text.blk_rq_map_sg)
170*(.text.dbg_userword)
171*(.text.__block_commit_write)
172*(.text.cache_grow)
173*(.text.copy_strings)
174*(.text.release_task)
175*(.text.do_sync_write)
176*(.text.unlock_page)
177*(.text.load_elf_binary)
178*(.text.__follow_mount)
179*(.text.__getblk)
180*(.text.do_sys_open)
181*(.text.current_kernel_time)
182*(.text.call_rcu)
183*(.text.write_chan)
184*(.text.vsnprintf)
185*(.text.dummy_inode_setsecurity)
186*(.text.submit_bh)
187*(.text.poll_freewait)
188*(.text.bio_alloc_bioset)
189*(.text.skb_clone)
190*(.text.page_waitqueue)
191*(.text.__mutex_lock_interruptible_slowpath)
192*(.text.get_index)
193*(.text.csum_partial_copy_generic)
194*(.text.bad_range)
195*(.text.remove_vma)
196*(.text.cp_new_stat)
197*(.text.alloc_arraycache)
198*(.text.test_clear_page_writeback)
199*(.text.strsep)
200*(.text.open_namei)
201*(.text._raw_read_unlock)
202*(.text.get_vma_policy)
203*(.text.__down_write_trylock)
204*(.text.find_get_pages)
205*(.text.tcp_rcv_established)
206*(.text.generic_make_request)
207*(.text.__block_write_full_page)
208*(.text.cfq_set_request)
209*(.text.sys_inotify_init)
210*(.text.split_vma)
211*(.text.__mod_timer)
212*(.text.get_options)
213*(.text.vma_link)
214*(.text.mpage_writepages)
215*(.text.truncate_complete_page)
216*(.text.tcp_recvmsg)
217*(.text.sigprocmask)
218*(.text.filemap_populate)
219*(.text.sys_close)
220*(.text.inotify_dev_queue_event)
221*(.text.do_task_stat)
222*(.text.__dentry_open)
223*(.text.unlink_file_vma)
224*(.text.__pollwait)
225*(.text.packet_rcv_spkt)
226*(.text.drop_buffers)
227*(.text.free_pgtables)
228*(.text.generic_file_direct_write)
229*(.text.copy_process)
230*(.text.netif_receive_skb)
231*(.text.dnotify_flush)
232*(.text.print_bad_pte)
233*(.text.anon_vma_unlink)
234*(.text.sys_mprotect)
235*(.text.sync_sb_inodes)
236*(.text.find_inode_fast)
237*(.text.dummy_inode_readlink)
238*(.text.putname)
239*(.text.init_smp_flush)
240*(.text.dbg_redzone2)
241*(.text.sk_run_filter)
242*(.text.may_expand_vm)
243*(.text.generic_file_aio_write)
244*(.text.find_next_zero_bit)
245*(.text.file_kill)
246*(.text.audit_getname)
247*(.text.arch_unmap_area_topdown)
248*(.text.alloc_page_vma)
249*(.text.tcp_transmit_skb)
250*(.text.rb_next)
251*(.text.dbg_redzone1)
252*(.text.generic_file_mmap)
253*(.text.vfs_fstat)
254*(.text.sys_time)
255*(.text.page_lock_anon_vma)
256*(.text.get_unmapped_area)
257*(.text.remote_llseek)
258*(.text.__up_read)
259*(.text.fd_install)
260*(.text.eventpoll_init_file)
261*(.text.dma_alloc_coherent)
262*(.text.create_empty_buffers)
263*(.text.__mutex_unlock_slowpath)
264*(.text.dup_fd)
265*(.text.d_alloc)
266*(.text.tty_ldisc_try)
267*(.text.sys_stime)
268*(.text.__rb_rotate_right)
269*(.text.d_validate)
270*(.text.rb_erase)
271*(.text.path_release)
272*(.text.memmove)
273*(.text.invalidate_complete_page)
274*(.text.clear_inode)
275*(.text.cache_estimate)
276*(.text.alloc_buffer_head)
277*(.text.smp_call_function_interrupt)
278*(.text.flush_tlb_others)
279*(.text.file_move)
280*(.text.balance_dirty_pages_ratelimited)
281*(.text.vma_prio_tree_add)
282*(.text.timespec_trunc)
283*(.text.mempool_alloc)
284*(.text.iget_locked)
285*(.text.d_alloc_root)
286*(.text.cpuset_populate_dir)
287*(.text.anon_vma_prepare)
288*(.text.sys_newstat)
289*(.text.alloc_page_interleave)
290*(.text.__path_lookup_intent_open)
291*(.text.__pagevec_free)
292*(.text.inode_init_once)
293*(.text.free_vfsmnt)
294*(.text.__user_walk_fd)
295*(.text.cfq_idle_slice_timer)
296*(.text.sys_mmap)
297*(.text.sys_llseek)
298*(.text.prio_tree_remove)
299*(.text.filp_close)
300*(.text.file_permission)
301*(.text.vma_prio_tree_remove)
302*(.text.tcp_ack)
303*(.text.nameidata_to_filp)
304*(.text.sys_lseek)
305*(.text.percpu_counter_mod)
306*(.text.igrab)
307*(.text.__bread)
308*(.text.alloc_inode)
309*(.text.filldir)
310*(.text.__rb_rotate_left)
311*(.text.irq_affinity_write_proc)
312*(.text.init_request_from_bio)
313*(.text.find_or_create_page)
314*(.text.tty_poll)
315*(.text.tcp_sendmsg)
316*(.text.ide_wait_stat)
317*(.text.free_buffer_head)
318*(.text.flush_signal_handlers)
319*(.text.tcp_v4_rcv)
320*(.text.nr_blockdev_pages)
321*(.text.locks_remove_flock)
322*(.text.__iowrite32_copy)
323*(.text.do_filp_open)
324*(.text.try_to_release_page)
325*(.text.page_add_new_anon_rmap)
326*(.text.kmem_cache_size)
327*(.text.eth_type_trans)
328*(.text.try_to_free_buffers)
329*(.text.schedule_tail)
330*(.text.proc_lookup)
331*(.text.no_llseek)
332*(.text.kfree_skbmem)
333*(.text.do_wait)
334*(.text.do_mpage_readpage)
335*(.text.vfs_stat_fd)
336*(.text.tty_write)
337*(.text.705)
338*(.text.sync_page)
339*(.text.__remove_shared_vm_struct)
340*(.text.__kfree_skb)
341*(.text.sock_poll)
342*(.text.get_request_wait)
343*(.text.do_sigaction)
344*(.text.do_brk)
345*(.text.tcp_event_data_recv)
346*(.text.read_chan)
347*(.text.pipe_writev)
348*(.text.__emul_lookup_dentry)
349*(.text.rtc_get_rtc_time)
350*(.text.print_objinfo)
351*(.text.file_update_time)
352*(.text.do_signal)
353*(.text.disable_8259A_irq)
354*(.text.blk_queue_bounce)
355*(.text.__anon_vma_link)
356*(.text.__vma_link)
357*(.text.vfs_rename)
358*(.text.sys_newlstat)
359*(.text.sys_newfstat)
360*(.text.sys_mknod)
361*(.text.__show_regs)
362*(.text.iput)
363*(.text.get_signal_to_deliver)
364*(.text.flush_tlb_page)
365*(.text.debug_mutex_wake_waiter)
366*(.text.copy_thread)
367*(.text.clear_page_dirty_for_io)
368*(.text.buffer_io_error)
369*(.text.vfs_permission)
370*(.text.truncate_inode_pages_range)
371*(.text.sys_recvfrom)
372*(.text.remove_suid)
373*(.text.mark_buffer_dirty)
374*(.text.local_bh_enable)
375*(.text.get_zeroed_page)
376*(.text.get_vmalloc_info)
377*(.text.flush_old_exec)
378*(.text.dummy_inode_permission)
379*(.text.__bio_add_page)
380*(.text.prio_tree_replace)
381*(.text.notify_change)
382*(.text.mntput_no_expire)
383*(.text.fput)
384*(.text.__end_that_request_first)
385*(.text.wake_up_bit)
386*(.text.unuse_mm)
387*(.text.shrink_icache_memory)
388*(.text.sched_balance_self)
389*(.text.__pmd_alloc)
390*(.text.pipe_poll)
391*(.text.normal_poll)
392*(.text.__free_pages)
393*(.text.follow_mount)
394*(.text.cdrom_start_packet_command)
395*(.text.blk_recount_segments)
396*(.text.bio_put)
397*(.text.__alloc_skb)
398*(.text.__wake_up)
399*(.text.vm_stat_account)
400*(.text.sys_fcntl)
401*(.text.sys_fadvise64)
402*(.text._raw_write_unlock)
403*(.text.__pud_alloc)
404*(.text.alloc_page_buffers)
405*(.text.vfs_llseek)
406*(.text.sockfd_lookup)
407*(.text._raw_write_lock)
408*(.text.put_compound_page)
409*(.text.prune_dcache)
410*(.text.pipe_readv)
411*(.text.mempool_free)
412*(.text.make_ahead_window)
413*(.text.lru_add_drain)
414*(.text.constant_test_bit)
415*(.text.__clear_user)
416*(.text.arch_unmap_area)
417*(.text.anon_vma_link)
418*(.text.sys_chroot)
419*(.text.setup_arg_pages)
420*(.text.radix_tree_preload)
421*(.text.init_rwsem)
422*(.text.generic_osync_inode)
423*(.text.generic_delete_inode)
424*(.text.do_sys_poll)
425*(.text.dev_queue_xmit)
426*(.text.default_llseek)
427*(.text.__writeback_single_inode)
428*(.text.vfs_ioctl)
429*(.text.__up_write)
430*(.text.unix_poll)
431*(.text.sys_rt_sigprocmask)
432*(.text.sock_recvmsg)
433*(.text.recalc_bh_state)
434*(.text.__put_unused_fd)
435*(.text.process_backlog)
436*(.text.locks_remove_posix)
437*(.text.lease_modify)
438*(.text.expand_files)
439*(.text.end_buffer_read_nobh)
440*(.text.d_splice_alias)
441*(.text.debug_mutex_init_waiter)
442*(.text.copy_from_user)
443*(.text.cap_vm_enough_memory)
444*(.text.show_vfsmnt)
445*(.text.release_sock)
446*(.text.pfifo_fast_enqueue)
447*(.text.half_md4_transform)
448*(.text.fs_may_remount_ro)
449*(.text.do_fork)
450*(.text.copy_hugetlb_page_range)
451*(.text.cache_free_debugcheck)
452*(.text.__tcp_select_window)
453*(.text.task_handoff_register)
454*(.text.sys_open)
455*(.text.strlcpy)
456*(.text.skb_copy_datagram_iovec)
457*(.text.set_up_list3s)
458*(.text.release_open_intent)
459*(.text.qdisc_restart)
460*(.text.n_tty_chars_in_buffer)
461*(.text.inode_change_ok)
462*(.text.__downgrade_write)
463*(.text.debug_mutex_unlock)
464*(.text.add_timer_randomness)
465*(.text.sock_common_recvmsg)
466*(.text.set_bh_page)
467*(.text.printk_lock)
468*(.text.path_release_on_umount)
469*(.text.ip_output)
470*(.text.ide_build_dmatable)
471*(.text.__get_user_8)
472*(.text.end_buffer_read_sync)
473*(.text.__d_path)
474*(.text.d_move)
475*(.text.del_timer)
476*(.text.constant_test_bit)
477*(.text.blockable_page_cache_readahead)
478*(.text.tty_read)
479*(.text.sys_readlink)
480*(.text.sys_faccessat)
481*(.text.read_swap_cache_async)
482*(.text.pty_write_room)
483*(.text.page_address_in_vma)
484*(.text.kthread)
485*(.text.cfq_exit_io_context)
486*(.text.__tcp_push_pending_frames)
487*(.text.sys_pipe)
488*(.text.submit_bio)
489*(.text.pid_revalidate)
490*(.text.page_referenced_file)
491*(.text.lock_sock)
492*(.text.get_page_state_node)
493*(.text.generic_block_bmap)
494*(.text.do_setitimer)
495*(.text.dev_queue_xmit_nit)
496*(.text.copy_from_read_buf)
497*(.text.__const_udelay)
498*(.text.console_conditional_schedule)
499*(.text.wake_up_new_task)
500*(.text.wait_for_completion_interruptible)
501*(.text.tcp_rcv_rtt_update)
502*(.text.sys_mlockall)
503*(.text.set_fs_altroot)
504*(.text.schedule_timeout)
505*(.text.nr_free_pagecache_pages)
506*(.text.nf_iterate)
507*(.text.mapping_tagged)
508*(.text.ip_queue_xmit)
509*(.text.ip_local_deliver)
510*(.text.follow_page)
511*(.text.elf_map)
512*(.text.dummy_file_permission)
513*(.text.dispose_list)
514*(.text.dentry_open)
515*(.text.dentry_iput)
516*(.text.bio_alloc)
517*(.text.wait_on_page_bit)
518*(.text.vfs_readdir)
519*(.text.vfs_lstat)
520*(.text.seq_escape)
521*(.text.__posix_lock_file)
522*(.text.mm_release)
523*(.text.kref_put)
524*(.text.ip_rcv)
525*(.text.__iget)
526*(.text.free_pages)
527*(.text.find_mergeable_anon_vma)
528*(.text.find_extend_vma)
529*(.text.dummy_inode_listsecurity)
530*(.text.bio_add_page)
531*(.text.__vm_enough_memory)
532*(.text.vfs_stat)
533*(.text.tty_paranoia_check)
534*(.text.tcp_read_sock)
535*(.text.tcp_data_queue)
536*(.text.sys_uname)
537*(.text.sys_renameat)
538*(.text.__strncpy_from_user)
539*(.text.__mutex_init)
540*(.text.__lookup_hash)
541*(.text.kref_get)
542*(.text.ip_route_input)
543*(.text.__insert_inode_hash)
544*(.text.do_sock_write)
545*(.text.blk_done_softirq)
546*(.text.__wake_up_sync)
547*(.text.__vma_link_rb)
548*(.text.tty_ioctl)
549*(.text.tracesys)
550*(.text.sys_getdents)
551*(.text.sys_dup)
552*(.text.stub_execve)
553*(.text.sha_transform)
554*(.text.radix_tree_tag_clear)
555*(.text.put_unused_fd)
556*(.text.put_files_struct)
557*(.text.mpage_readpages)
558*(.text.may_delete)
559*(.text.kmem_cache_create)
560*(.text.ip_mc_output)
561*(.text.interleave_nodes)
562*(.text.groups_search)
563*(.text.generic_drop_inode)
564*(.text.generic_commit_write)
565*(.text.fcntl_setlk)
566*(.text.exit_mmap)
567*(.text.end_page_writeback)
568*(.text.__d_rehash)
569*(.text.debug_mutex_free_waiter)
570*(.text.csum_ipv6_magic)
571*(.text.count)
572*(.text.cleanup_rbuf)
573*(.text.check_spinlock_acquired_node)
574*(.text.can_vma_merge_after)
575*(.text.bio_endio)
576*(.text.alloc_pidmap)
577*(.text.write_ldt)
578*(.text.vmtruncate_range)
579*(.text.vfs_create)
580*(.text.__user_walk)
581*(.text.update_send_head)
582*(.text.unmap_underlying_metadata)
583*(.text.tty_ldisc_deref)
584*(.text.tcp_setsockopt)
585*(.text.tcp_send_ack)
586*(.text.sys_pause)
587*(.text.sys_gettimeofday)
588*(.text.sync_dirty_buffer)
589*(.text.strncmp)
590*(.text.release_posix_timer)
591*(.text.proc_file_read)
592*(.text.prepare_to_wait)
593*(.text.locks_mandatory_locked)
594*(.text.interruptible_sleep_on_timeout)
595*(.text.inode_sub_bytes)
596*(.text.in_group_p)
597*(.text.hrtimer_try_to_cancel)
598*(.text.filldir64)
599*(.text.fasync_helper)
600*(.text.dummy_sb_pivotroot)
601*(.text.d_lookup)
602*(.text.d_instantiate)
603*(.text.__d_find_alias)
604*(.text.cpu_idle_wait)
605*(.text.cond_resched_lock)
606*(.text.chown_common)
607*(.text.blk_congestion_wait)
608*(.text.activate_page)
609*(.text.unlock_buffer)
610*(.text.tty_wakeup)
611*(.text.tcp_v4_do_rcv)
612*(.text.tcp_current_mss)
613*(.text.sys_openat)
614*(.text.sys_fchdir)
615*(.text.strnlen_user)
616*(.text.strnlen)
617*(.text.strchr)
618*(.text.sock_common_getsockopt)
619*(.text.skb_checksum)
620*(.text.remove_wait_queue)
621*(.text.rb_replace_node)
622*(.text.radix_tree_node_ctor)
623*(.text.pty_chars_in_buffer)
624*(.text.profile_hit)
625*(.text.prio_tree_left)
626*(.text.pgd_clear_bad)
627*(.text.pfifo_fast_dequeue)
628*(.text.page_referenced)
629*(.text.open_exec)
630*(.text.mmput)
631*(.text.mm_init)
632*(.text.__ide_dma_off_quietly)
633*(.text.ide_dma_intr)
634*(.text.hrtimer_start)
635*(.text.get_io_context)
636*(.text.__get_free_pages)
637*(.text.find_first_zero_bit)
638*(.text.file_free_rcu)
639*(.text.dummy_socket_sendmsg)
640*(.text.do_unlinkat)
641*(.text.do_arch_prctl)
642*(.text.destroy_inode)
643*(.text.can_vma_merge_before)
644*(.text.block_sync_page)
645*(.text.block_prepare_write)
646*(.text.bio_init)
647*(.text.arch_ptrace)
648*(.text.wake_up_inode)
649*(.text.wait_on_retry_sync_kiocb)
650*(.text.vma_prio_tree_next)
651*(.text.tcp_rcv_space_adjust)
652*(.text.__tcp_ack_snd_check)
653*(.text.sys_utime)
654*(.text.sys_recvmsg)
655*(.text.sys_mremap)
656*(.text.sys_bdflush)
657*(.text.sleep_on)
658*(.text.set_page_dirty_lock)
659*(.text.seq_path)
660*(.text.schedule_timeout_interruptible)
661*(.text.sched_fork)
662*(.text.rt_run_flush)
663*(.text.profile_munmap)
664*(.text.prepare_binprm)
665*(.text.__pagevec_release_nonlru)
666*(.text.m_show)
667*(.text.lookup_mnt)
668*(.text.__lookup_mnt)
669*(.text.lock_timer_base)
670*(.text.is_subdir)
671*(.text.invalidate_bh_lru)
672*(.text.init_buffer_head)
673*(.text.ifind_fast)
674*(.text.ide_dma_start)
675*(.text.__get_page_state)
676*(.text.flock_to_posix_lock)
677*(.text.__find_symbol)
678*(.text.do_futex)
679*(.text.do_execve)
680*(.text.dirty_writeback_centisecs_handler)
681*(.text.dev_watchdog)
682*(.text.can_share_swap_page)
683*(.text.blkdev_put)
684*(.text.bio_get_nr_vecs)
685*(.text.xfrm_compile_policy)
686*(.text.vma_prio_tree_insert)
687*(.text.vfs_lstat_fd)
688*(.text.__user_path_lookup_open)
689*(.text.thread_return)
690*(.text.tcp_send_delayed_ack)
691*(.text.sock_def_error_report)
692*(.text.shrink_slab)
693*(.text.serial_out)
694*(.text.seq_read)
695*(.text.secure_ip_id)
696*(.text.search_binary_handler)
697*(.text.proc_pid_unhash)
698*(.text.pagevec_lookup)
699*(.text.new_inode)
700*(.text.memcpy_toiovec)
701*(.text.locks_free_lock)
702*(.text.__lock_page)
703*(.text.__lock_buffer)
704*(.text.load_module)
705*(.text.is_bad_inode)
706*(.text.invalidate_inode_buffers)
707*(.text.insert_vm_struct)
708*(.text.inode_setattr)
709*(.text.inode_add_bytes)
710*(.text.ide_read_24)
711*(.text.ide_get_error_location)
712*(.text.ide_do_drive_cmd)
713*(.text.get_locked_pte)
714*(.text.get_filesystem_list)
715*(.text.generic_file_open)
716*(.text.follow_down)
717*(.text.find_next_bit)
718*(.text.__find_first_bit)
719*(.text.exit_mm)
720*(.text.exec_keys)
721*(.text.end_buffer_write_sync)
722*(.text.end_bio_bh_io_sync)
723*(.text.dummy_socket_shutdown)
724*(.text.d_rehash)
725*(.text.d_path)
726*(.text.do_ioctl)
727*(.text.dget_locked)
728*(.text.copy_thread_group_keys)
729*(.text.cdrom_end_request)
730*(.text.cap_bprm_apply_creds)
731*(.text.blk_rq_bio_prep)
732*(.text.__bitmap_intersects)
733*(.text.bio_phys_segments)
734*(.text.bio_free)
735*(.text.arch_get_unmapped_area_topdown)
736*(.text.writeback_in_progress)
737*(.text.vfs_follow_link)
738*(.text.tcp_rcv_state_process)
739*(.text.tcp_check_space)
740*(.text.sys_stat)
741*(.text.sys_rt_sigreturn)
742*(.text.sys_rt_sigaction)
743*(.text.sys_remap_file_pages)
744*(.text.sys_pwrite64)
745*(.text.sys_fchownat)
746*(.text.sys_fchmodat)
747*(.text.strncat)
748*(.text.strlcat)
749*(.text.strcmp)
750*(.text.steal_locks)
751*(.text.sock_create)
752*(.text.sk_stream_rfree)
753*(.text.sk_stream_mem_schedule)
754*(.text.skip_atoi)
755*(.text.sk_alloc)
756*(.text.show_stat)
757*(.text.set_fs_pwd)
758*(.text.set_binfmt)
759*(.text.pty_unthrottle)
760*(.text.proc_symlink)
761*(.text.pipe_release)
762*(.text.pageout)
763*(.text.n_tty_write_wakeup)
764*(.text.n_tty_ioctl)
765*(.text.nr_free_zone_pages)
766*(.text.migration_thread)
767*(.text.mempool_free_slab)
768*(.text.meminfo_read_proc)
769*(.text.max_sane_readahead)
770*(.text.lru_cache_add)
771*(.text.kill_fasync)
772*(.text.kernel_read)
773*(.text.invalidate_mapping_pages)
774*(.text.inode_has_buffers)
775*(.text.init_once)
776*(.text.inet_sendmsg)
777*(.text.idedisk_issue_flush)
778*(.text.generic_file_write)
779*(.text.free_more_memory)
780*(.text.__free_fdtable)
781*(.text.filp_dtor)
782*(.text.exit_sem)
783*(.text.exit_itimers)
784*(.text.error_interrupt)
785*(.text.end_buffer_async_write)
786*(.text.eligible_child)
787*(.text.elf_map)
788*(.text.dump_task_regs)
789*(.text.dummy_task_setscheduler)
790*(.text.dummy_socket_accept)
791*(.text.dummy_file_free_security)
792*(.text.__down_read)
793*(.text.do_sock_read)
794*(.text.do_sigaltstack)
795*(.text.do_mremap)
796*(.text.current_io_context)
797*(.text.cpu_swap_callback)
798*(.text.copy_vma)
799*(.text.cap_bprm_set_security)
800*(.text.blk_insert_request)
801*(.text.bio_map_kern_endio)
802*(.text.bio_hw_segments)
803*(.text.bictcp_cong_avoid)
804*(.text.add_interrupt_randomness)
805*(.text.wait_for_completion)
806*(.text.version_read_proc)
807*(.text.unix_write_space)
808*(.text.tty_ldisc_ref_wait)
809*(.text.tty_ldisc_put)
810*(.text.try_to_wake_up)
811*(.text.tcp_v4_tw_remember_stamp)
812*(.text.tcp_try_undo_dsack)
813*(.text.tcp_may_send_now)
814*(.text.sys_waitid)
815*(.text.sys_sched_getparam)
816*(.text.sys_getppid)
817*(.text.sys_getcwd)
818*(.text.sys_dup2)
819*(.text.sys_chmod)
820*(.text.sys_chdir)
821*(.text.sprintf)
822*(.text.sock_wfree)
823*(.text.sock_aio_write)
824*(.text.skb_drop_fraglist)
825*(.text.skb_dequeue)
826*(.text.set_close_on_exec)
827*(.text.set_brk)
828*(.text.seq_puts)
829*(.text.SELECT_DRIVE)
830*(.text.sched_exec)
831*(.text.return_EIO)
832*(.text.remove_from_page_cache)
833*(.text.rcu_start_batch)
834*(.text.__put_task_struct)
835*(.text.proc_pid_readdir)
836*(.text.proc_get_inode)
837*(.text.prepare_to_wait_exclusive)
838*(.text.pipe_wait)
839*(.text.pipe_new)
840*(.text.pdflush_operation)
841*(.text.__pagevec_release)
842*(.text.pagevec_lookup_tag)
843*(.text.packet_rcv)
844*(.text.n_tty_set_room)
845*(.text.nr_free_pages)
846*(.text.__net_timestamp)
847*(.text.mpage_end_io_read)
848*(.text.mod_timer)
849*(.text.__memcpy)
850*(.text.mb_cache_shrink_fn)
851*(.text.lock_rename)
852*(.text.kstrdup)
853*(.text.is_ignored)
854*(.text.int_very_careful)
855*(.text.inotify_inode_is_dead)
856*(.text.inotify_get_cookie)
857*(.text.inode_get_bytes)
858*(.text.init_timer)
859*(.text.init_dev)
860*(.text.inet_getname)
861*(.text.ide_map_sg)
862*(.text.__ide_dma_end)
863*(.text.hrtimer_get_remaining)
864*(.text.get_task_mm)
865*(.text.get_random_int)
866*(.text.free_pipe_info)
867*(.text.filemap_write_and_wait_range)
868*(.text.exit_thread)
869*(.text.enter_idle)
870*(.text.end_that_request_first)
871*(.text.end_8259A_irq)
872*(.text.dummy_file_alloc_security)
873*(.text.do_group_exit)
874*(.text.debug_mutex_init)
875*(.text.cpuset_exit)
876*(.text.cpu_idle)
877*(.text.copy_semundo)
878*(.text.copy_files)
879*(.text.chrdev_open)
880*(.text.cdrom_transfer_packet_command)
881*(.text.cdrom_mode_sense)
882*(.text.blk_phys_contig_segment)
883*(.text.blk_get_queue)
884*(.text.bio_split)
885*(.text.audit_alloc)
886*(.text.anon_pipe_buf_release)
887*(.text.add_wait_queue_exclusive)
888*(.text.add_wait_queue)
889*(.text.acct_process)
890*(.text.account)
891*(.text.zeromap_page_range)
892*(.text.yield)
893*(.text.writeback_acquire)
894*(.text.worker_thread)
895*(.text.wait_on_page_writeback_range)
896*(.text.__wait_on_buffer)
897*(.text.vscnprintf)
898*(.text.vmalloc_to_pfn)
899*(.text.vgacon_save_screen)
900*(.text.vfs_unlink)
901*(.text.vfs_rmdir)
902*(.text.unregister_md_personality)
903*(.text.unlock_new_inode)
904*(.text.unix_stream_sendmsg)
905*(.text.unix_stream_recvmsg)
906*(.text.unhash_process)
907*(.text.udp_v4_lookup_longway)
908*(.text.tty_ldisc_flush)
909*(.text.tty_ldisc_enable)
910*(.text.tty_hung_up_p)
911*(.text.tty_buffer_free_all)
912*(.text.tso_fragment)
913*(.text.try_to_del_timer_sync)
914*(.text.tcp_v4_err)
915*(.text.tcp_unhash)
916*(.text.tcp_seq_next)
917*(.text.tcp_select_initial_window)
918*(.text.tcp_sacktag_write_queue)
919*(.text.tcp_cwnd_validate)
920*(.text.sys_vhangup)
921*(.text.sys_uselib)
922*(.text.sys_symlink)
923*(.text.sys_signal)
924*(.text.sys_poll)
925*(.text.sys_mount)
926*(.text.sys_kill)
927*(.text.sys_ioctl)
928*(.text.sys_inotify_add_watch)
929*(.text.sys_getuid)
930*(.text.sys_getrlimit)
931*(.text.sys_getitimer)
932*(.text.sys_getgroups)
933*(.text.sys_ftruncate)
934*(.text.sysfs_lookup)
935*(.text.sys_exit_group)
936*(.text.stub_fork)
937*(.text.sscanf)
938*(.text.sock_map_fd)
939*(.text.sock_get_timestamp)
940*(.text.__sock_create)
941*(.text.smp_call_function_single)
942*(.text.sk_stop_timer)
943*(.text.skb_copy_and_csum_datagram)
944*(.text.__skb_checksum_complete)
945*(.text.single_next)
946*(.text.sigqueue_alloc)
947*(.text.shrink_dcache_parent)
948*(.text.select_idle_routine)
949*(.text.run_workqueue)
950*(.text.run_local_timers)
951*(.text.remove_inode_hash)
952*(.text.remove_dquot_ref)
953*(.text.register_binfmt)
954*(.text.read_cache_pages)
955*(.text.rb_last)
956*(.text.pty_open)
957*(.text.proc_root_readdir)
958*(.text.proc_pid_flush)
959*(.text.proc_pident_lookup)
960*(.text.proc_fill_super)
961*(.text.proc_exe_link)
962*(.text.posix_locks_deadlock)
963*(.text.pipe_iov_copy_from_user)
964*(.text.opost)
965*(.text.nf_register_hook)
966*(.text.netif_rx_ni)
967*(.text.m_start)
968*(.text.mpage_writepage)
969*(.text.mm_alloc)
970*(.text.memory_open)
971*(.text.mark_buffer_async_write)
972*(.text.lru_add_drain_all)
973*(.text.locks_init_lock)
974*(.text.locks_delete_lock)
975*(.text.lock_hrtimer_base)
976*(.text.load_script)
977*(.text.__kill_fasync)
978*(.text.ip_mc_sf_allow)
979*(.text.__ioremap)
980*(.text.int_with_check)
981*(.text.int_sqrt)
982*(.text.install_thread_keyring)
983*(.text.init_page_buffers)
984*(.text.inet_sock_destruct)
985*(.text.idle_notifier_register)
986*(.text.ide_execute_command)
987*(.text.ide_end_drive_cmd)
988*(.text.__ide_dma_host_on)
989*(.text.hrtimer_run_queues)
990*(.text.hpet_mask_rtc_irq_bit)
991*(.text.__get_zone_counts)
992*(.text.get_zone_counts)
993*(.text.get_write_access)
994*(.text.get_fs_struct)
995*(.text.get_dirty_limits)
996*(.text.generic_readlink)
997*(.text.free_hot_page)
998*(.text.finish_wait)
999*(.text.find_inode)
1000*(.text.find_first_bit)
1001*(.text.__filemap_fdatawrite_range)
1002*(.text.__filemap_copy_from_user_iovec)
1003*(.text.exit_aio)
1004*(.text.elv_set_request)
1005*(.text.elv_former_request)
1006*(.text.dup_namespace)
1007*(.text.dupfd)
1008*(.text.dummy_socket_getsockopt)
1009*(.text.dummy_sb_post_mountroot)
1010*(.text.dummy_quotactl)
1011*(.text.dummy_inode_rename)
1012*(.text.__do_SAK)
1013*(.text.do_pipe)
1014*(.text.do_fsync)
1015*(.text.d_instantiate_unique)
1016*(.text.d_find_alias)
1017*(.text.deny_write_access)
1018*(.text.dentry_unhash)
1019*(.text.d_delete)
1020*(.text.datagram_poll)
1021*(.text.cpuset_fork)
1022*(.text.cpuid_read)
1023*(.text.copy_namespace)
1024*(.text.cond_resched)
1025*(.text.check_version)
1026*(.text.__change_page_attr)
1027*(.text.cfq_slab_kill)
1028*(.text.cfq_completed_request)
1029*(.text.cdrom_pc_intr)
1030*(.text.cdrom_decode_status)
1031*(.text.cap_capset_check)
1032*(.text.blk_put_request)
1033*(.text.bio_fs_destructor)
1034*(.text.bictcp_min_cwnd)
1035*(.text.alloc_chrdev_region)
1036*(.text.add_element)
1037*(.text.acct_update_integrals)
1038*(.text.write_boundary_block)
1039*(.text.writeback_release)
1040*(.text.writeback_inodes)
1041*(.text.wake_up_state)
1042*(.text.__wake_up_locked)
1043*(.text.wake_futex)
1044*(.text.wait_task_inactive)
1045*(.text.__wait_on_freeing_inode)
1046*(.text.wait_noreap_copyout)
1047*(.text.vmstat_start)
1048*(.text.vgacon_do_font_op)
1049*(.text.vfs_readv)
1050*(.text.vfs_quota_sync)
1051*(.text.update_queue)
1052*(.text.unshare_files)
1053*(.text.unmap_vm_area)
1054*(.text.unix_socketpair)
1055*(.text.unix_release_sock)
1056*(.text.unix_detach_fds)
1057*(.text.unix_create1)
1058*(.text.unix_bind)
1059*(.text.udp_sendmsg)
1060*(.text.udp_rcv)
1061*(.text.udp_queue_rcv_skb)
1062*(.text.uart_write)
1063*(.text.uart_startup)
1064*(.text.uart_open)
1065*(.text.tty_vhangup)
1066*(.text.tty_termios_baud_rate)
1067*(.text.tty_release)
1068*(.text.tty_ldisc_ref)
1069*(.text.throttle_vm_writeout)
1070*(.text.058)
1071*(.text.tcp_xmit_probe_skb)
1072*(.text.tcp_v4_send_check)
1073*(.text.tcp_v4_destroy_sock)
1074*(.text.tcp_sync_mss)
1075*(.text.tcp_snd_test)
1076*(.text.tcp_slow_start)
1077*(.text.tcp_send_fin)
1078*(.text.tcp_rtt_estimator)
1079*(.text.tcp_parse_options)
1080*(.text.tcp_ioctl)
1081*(.text.tcp_init_tso_segs)
1082*(.text.tcp_init_cwnd)
1083*(.text.tcp_getsockopt)
1084*(.text.tcp_fin)
1085*(.text.tcp_connect)
1086*(.text.tcp_cong_avoid)
1087*(.text.__tcp_checksum_complete_user)
1088*(.text.task_dumpable)
1089*(.text.sys_wait4)
1090*(.text.sys_utimes)
1091*(.text.sys_symlinkat)
1092*(.text.sys_socketpair)
1093*(.text.sys_rmdir)
1094*(.text.sys_readahead)
1095*(.text.sys_nanosleep)
1096*(.text.sys_linkat)
1097*(.text.sys_fstat)
1098*(.text.sysfs_readdir)
1099*(.text.sys_execve)
1100*(.text.sysenter_tracesys)
1101*(.text.sys_chown)
1102*(.text.stub_clone)
1103*(.text.strrchr)
1104*(.text.strncpy)
1105*(.text.stopmachine_set_state)
1106*(.text.sock_sendmsg)
1107*(.text.sock_release)
1108*(.text.sock_fasync)
1109*(.text.sock_close)
1110*(.text.sk_stream_write_space)
1111*(.text.sk_reset_timer)
1112*(.text.skb_split)
1113*(.text.skb_recv_datagram)
1114*(.text.skb_queue_tail)
1115*(.text.sk_attach_filter)
1116*(.text.si_swapinfo)
1117*(.text.simple_strtoll)
1118*(.text.set_termios)
1119*(.text.set_task_comm)
1120*(.text.set_shrinker)
1121*(.text.set_normalized_timespec)
1122*(.text.set_brk)
1123*(.text.serial_in)
1124*(.text.seq_printf)
1125*(.text.secure_dccp_sequence_number)
1126*(.text.rwlock_bug)
1127*(.text.rt_hash_code)
1128*(.text.__rta_fill)
1129*(.text.__request_resource)
1130*(.text.relocate_new_kernel)
1131*(.text.release_thread)
1132*(.text.release_mem)
1133*(.text.rb_prev)
1134*(.text.rb_first)
1135*(.text.random_poll)
1136*(.text.__put_super_and_need_restart)
1137*(.text.pty_write)
1138*(.text.ptrace_stop)
1139*(.text.proc_self_readlink)
1140*(.text.proc_root_lookup)
1141*(.text.proc_root_link)
1142*(.text.proc_pid_make_inode)
1143*(.text.proc_pid_attr_write)
1144*(.text.proc_lookupfd)
1145*(.text.proc_delete_inode)
1146*(.text.posix_same_owner)
1147*(.text.posix_block_lock)
1148*(.text.poll_initwait)
1149*(.text.pipe_write)
1150*(.text.pipe_read_fasync)
1151*(.text.pipe_ioctl)
1152*(.text.pdflush)
1153*(.text.pci_user_read_config_dword)
1154*(.text.page_readlink)
1155*(.text.null_lseek)
1156*(.text.nf_hook_slow)
1157*(.text.netlink_sock_destruct)
1158*(.text.netlink_broadcast)
1159*(.text.neigh_resolve_output)
1160*(.text.name_to_int)
1161*(.text.mwait_idle)
1162*(.text.mutex_trylock)
1163*(.text.mutex_debug_check_no_locks_held)
1164*(.text.m_stop)
1165*(.text.mpage_end_io_write)
1166*(.text.mpage_alloc)
1167*(.text.move_page_tables)
1168*(.text.mounts_open)
1169*(.text.__memset)
1170*(.text.memcpy_fromiovec)
1171*(.text.make_8259A_irq)
1172*(.text.lookup_user_key_possessed)
1173*(.text.lookup_create)
1174*(.text.locks_insert_lock)
1175*(.text.locks_alloc_lock)
1176*(.text.kthread_should_stop)
1177*(.text.kswapd)
1178*(.text.kobject_uevent)
1179*(.text.kobject_get_path)
1180*(.text.kobject_get)
1181*(.text.klist_children_put)
1182*(.text.__ip_route_output_key)
1183*(.text.ip_flush_pending_frames)
1184*(.text.ip_compute_csum)
1185*(.text.ip_append_data)
1186*(.text.ioc_set_batching)
1187*(.text.invalidate_inode_pages)
1188*(.text.__invalidate_device)
1189*(.text.install_arg_page)
1190*(.text.in_sched_functions)
1191*(.text.inotify_unmount_inodes)
1192*(.text.init_once)
1193*(.text.init_cdrom_command)
1194*(.text.inet_stream_connect)
1195*(.text.inet_sk_rebuild_header)
1196*(.text.inet_csk_addr2sockaddr)
1197*(.text.inet_create)
1198*(.text.ifind)
1199*(.text.ide_setup_dma)
1200*(.text.ide_outsw)
1201*(.text.ide_fixstring)
1202*(.text.ide_dma_setup)
1203*(.text.ide_cdrom_packet)
1204*(.text.ide_cd_put)
1205*(.text.ide_build_sglist)
1206*(.text.i8259A_shutdown)
1207*(.text.hung_up_tty_ioctl)
1208*(.text.hrtimer_nanosleep)
1209*(.text.hrtimer_init)
1210*(.text.hrtimer_cancel)
1211*(.text.hash_futex)
1212*(.text.group_send_sig_info)
1213*(.text.grab_cache_page_nowait)
1214*(.text.get_wchan)
1215*(.text.get_stack)
1216*(.text.get_page_state)
1217*(.text.getnstimeofday)
1218*(.text.get_node)
1219*(.text.get_kprobe)
1220*(.text.generic_unplug_device)
1221*(.text.free_task)
1222*(.text.frag_show)
1223*(.text.find_next_zero_string)
1224*(.text.filp_open)
1225*(.text.fillonedir)
1226*(.text.exit_io_context)
1227*(.text.exit_idle)
1228*(.text.exact_lock)
1229*(.text.eth_header)
1230*(.text.dummy_unregister_security)
1231*(.text.dummy_socket_post_create)
1232*(.text.dummy_socket_listen)
1233*(.text.dummy_quota_on)
1234*(.text.dummy_inode_follow_link)
1235*(.text.dummy_file_receive)
1236*(.text.dummy_file_mprotect)
1237*(.text.dummy_file_lock)
1238*(.text.dummy_file_ioctl)
1239*(.text.dummy_bprm_post_apply_creds)
1240*(.text.do_writepages)
1241*(.text.__down_interruptible)
1242*(.text.do_notify_resume)
1243*(.text.do_acct_process)
1244*(.text.del_timer_sync)
1245*(.text.default_rebuild_header)
1246*(.text.d_callback)
1247*(.text.dcache_readdir)
1248*(.text.ctrl_dumpfamily)
1249*(.text.cpuset_rmdir)
1250*(.text.copy_strings_kernel)
1251*(.text.con_write_room)
1252*(.text.complete_all)
1253*(.text.collect_sigign_sigcatch)
1254*(.text.clear_user)
1255*(.text.check_unthrottle)
1256*(.text.cdrom_release)
1257*(.text.cdrom_newpc_intr)
1258*(.text.cdrom_ioctl)
1259*(.text.cdrom_check_status)
1260*(.text.cdev_put)
1261*(.text.cdev_add)
1262*(.text.cap_ptrace)
1263*(.text.cap_bprm_secureexec)
1264*(.text.cache_alloc_refill)
1265*(.text.bmap)
1266*(.text.blk_run_queue)
1267*(.text.blk_queue_dma_alignment)
1268*(.text.blk_ordered_req_seq)
1269*(.text.blk_backing_dev_unplug)
1270*(.text.__bitmap_subset)
1271*(.text.__bitmap_and)
1272*(.text.bio_unmap_user)
1273*(.text.__bforget)
1274*(.text.bd_forget)
1275*(.text.bad_pipe_w)
1276*(.text.bad_get_user)
1277*(.text.audit_free)
1278*(.text.anon_vma_ctor)
1279*(.text.anon_pipe_buf_map)
1280*(.text.alloc_sock_iocb)
1281*(.text.alloc_fdset)
1282*(.text.aio_kick_handler)
1283*(.text.__add_entropy_words)
1284*(.text.add_disk_randomness)
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 0b3603adf56..47496a40e84 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -11,120 +11,54 @@
11#include <linux/threads.h> 11#include <linux/threads.h>
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <linux/string.h> 13#include <linux/string.h>
14#include <linux/module.h>
14#include <linux/kernel.h> 15#include <linux/kernel.h>
15#include <linux/ctype.h> 16#include <linux/ctype.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/module.h>
18 18
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/ipi.h> 20#include <asm/ipi.h>
21#include <asm/genapic.h>
21 22
22#if defined(CONFIG_ACPI) 23#ifdef CONFIG_ACPI
23#include <acpi/acpi_bus.h> 24#include <acpi/acpi_bus.h>
24#endif 25#endif
25 26
26/* which logical CPU number maps to which CPU (physical APIC ID) */ 27/* which logical CPU number maps to which CPU (physical APIC ID) */
27u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; 28u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
29 = { [0 ... NR_CPUS-1] = BAD_APICID };
28EXPORT_SYMBOL(x86_cpu_to_apicid); 30EXPORT_SYMBOL(x86_cpu_to_apicid);
29u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
30 31
31extern struct genapic apic_cluster; 32u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
32extern struct genapic apic_flat;
33extern struct genapic apic_physflat;
34 33
35struct genapic *genapic = &apic_flat; 34struct genapic __read_mostly *genapic = &apic_flat;
36struct genapic *genapic_force;
37 35
38/* 36/*
39 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 37 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
40 */ 38 */
41void __init clustered_apic_check(void) 39void __init setup_apic_routing(void)
42{ 40{
43 long i; 41#ifdef CONFIG_ACPI
44 u8 clusters, max_cluster;
45 u8 id;
46 u8 cluster_cnt[NUM_APIC_CLUSTERS];
47 int max_apic = 0;
48
49 /* genapic selection can be forced because of certain quirks.
50 */
51 if (genapic_force) {
52 genapic = genapic_force;
53 goto print;
54 }
55
56#if defined(CONFIG_ACPI)
57 /* 42 /*
58 * Some x86_64 machines use physical APIC mode regardless of how many 43 * Quirk: some x86_64 machines can only use physical APIC mode
59 * procs/clusters are present (x86_64 ES7000 is an example). 44 * regardless of how many processors are present (x86_64 ES7000
45 * is an example).
60 */ 46 */
61 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID) 47 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
62 if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) { 48 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
63 genapic = &apic_cluster;
64 goto print;
65 }
66#endif
67
68 memset(cluster_cnt, 0, sizeof(cluster_cnt));
69 for (i = 0; i < NR_CPUS; i++) {
70 id = bios_cpu_apicid[i];
71 if (id == BAD_APICID)
72 continue;
73 if (id > max_apic)
74 max_apic = id;
75 cluster_cnt[APIC_CLUSTERID(id)]++;
76 }
77
78 /* Don't use clustered mode on AMD platforms. */
79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
80 genapic = &apic_physflat; 49 genapic = &apic_physflat;
81#ifndef CONFIG_HOTPLUG_CPU 50 else
82 /* In the CPU hotplug case we cannot use broadcast mode
83 because that opens a race when a CPU is removed.
84 Stay at physflat mode in this case.
85 It is bad to do this unconditionally though. Once
86 we have ACPI platform support for CPU hotplug
87 we should detect hotplug capablity from ACPI tables and
88 only do this when really needed. -AK */
89 if (max_apic <= 8)
90 genapic = &apic_flat;
91#endif 51#endif
92 goto print;
93 }
94 52
95 clusters = 0; 53 if (cpus_weight(cpu_possible_map) <= 8)
96 max_cluster = 0;
97
98 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
99 if (cluster_cnt[i] > 0) {
100 ++clusters;
101 if (cluster_cnt[i] > max_cluster)
102 max_cluster = cluster_cnt[i];
103 }
104 }
105
106 /*
107 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
108 * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
109 * else physical mode.
110 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
111 * can ignore the clustered logical case and go straight to physical.)
112 */
113 if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
114#ifdef CONFIG_HOTPLUG_CPU
115 /* Don't use APIC shortcuts in CPU hotplug to avoid races */
116 genapic = &apic_physflat;
117#else
118 genapic = &apic_flat; 54 genapic = &apic_flat;
119#endif 55 else
120 } else 56 genapic = &apic_physflat;
121 genapic = &apic_cluster;
122 57
123print:
124 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 58 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
125} 59}
126 60
127/* Same for both flat and clustered. */ 61/* Same for both flat and physical. */
128 62
129void send_IPI_self(int vector) 63void send_IPI_self(int vector)
130{ 64{
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
deleted file mode 100644
index 73d76308b95..00000000000
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Clustered APIC subarch code. Up to 255 CPUs, physical delivery.
6 * (A more realistic maximum is around 230 CPUs.)
7 *
8 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
9 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
10 * James Cleverdon.
11 */
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <asm/smp.h>
19#include <asm/ipi.h>
20
21
22/*
23 * Set up the logical destination ID.
24 *
25 * Intel recommends to set DFR, LDR and TPR before enabling
26 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
27 * document number 292116). So here it goes...
28 */
29static void cluster_init_apic_ldr(void)
30{
31 unsigned long val, id;
32 long i, count;
33 u8 lid;
34 u8 my_id = hard_smp_processor_id();
35 u8 my_cluster = APIC_CLUSTER(my_id);
36
37 /* Create logical APIC IDs by counting CPUs already in cluster. */
38 for (count = 0, i = NR_CPUS; --i >= 0; ) {
39 lid = x86_cpu_to_log_apicid[i];
40 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
41 ++count;
42 }
43 /*
44 * We only have a 4 wide bitmap in cluster mode. There's no way
45 * to get above 60 CPUs and still give each one it's own bit.
46 * But, we're using physical IRQ delivery, so we don't care.
47 * Use bit 3 for the 4th through Nth CPU in each cluster.
48 */
49 if (count >= XAPIC_DEST_CPUS_SHIFT)
50 count = 3;
51 id = my_cluster | (1UL << count);
52 x86_cpu_to_log_apicid[smp_processor_id()] = id;
53 apic_write(APIC_DFR, APIC_DFR_CLUSTER);
54 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
55 val |= SET_APIC_LOGICAL_ID(id);
56 apic_write(APIC_LDR, val);
57}
58
59/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
60
61static cpumask_t cluster_target_cpus(void)
62{
63 return cpumask_of_cpu(0);
64}
65
66static cpumask_t cluster_vector_allocation_domain(int cpu)
67{
68 cpumask_t domain = CPU_MASK_NONE;
69 cpu_set(cpu, domain);
70 return domain;
71}
72
73static void cluster_send_IPI_mask(cpumask_t mask, int vector)
74{
75 send_IPI_mask_sequence(mask, vector);
76}
77
78static void cluster_send_IPI_allbutself(int vector)
79{
80 cpumask_t mask = cpu_online_map;
81
82 cpu_clear(smp_processor_id(), mask);
83
84 if (!cpus_empty(mask))
85 cluster_send_IPI_mask(mask, vector);
86}
87
88static void cluster_send_IPI_all(int vector)
89{
90 cluster_send_IPI_mask(cpu_online_map, vector);
91}
92
93static int cluster_apic_id_registered(void)
94{
95 return 1;
96}
97
98static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
99{
100 int cpu;
101
102 /*
103 * We're using fixed IRQ delivery, can only return one phys APIC ID.
104 * May as well be the first.
105 */
106 cpu = first_cpu(cpumask);
107 if ((unsigned)cpu < NR_CPUS)
108 return x86_cpu_to_apicid[cpu];
109 else
110 return BAD_APICID;
111}
112
113/* cpuid returns the value latched in the HW at reset, not the APIC ID
114 * register's value. For any box whose BIOS changes APIC IDs, like
115 * clustered APIC systems, we must use hard_smp_processor_id.
116 *
117 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
118 */
119static unsigned int phys_pkg_id(int index_msb)
120{
121 return hard_smp_processor_id() >> index_msb;
122}
123
124struct genapic apic_cluster = {
125 .name = "clustered",
126 .int_delivery_mode = dest_Fixed,
127 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
128 .target_cpus = cluster_target_cpus,
129 .vector_allocation_domain = cluster_vector_allocation_domain,
130 .apic_id_registered = cluster_apic_id_registered,
131 .init_apic_ldr = cluster_init_apic_ldr,
132 .send_IPI_all = cluster_send_IPI_all,
133 .send_IPI_allbutself = cluster_send_IPI_allbutself,
134 .send_IPI_mask = cluster_send_IPI_mask,
135 .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
136 .phys_pkg_id = phys_pkg_id,
137};
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 7c01db8fa9d..ecb01eefdd2 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -8,6 +8,7 @@
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and 8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon. 9 * James Cleverdon.
10 */ 10 */
11#include <linux/errno.h>
11#include <linux/threads.h> 12#include <linux/threads.h>
12#include <linux/cpumask.h> 13#include <linux/cpumask.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -16,6 +17,7 @@
16#include <linux/init.h> 17#include <linux/init.h>
17#include <asm/smp.h> 18#include <asm/smp.h>
18#include <asm/ipi.h> 19#include <asm/ipi.h>
20#include <asm/genapic.h>
19 21
20static cpumask_t flat_target_cpus(void) 22static cpumask_t flat_target_cpus(void)
21{ 23{
@@ -60,31 +62,10 @@ static void flat_init_apic_ldr(void)
60static void flat_send_IPI_mask(cpumask_t cpumask, int vector) 62static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
61{ 63{
62 unsigned long mask = cpus_addr(cpumask)[0]; 64 unsigned long mask = cpus_addr(cpumask)[0];
63 unsigned long cfg;
64 unsigned long flags; 65 unsigned long flags;
65 66
66 local_irq_save(flags); 67 local_irq_save(flags);
67 68 __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
68 /*
69 * Wait for idle.
70 */
71 apic_wait_icr_idle();
72
73 /*
74 * prepare target chip field
75 */
76 cfg = __prepare_ICR2(mask);
77 apic_write(APIC_ICR2, cfg);
78
79 /*
80 * program the ICR
81 */
82 cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL);
83
84 /*
85 * Send the IPI. The write to APIC_ICR fires this off.
86 */
87 apic_write(APIC_ICR, cfg);
88 local_irq_restore(flags); 69 local_irq_restore(flags);
89} 70}
90 71
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 598a4d0351f..1fab487dee8 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,6 +5,7 @@
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> 6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> 7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
8 */ 9 */
9 10
10 11
@@ -13,97 +14,131 @@
13#include <linux/init.h> 14#include <linux/init.h>
14#include <asm/desc.h> 15#include <asm/desc.h>
15#include <asm/segment.h> 16#include <asm/segment.h>
17#include <asm/pgtable.h>
16#include <asm/page.h> 18#include <asm/page.h>
17#include <asm/msr.h> 19#include <asm/msr.h>
18#include <asm/cache.h> 20#include <asm/cache.h>
19 21
20/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 22/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
21 * because we need identity-mapped pages on setup so define __START_KERNEL to 23 * because we need identity-mapped pages.
22 * 0x100000 for this stage 24 *
23 *
24 */ 25 */
25 26
26 .text 27 .text
27 .section .bootstrap.text 28 .section .bootstrap.text
28 .code32 29 .code64
29 .globl startup_32 30 .globl startup_64
30/* %bx: 1 if coming from smp trampoline on secondary cpu */ 31startup_64:
31startup_32: 32
32
33 /* 33 /*
34 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with 34 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
35 * paging disabled and the point of this file is to switch to 64bit 35 * and someone has loaded an identity mapped page table
36 * long mode with a kernel mapping for kerneland to jump into the 36 * for us. These identity mapped page tables map all of the
37 * kernel virtual addresses. 37 * kernel pages and possibly all of memory.
38 * There is no stack until we set one up. 38 *
39 * %esi holds a physical pointer to real_mode_data.
40 *
41 * We come here either directly from a 64bit bootloader, or from
42 * arch/x86_64/boot/compressed/head.S.
43 *
44 * We only come here initially at boot nothing else comes here.
45 *
46 * Since we may be loaded at an address different from what we were
47 * compiled to run at we first fixup the physical addresses in our page
48 * tables and then reload them.
39 */ 49 */
40 50
41 /* Initialize the %ds segment register */ 51 /* Compute the delta between the address I am compiled to run at and the
42 movl $__KERNEL_DS,%eax 52 * address I am actually running at.
43 movl %eax,%ds
44
45 /* Load new GDT with the 64bit segments using 32bit descriptor */
46 lgdt pGDT32 - __START_KERNEL_map
47
48 /* If the CPU doesn't support CPUID this will double fault.
49 * Unfortunately it is hard to check for CPUID without a stack.
50 */ 53 */
51 54 leaq _text(%rip), %rbp
52 /* Check if extended functions are implemented */ 55 subq $_text - __START_KERNEL_map, %rbp
53 movl $0x80000000, %eax 56
54 cpuid 57 /* Is the address not 2M aligned? */
55 cmpl $0x80000000, %eax 58 movq %rbp, %rax
56 jbe no_long_mode 59 andl $~LARGE_PAGE_MASK, %eax
57 /* Check if long mode is implemented */ 60 testl %eax, %eax
58 mov $0x80000001, %eax 61 jnz bad_address
59 cpuid 62
60 btl $29, %edx 63 /* Is the address too large? */
61 jnc no_long_mode 64 leaq _text(%rip), %rdx
62 65 movq $PGDIR_SIZE, %rax
63 /* 66 cmpq %rax, %rdx
64 * Prepare for entering 64bits mode 67 jae bad_address
68
69 /* Fixup the physical addresses in the page table
65 */ 70 */
71 addq %rbp, init_level4_pgt + 0(%rip)
72 addq %rbp, init_level4_pgt + (258*8)(%rip)
73 addq %rbp, init_level4_pgt + (511*8)(%rip)
74
75 addq %rbp, level3_ident_pgt + 0(%rip)
76 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
77
78 /* Add an Identity mapping if I am above 1G */
79 leaq _text(%rip), %rdi
80 andq $LARGE_PAGE_MASK, %rdi
81
82 movq %rdi, %rax
83 shrq $PUD_SHIFT, %rax
84 andq $(PTRS_PER_PUD - 1), %rax
85 jz ident_complete
86
87 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
88 leaq level3_ident_pgt(%rip), %rbx
89 movq %rdx, 0(%rbx, %rax, 8)
90
91 movq %rdi, %rax
92 shrq $PMD_SHIFT, %rax
93 andq $(PTRS_PER_PMD - 1), %rax
94 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
95 leaq level2_spare_pgt(%rip), %rbx
96 movq %rdx, 0(%rbx, %rax, 8)
97ident_complete:
98
99 /* Fixup the kernel text+data virtual addresses
100 */
101 leaq level2_kernel_pgt(%rip), %rdi
102 leaq 4096(%rdi), %r8
103 /* See if it is a valid page table entry */
1041: testq $1, 0(%rdi)
105 jz 2f
106 addq %rbp, 0(%rdi)
107 /* Go to the next page */
1082: addq $8, %rdi
109 cmp %r8, %rdi
110 jne 1b
111
112 /* Fixup phys_base */
113 addq %rbp, phys_base(%rip)
66 114
67 /* Enable PAE mode */ 115#ifdef CONFIG_SMP
68 xorl %eax, %eax 116 addq %rbp, trampoline_level4_pgt + 0(%rip)
69 btsl $5, %eax 117 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
70 movl %eax, %cr4 118#endif
71 119#ifdef CONFIG_ACPI_SLEEP
72 /* Setup early boot stage 4 level pagetables */ 120 addq %rbp, wakeup_level4_pgt + 0(%rip)
73 movl $(boot_level4_pgt - __START_KERNEL_map), %eax 121 addq %rbp, wakeup_level4_pgt + (511*8)(%rip)
74 movl %eax, %cr3 122#endif
75
76 /* Setup EFER (Extended Feature Enable Register) */
77 movl $MSR_EFER, %ecx
78 rdmsr
79
80 /* Enable Long Mode */
81 btsl $_EFER_LME, %eax
82
83 /* Make changes effective */
84 wrmsr
85 123
86 xorl %eax, %eax 124 /* Due to ENTRY(), sometimes the empty space gets filled with
87 btsl $31, %eax /* Enable paging and in turn activate Long Mode */ 125 * zeros. Better take a jmp than relying on empty space being
88 btsl $0, %eax /* Enable protected mode */ 126 * filled with 0x90 (nop)
89 /* Make changes effective */
90 movl %eax, %cr0
91 /*
92 * At this point we're in long mode but in 32bit compatibility mode
93 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
94 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
95 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
96 */ 127 */
97 ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) 128 jmp secondary_startup_64
98 129ENTRY(secondary_startup_64)
99 .code64 130 /*
100 .org 0x100 131 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
101 .globl startup_64 132 * and someone has loaded a mapped page table.
102startup_64: 133 *
103 /* We come here either from startup_32 134 * %esi holds a physical pointer to real_mode_data.
104 * or directly from a 64bit bootloader. 135 *
105 * Since we may have come directly from a bootloader we 136 * We come here either from startup_64 (using physical addresses)
106 * reload the page tables here. 137 * or from trampoline.S (using virtual addresses).
138 *
139 * Using virtual addresses from trampoline.S removes the need
140 * to have any identity mapped pages in the kernel page table
141 * after the boot processor executes this code.
107 */ 142 */
108 143
109 /* Enable PAE mode and PGE */ 144 /* Enable PAE mode and PGE */
@@ -113,9 +148,15 @@ startup_64:
113 movq %rax, %cr4 148 movq %rax, %cr4
114 149
115 /* Setup early boot stage 4 level pagetables. */ 150 /* Setup early boot stage 4 level pagetables. */
116 movq $(boot_level4_pgt - __START_KERNEL_map), %rax 151 movq $(init_level4_pgt - __START_KERNEL_map), %rax
152 addq phys_base(%rip), %rax
117 movq %rax, %cr3 153 movq %rax, %cr3
118 154
155 /* Ensure I am executing from virtual addresses */
156 movq $1f, %rax
157 jmp *%rax
1581:
159
119 /* Check if nx is implemented */ 160 /* Check if nx is implemented */
120 movl $0x80000001, %eax 161 movl $0x80000001, %eax
121 cpuid 162 cpuid
@@ -124,17 +165,11 @@ startup_64:
124 /* Setup EFER (Extended Feature Enable Register) */ 165 /* Setup EFER (Extended Feature Enable Register) */
125 movl $MSR_EFER, %ecx 166 movl $MSR_EFER, %ecx
126 rdmsr 167 rdmsr
127 168 btsl $_EFER_SCE, %eax /* Enable System Call */
128 /* Enable System Call */ 169 btl $20,%edi /* No Execute supported? */
129 btsl $_EFER_SCE, %eax
130
131 /* No Execute supported? */
132 btl $20,%edi
133 jnc 1f 170 jnc 1f
134 btsl $_EFER_NX, %eax 171 btsl $_EFER_NX, %eax
1351: 1721: wrmsr /* Make changes effective */
136 /* Make changes effective */
137 wrmsr
138 173
139 /* Setup cr0 */ 174 /* Setup cr0 */
140#define CR0_PM 1 /* protected mode */ 175#define CR0_PM 1 /* protected mode */
@@ -161,7 +196,7 @@ startup_64:
161 * addresses where we're currently running on. We have to do that here 196 * addresses where we're currently running on. We have to do that here
162 * because in 32bit we couldn't load a 64bit linear address. 197 * because in 32bit we couldn't load a 64bit linear address.
163 */ 198 */
164 lgdt cpu_gdt_descr 199 lgdt cpu_gdt_descr(%rip)
165 200
166 /* set up data segments. actually 0 would do too */ 201 /* set up data segments. actually 0 would do too */
167 movl $__KERNEL_DS,%eax 202 movl $__KERNEL_DS,%eax
@@ -212,6 +247,9 @@ initial_code:
212init_rsp: 247init_rsp:
213 .quad init_thread_union+THREAD_SIZE-8 248 .quad init_thread_union+THREAD_SIZE-8
214 249
250bad_address:
251 jmp bad_address
252
215ENTRY(early_idt_handler) 253ENTRY(early_idt_handler)
216 cmpl $2,early_recursion_flag(%rip) 254 cmpl $2,early_recursion_flag(%rip)
217 jz 1f 255 jz 1f
@@ -240,110 +278,66 @@ early_idt_msg:
240early_idt_ripmsg: 278early_idt_ripmsg:
241 .asciz "RIP %s\n" 279 .asciz "RIP %s\n"
242 280
243.code32 281.balign PAGE_SIZE
244ENTRY(no_long_mode)
245 /* This isn't an x86-64 CPU so hang */
2461:
247 jmp 1b
248
249.org 0xf00
250 .globl pGDT32
251pGDT32:
252 .word gdt_end-cpu_gdt_table-1
253 .long cpu_gdt_table-__START_KERNEL_map
254
255.org 0xf10
256ljumpvector:
257 .long startup_64-__START_KERNEL_map
258 .word __KERNEL_CS
259 282
260ENTRY(stext)
261ENTRY(_stext)
262
263 $page = 0
264#define NEXT_PAGE(name) \ 283#define NEXT_PAGE(name) \
265 $page = $page + 1; \ 284 .balign PAGE_SIZE; \
266 .org $page * 0x1000; \
267 phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
268ENTRY(name) 285ENTRY(name)
269 286
287/* Automate the creation of 1 to 1 mapping pmd entries */
288#define PMDS(START, PERM, COUNT) \
289 i = 0 ; \
290 .rept (COUNT) ; \
291 .quad (START) + (i << 21) + (PERM) ; \
292 i = i + 1 ; \
293 .endr
294
295 /*
296 * This default setting generates an ident mapping at address 0x100000
297 * and a mapping for the kernel that precisely maps virtual address
298 * 0xffffffff80000000 to physical address 0x000000. (always using
299 * 2Mbyte large pages provided by PAE mode)
300 */
270NEXT_PAGE(init_level4_pgt) 301NEXT_PAGE(init_level4_pgt)
271 /* This gets initialized in x86_64_start_kernel */ 302 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
272 .fill 512,8,0 303 .fill 257,8,0
304 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
305 .fill 252,8,0
306 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
307 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
273 308
274NEXT_PAGE(level3_ident_pgt) 309NEXT_PAGE(level3_ident_pgt)
275 .quad phys_level2_ident_pgt | 0x007 310 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
276 .fill 511,8,0 311 .fill 511,8,0
277 312
278NEXT_PAGE(level3_kernel_pgt) 313NEXT_PAGE(level3_kernel_pgt)
279 .fill 510,8,0 314 .fill 510,8,0
280 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 315 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
281 .quad phys_level2_kernel_pgt | 0x007 316 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
282 .fill 1,8,0 317 .fill 1,8,0
283 318
284NEXT_PAGE(level2_ident_pgt) 319NEXT_PAGE(level2_ident_pgt)
285 /* 40MB for bootup. */ 320 /* Since I easily can, map the first 1G.
286 i = 0 321 * Don't set NX because code runs from these pages.
287 .rept 20 322 */
288 .quad i << 21 | 0x083 323 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
289 i = i + 1 324
290 .endr
291 /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
292 .globl temp_boot_pmds
293temp_boot_pmds:
294 .fill 492,8,0
295
296NEXT_PAGE(level2_kernel_pgt) 325NEXT_PAGE(level2_kernel_pgt)
297 /* 40MB kernel mapping. The kernel code cannot be bigger than that. 326 /* 40MB kernel mapping. The kernel code cannot be bigger than that.
298 When you change this change KERNEL_TEXT_SIZE in page.h too. */ 327 When you change this change KERNEL_TEXT_SIZE in page.h too. */
299 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ 328 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
300 i = 0 329 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
301 .rept 20 330 KERNEL_TEXT_SIZE/PMD_SIZE)
302 .quad i << 21 | 0x183
303 i = i + 1
304 .endr
305 /* Module mapping starts here */ 331 /* Module mapping starts here */
306 .fill 492,8,0 332 .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
307 333
308NEXT_PAGE(level3_physmem_pgt) 334NEXT_PAGE(level2_spare_pgt)
309 .quad phys_level2_kernel_pgt | 0x007 /* so that __va works even before pagetable_init */ 335 .fill 512,8,0
310 .fill 511,8,0
311 336
337#undef PMDS
312#undef NEXT_PAGE 338#undef NEXT_PAGE
313 339
314 .data 340 .data
315
316#ifdef CONFIG_ACPI_SLEEP
317 .align PAGE_SIZE
318ENTRY(wakeup_level4_pgt)
319 .quad phys_level3_ident_pgt | 0x007
320 .fill 255,8,0
321 .quad phys_level3_physmem_pgt | 0x007
322 .fill 254,8,0
323 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
324 .quad phys_level3_kernel_pgt | 0x007
325#endif
326
327#ifndef CONFIG_HOTPLUG_CPU
328 __INITDATA
329#endif
330 /*
331 * This default setting generates an ident mapping at address 0x100000
332 * and a mapping for the kernel that precisely maps virtual address
333 * 0xffffffff80000000 to physical address 0x000000. (always using
334 * 2Mbyte large pages provided by PAE mode)
335 */
336 .align PAGE_SIZE
337ENTRY(boot_level4_pgt)
338 .quad phys_level3_ident_pgt | 0x007
339 .fill 255,8,0
340 .quad phys_level3_physmem_pgt | 0x007
341 .fill 254,8,0
342 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
343 .quad phys_level3_kernel_pgt | 0x007
344
345 .data
346
347 .align 16 341 .align 16
348 .globl cpu_gdt_descr 342 .globl cpu_gdt_descr
349cpu_gdt_descr: 343cpu_gdt_descr:
@@ -357,6 +351,10 @@ gdt:
357 .endr 351 .endr
358#endif 352#endif
359 353
354ENTRY(phys_base)
355 /* This must match the first entry in level2_kernel_pgt */
356 .quad 0x0000000000000000
357
360/* We need valid kernel segments for data and code in long mode too 358/* We need valid kernel segments for data and code in long mode too
361 * IRET will check the segment types kkeil 2000/10/28 359 * IRET will check the segment types kkeil 2000/10/28
362 * Also sysret mandates a special GDT layout 360 * Also sysret mandates a special GDT layout
@@ -370,13 +368,13 @@ gdt:
370 368
371ENTRY(cpu_gdt_table) 369ENTRY(cpu_gdt_table)
372 .quad 0x0000000000000000 /* NULL descriptor */ 370 .quad 0x0000000000000000 /* NULL descriptor */
371 .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
372 .quad 0x00af9b000000ffff /* __KERNEL_CS */
373 .quad 0x00cf93000000ffff /* __KERNEL_DS */
374 .quad 0x00cffb000000ffff /* __USER32_CS */
375 .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
376 .quad 0x00affb000000ffff /* __USER_CS */
373 .quad 0x0 /* unused */ 377 .quad 0x0 /* unused */
374 .quad 0x00af9a000000ffff /* __KERNEL_CS */
375 .quad 0x00cf92000000ffff /* __KERNEL_DS */
376 .quad 0x00cffa000000ffff /* __USER32_CS */
377 .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
378 .quad 0x00affa000000ffff /* __USER_CS */
379 .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
380 .quad 0,0 /* TSS */ 378 .quad 0,0 /* TSS */
381 .quad 0,0 /* LDT */ 379 .quad 0,0 /* LDT */
382 .quad 0,0,0 /* three TLS descriptors */ 380 .quad 0,0,0 /* three TLS descriptors */
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 5f197b0a330..213d90e0475 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -18,8 +18,16 @@
18#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21#include <asm/tlbflush.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
22 23
24static void __init zap_identity_mappings(void)
25{
26 pgd_t *pgd = pgd_offset_k(0UL);
27 pgd_clear(pgd);
28 __flush_tlb();
29}
30
23/* Don't add a printk in there. printk relies on the PDA which is not initialized 31/* Don't add a printk in there. printk relies on the PDA which is not initialized
24 yet. */ 32 yet. */
25static void __init clear_bss(void) 33static void __init clear_bss(void)
@@ -29,25 +37,24 @@ static void __init clear_bss(void)
29} 37}
30 38
31#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ 39#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
32#define OLD_CL_MAGIC_ADDR 0x90020 40#define OLD_CL_MAGIC_ADDR 0x20
33#define OLD_CL_MAGIC 0xA33F 41#define OLD_CL_MAGIC 0xA33F
34#define OLD_CL_BASE_ADDR 0x90000 42#define OLD_CL_OFFSET 0x22
35#define OLD_CL_OFFSET 0x90022
36 43
37static void __init copy_bootdata(char *real_mode_data) 44static void __init copy_bootdata(char *real_mode_data)
38{ 45{
39 int new_data; 46 unsigned long new_data;
40 char * command_line; 47 char * command_line;
41 48
42 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); 49 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
43 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); 50 new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
44 if (!new_data) { 51 if (!new_data) {
45 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { 52 if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
46 return; 53 return;
47 } 54 }
48 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; 55 new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
49 } 56 }
50 command_line = (char *) ((u64)(new_data)); 57 command_line = __va(new_data);
51 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 58 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
52} 59}
53 60
@@ -55,26 +62,30 @@ void __init x86_64_start_kernel(char * real_mode_data)
55{ 62{
56 int i; 63 int i;
57 64
65 /*
66 * Make sure kernel is aligned to 2MB address. Catching it at compile
67 * time is better. Change your config file and compile the kernel
68 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
69 */
70 BUILD_BUG_ON(CONFIG_PHYSICAL_START & (__KERNEL_ALIGN - 1));
71
58 /* clear bss before set_intr_gate with early_idt_handler */ 72 /* clear bss before set_intr_gate with early_idt_handler */
59 clear_bss(); 73 clear_bss();
60 74
75 /* Make NULL pointers segfault */
76 zap_identity_mappings();
77
61 for (i = 0; i < IDT_ENTRIES; i++) 78 for (i = 0; i < IDT_ENTRIES; i++)
62 set_intr_gate(i, early_idt_handler); 79 set_intr_gate(i, early_idt_handler);
63 asm volatile("lidt %0" :: "m" (idt_descr)); 80 asm volatile("lidt %0" :: "m" (idt_descr));
64 81
65 early_printk("Kernel alive\n"); 82 early_printk("Kernel alive\n");
66 83
67 /*
68 * switch to init_level4_pgt from boot_level4_pgt
69 */
70 memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
71 asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
72
73 for (i = 0; i < NR_CPUS; i++) 84 for (i = 0; i < NR_CPUS; i++)
74 cpu_pda(i) = &boot_cpu_pda[i]; 85 cpu_pda(i) = &boot_cpu_pda[i];
75 86
76 pda_init(0); 87 pda_init(0);
77 copy_bootdata(real_mode_data); 88 copy_bootdata(__va(real_mode_data));
78#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
79 cpu_set(0, cpu_online_map); 90 cpu_set(0, cpu_online_map);
80#endif 91#endif
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index b7d2b76b92d..2a2df14dab7 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -907,10 +907,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
907 enable_8259A_irq(0); 907 enable_8259A_irq(0);
908} 908}
909 909
910void __init UNEXPECTED_IO_APIC(void)
911{
912}
913
914void __apicdebuginit print_IO_APIC(void) 910void __apicdebuginit print_IO_APIC(void)
915{ 911{
916 int apic, i; 912 int apic, i;
@@ -946,40 +942,16 @@ void __apicdebuginit print_IO_APIC(void)
946 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); 942 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
947 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 943 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
948 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 944 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
949 if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
950 UNEXPECTED_IO_APIC();
951 945
952 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 946 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
953 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 947 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
954 if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
955 (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
956 (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
957 (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
958 (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
959 (reg_01.bits.entries != 0x2E) &&
960 (reg_01.bits.entries != 0x3F) &&
961 (reg_01.bits.entries != 0x03)
962 )
963 UNEXPECTED_IO_APIC();
964 948
965 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 949 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
966 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 950 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
967 if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
968 (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
969 (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
970 (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
971 (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
972 (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
973 )
974 UNEXPECTED_IO_APIC();
975 if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
976 UNEXPECTED_IO_APIC();
977 951
978 if (reg_01.bits.version >= 0x10) { 952 if (reg_01.bits.version >= 0x10) {
979 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); 953 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
980 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); 954 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
981 if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
982 UNEXPECTED_IO_APIC();
983 } 955 }
984 956
985 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 957 printk(KERN_DEBUG ".... IRQ redirection table:\n");
@@ -1407,8 +1379,7 @@ static void irq_complete_move(unsigned int irq)
1407 1379
1408 vector = ~get_irq_regs()->orig_rax; 1380 vector = ~get_irq_regs()->orig_rax;
1409 me = smp_processor_id(); 1381 me = smp_processor_id();
1410 if ((vector == cfg->vector) && 1382 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1411 cpu_isset(smp_processor_id(), cfg->domain)) {
1412 cpumask_t cleanup_mask; 1383 cpumask_t cleanup_mask;
1413 1384
1414 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 1385 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index 745b1f0f494..387d347b0e0 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -16,6 +16,7 @@
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/thread_info.h> 18#include <linux/thread_info.h>
19#include <linux/syscalls.h>
19 20
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 21/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) 22static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 0497e3bd5bf..a8bb33c1a8f 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -191,19 +191,19 @@ NORET_TYPE void machine_kexec(struct kimage *image)
191 191
192 page_list[PA_CONTROL_PAGE] = __pa(control_page); 192 page_list[PA_CONTROL_PAGE] = __pa(control_page);
193 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 193 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
194 page_list[PA_PGD] = __pa(kexec_pgd); 194 page_list[PA_PGD] = __pa_symbol(&kexec_pgd);
195 page_list[VA_PGD] = (unsigned long)kexec_pgd; 195 page_list[VA_PGD] = (unsigned long)kexec_pgd;
196 page_list[PA_PUD_0] = __pa(kexec_pud0); 196 page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0);
197 page_list[VA_PUD_0] = (unsigned long)kexec_pud0; 197 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
198 page_list[PA_PMD_0] = __pa(kexec_pmd0); 198 page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0);
199 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; 199 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
200 page_list[PA_PTE_0] = __pa(kexec_pte0); 200 page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0);
201 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 201 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
202 page_list[PA_PUD_1] = __pa(kexec_pud1); 202 page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1);
203 page_list[VA_PUD_1] = (unsigned long)kexec_pud1; 203 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
204 page_list[PA_PMD_1] = __pa(kexec_pmd1); 204 page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1);
205 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; 205 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
206 page_list[PA_PTE_1] = __pa(kexec_pte1); 206 page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1);
207 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 207 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
208 208
209 page_list[PA_TABLE_PAGE] = 209 page_list[PA_TABLE_PAGE] =
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 8011a8e1c7d..fa267268247 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -323,10 +323,13 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
323#endif /* CONFIG_X86_MCE_INTEL */ 323#endif /* CONFIG_X86_MCE_INTEL */
324 324
325/* 325/*
326 * Periodic polling timer for "silent" machine check errors. 326 * Periodic polling timer for "silent" machine check errors. If the
327 * poller finds an MCE, poll 2x faster. When the poller finds no more
328 * errors, poll 2x slower (up to check_interval seconds).
327 */ 329 */
328 330
329static int check_interval = 5 * 60; /* 5 minutes */ 331static int check_interval = 5 * 60; /* 5 minutes */
332static int next_interval; /* in jiffies */
330static void mcheck_timer(struct work_struct *work); 333static void mcheck_timer(struct work_struct *work);
331static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 334static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
332 335
@@ -339,7 +342,6 @@ static void mcheck_check_cpu(void *info)
339static void mcheck_timer(struct work_struct *work) 342static void mcheck_timer(struct work_struct *work)
340{ 343{
341 on_each_cpu(mcheck_check_cpu, NULL, 1, 1); 344 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
342 schedule_delayed_work(&mcheck_work, check_interval * HZ);
343 345
344 /* 346 /*
345 * It's ok to read stale data here for notify_user and 347 * It's ok to read stale data here for notify_user and
@@ -349,17 +351,30 @@ static void mcheck_timer(struct work_struct *work)
349 * writes. 351 * writes.
350 */ 352 */
351 if (notify_user && console_logged) { 353 if (notify_user && console_logged) {
354 static unsigned long last_print;
355 unsigned long now = jiffies;
356
357 /* if we logged an MCE, reduce the polling interval */
358 next_interval = max(next_interval/2, HZ/100);
352 notify_user = 0; 359 notify_user = 0;
353 clear_bit(0, &console_logged); 360 clear_bit(0, &console_logged);
354 printk(KERN_INFO "Machine check events logged\n"); 361 if (time_after_eq(now, last_print + (check_interval*HZ))) {
362 last_print = now;
363 printk(KERN_INFO "Machine check events logged\n");
364 }
365 } else {
366 next_interval = min(next_interval*2, check_interval*HZ);
355 } 367 }
368
369 schedule_delayed_work(&mcheck_work, next_interval);
356} 370}
357 371
358 372
359static __init int periodic_mcheck_init(void) 373static __init int periodic_mcheck_init(void)
360{ 374{
361 if (check_interval) 375 next_interval = check_interval * HZ;
362 schedule_delayed_work(&mcheck_work, check_interval*HZ); 376 if (next_interval)
377 schedule_delayed_work(&mcheck_work, next_interval);
363 return 0; 378 return 0;
364} 379}
365__initcall(periodic_mcheck_init); 380__initcall(periodic_mcheck_init);
@@ -597,12 +612,13 @@ static int mce_resume(struct sys_device *dev)
597/* Reinit MCEs after user configuration changes */ 612/* Reinit MCEs after user configuration changes */
598static void mce_restart(void) 613static void mce_restart(void)
599{ 614{
600 if (check_interval) 615 if (next_interval)
601 cancel_delayed_work(&mcheck_work); 616 cancel_delayed_work(&mcheck_work);
602 /* Timer race is harmless here */ 617 /* Timer race is harmless here */
603 on_each_cpu(mce_init, NULL, 1, 1); 618 on_each_cpu(mce_init, NULL, 1, 1);
604 if (check_interval) 619 next_interval = check_interval * HZ;
605 schedule_delayed_work(&mcheck_work, check_interval*HZ); 620 if (next_interval)
621 schedule_delayed_work(&mcheck_work, next_interval);
606} 622}
607 623
608static struct sysdev_class mce_sysclass = { 624static struct sysdev_class mce_sysclass = {
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 455aa0b932f..d0dc4891599 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
300 } 300 }
301 } 301 }
302 } 302 }
303 clustered_apic_check(); 303 setup_apic_routing();
304 if (!num_processors) 304 if (!num_processors)
305 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 305 printk(KERN_ERR "MPTABLE: no processors registered!\n");
306 return num_processors; 306 return num_processors;
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index dfab9f16736..6cd2b30e2ff 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -27,28 +27,11 @@
27#include <asm/proto.h> 27#include <asm/proto.h>
28#include <asm/kdebug.h> 28#include <asm/kdebug.h>
29#include <asm/mce.h> 29#include <asm/mce.h>
30#include <asm/intel_arch_perfmon.h>
31 30
32int unknown_nmi_panic; 31int unknown_nmi_panic;
33int nmi_watchdog_enabled; 32int nmi_watchdog_enabled;
34int panic_on_unrecovered_nmi; 33int panic_on_unrecovered_nmi;
35 34
36/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
37 * evtsel_nmi_owner tracks the ownership of the event selection
38 * - different performance counters/ event selection may be reserved for
39 * different subsystems this reservation system just tries to coordinate
40 * things a little
41 */
42
43/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
44 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
45 */
46#define NMI_MAX_COUNTER_BITS 66
47#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)
48
49static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);
50static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]);
51
52static cpumask_t backtrace_mask = CPU_MASK_NONE; 35static cpumask_t backtrace_mask = CPU_MASK_NONE;
53 36
54/* nmi_active: 37/* nmi_active:
@@ -63,191 +46,11 @@ int panic_on_timeout;
63unsigned int nmi_watchdog = NMI_DEFAULT; 46unsigned int nmi_watchdog = NMI_DEFAULT;
64static unsigned int nmi_hz = HZ; 47static unsigned int nmi_hz = HZ;
65 48
66struct nmi_watchdog_ctlblk { 49static DEFINE_PER_CPU(short, wd_enabled);
67 int enabled;
68 u64 check_bit;
69 unsigned int cccr_msr;
70 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
71 unsigned int evntsel_msr; /* the MSR to select the events to handle */
72};
73static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
74 50
75/* local prototypes */ 51/* local prototypes */
76static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); 52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
77 53
78/* converts an msr to an appropriate reservation bit */
79static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
80{
81 /* returns the bit offset of the performance counter register */
82 switch (boot_cpu_data.x86_vendor) {
83 case X86_VENDOR_AMD:
84 return (msr - MSR_K7_PERFCTR0);
85 case X86_VENDOR_INTEL:
86 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
87 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
88 else
89 return (msr - MSR_P4_BPU_PERFCTR0);
90 }
91 return 0;
92}
93
94/* converts an msr to an appropriate reservation bit */
95static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
96{
97 /* returns the bit offset of the event selection register */
98 switch (boot_cpu_data.x86_vendor) {
99 case X86_VENDOR_AMD:
100 return (msr - MSR_K7_EVNTSEL0);
101 case X86_VENDOR_INTEL:
102 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
103 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
104 else
105 return (msr - MSR_P4_BSU_ESCR0);
106 }
107 return 0;
108}
109
110/* checks for a bit availability (hack for oprofile) */
111int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
112{
113 int cpu;
114 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
115 for_each_possible_cpu (cpu) {
116 if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)))
117 return 0;
118 }
119 return 1;
120}
121
122/* checks the an msr for availability */
123int avail_to_resrv_perfctr_nmi(unsigned int msr)
124{
125 unsigned int counter;
126 int cpu;
127
128 counter = nmi_perfctr_msr_to_bit(msr);
129 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
130
131 for_each_possible_cpu (cpu) {
132 if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)))
133 return 0;
134 }
135 return 1;
136}
137
138static int __reserve_perfctr_nmi(int cpu, unsigned int msr)
139{
140 unsigned int counter;
141 if (cpu < 0)
142 cpu = smp_processor_id();
143
144 counter = nmi_perfctr_msr_to_bit(msr);
145 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
146
147 if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)))
148 return 1;
149 return 0;
150}
151
152static void __release_perfctr_nmi(int cpu, unsigned int msr)
153{
154 unsigned int counter;
155 if (cpu < 0)
156 cpu = smp_processor_id();
157
158 counter = nmi_perfctr_msr_to_bit(msr);
159 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
160
161 clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu));
162}
163
164int reserve_perfctr_nmi(unsigned int msr)
165{
166 int cpu, i;
167 for_each_possible_cpu (cpu) {
168 if (!__reserve_perfctr_nmi(cpu, msr)) {
169 for_each_possible_cpu (i) {
170 if (i >= cpu)
171 break;
172 __release_perfctr_nmi(i, msr);
173 }
174 return 0;
175 }
176 }
177 return 1;
178}
179
180void release_perfctr_nmi(unsigned int msr)
181{
182 int cpu;
183 for_each_possible_cpu (cpu)
184 __release_perfctr_nmi(cpu, msr);
185}
186
187int __reserve_evntsel_nmi(int cpu, unsigned int msr)
188{
189 unsigned int counter;
190 if (cpu < 0)
191 cpu = smp_processor_id();
192
193 counter = nmi_evntsel_msr_to_bit(msr);
194 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
195
196 if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]))
197 return 1;
198 return 0;
199}
200
201static void __release_evntsel_nmi(int cpu, unsigned int msr)
202{
203 unsigned int counter;
204 if (cpu < 0)
205 cpu = smp_processor_id();
206
207 counter = nmi_evntsel_msr_to_bit(msr);
208 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
209
210 clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]);
211}
212
213int reserve_evntsel_nmi(unsigned int msr)
214{
215 int cpu, i;
216 for_each_possible_cpu (cpu) {
217 if (!__reserve_evntsel_nmi(cpu, msr)) {
218 for_each_possible_cpu (i) {
219 if (i >= cpu)
220 break;
221 __release_evntsel_nmi(i, msr);
222 }
223 return 0;
224 }
225 }
226 return 1;
227}
228
229void release_evntsel_nmi(unsigned int msr)
230{
231 int cpu;
232 for_each_possible_cpu (cpu) {
233 __release_evntsel_nmi(cpu, msr);
234 }
235}
236
237static __cpuinit inline int nmi_known_cpu(void)
238{
239 switch (boot_cpu_data.x86_vendor) {
240 case X86_VENDOR_AMD:
241 return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16;
242 case X86_VENDOR_INTEL:
243 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
244 return 1;
245 else
246 return (boot_cpu_data.x86 == 15);
247 }
248 return 0;
249}
250
251/* Run after command line and cpu_init init, but before all other checks */ 54/* Run after command line and cpu_init init, but before all other checks */
252void nmi_watchdog_default(void) 55void nmi_watchdog_default(void)
253{ 56{
@@ -277,23 +80,6 @@ static __init void nmi_cpu_busy(void *data)
277} 80}
278#endif 81#endif
279 82
280static unsigned int adjust_for_32bit_ctr(unsigned int hz)
281{
282 unsigned int retval = hz;
283
284 /*
285 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
286 * are writable, with higher bits sign extending from bit 31.
287 * So, we can only program the counter with 31 bit values and
288 * 32nd bit should be 1, for 33.. to be 1.
289 * Find the appropriate nmi_hz
290 */
291 if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) {
292 retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
293 }
294 return retval;
295}
296
297int __init check_nmi_watchdog (void) 83int __init check_nmi_watchdog (void)
298{ 84{
299 int *counts; 85 int *counts;
@@ -322,14 +108,14 @@ int __init check_nmi_watchdog (void)
322 mdelay((20*1000)/nmi_hz); // wait 20 ticks 108 mdelay((20*1000)/nmi_hz); // wait 20 ticks
323 109
324 for_each_online_cpu(cpu) { 110 for_each_online_cpu(cpu) {
325 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) 111 if (!per_cpu(wd_enabled, cpu))
326 continue; 112 continue;
327 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { 113 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
328 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 114 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
329 cpu, 115 cpu,
330 counts[cpu], 116 counts[cpu],
331 cpu_pda(cpu)->__nmi_count); 117 cpu_pda(cpu)->__nmi_count);
332 per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; 118 per_cpu(wd_enabled, cpu) = 0;
333 atomic_dec(&nmi_active); 119 atomic_dec(&nmi_active);
334 } 120 }
335 } 121 }
@@ -344,13 +130,8 @@ int __init check_nmi_watchdog (void)
344 130
345 /* now that we know it works we can reduce NMI frequency to 131 /* now that we know it works we can reduce NMI frequency to
346 something more reasonable; makes a difference in some configs */ 132 something more reasonable; makes a difference in some configs */
347 if (nmi_watchdog == NMI_LOCAL_APIC) { 133 if (nmi_watchdog == NMI_LOCAL_APIC)
348 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 134 nmi_hz = lapic_adjust_nmi_hz(1);
349
350 nmi_hz = 1;
351 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0)
352 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
353 }
354 135
355 kfree(counts); 136 kfree(counts);
356 return 0; 137 return 0;
@@ -379,57 +160,6 @@ int __init setup_nmi_watchdog(char *str)
379 160
380__setup("nmi_watchdog=", setup_nmi_watchdog); 161__setup("nmi_watchdog=", setup_nmi_watchdog);
381 162
382static void disable_lapic_nmi_watchdog(void)
383{
384 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
385
386 if (atomic_read(&nmi_active) <= 0)
387 return;
388
389 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
390
391 BUG_ON(atomic_read(&nmi_active) != 0);
392}
393
394static void enable_lapic_nmi_watchdog(void)
395{
396 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
397
398 /* are we already enabled */
399 if (atomic_read(&nmi_active) != 0)
400 return;
401
402 /* are we lapic aware */
403 if (nmi_known_cpu() <= 0)
404 return;
405
406 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
407 touch_nmi_watchdog();
408}
409
410void disable_timer_nmi_watchdog(void)
411{
412 BUG_ON(nmi_watchdog != NMI_IO_APIC);
413
414 if (atomic_read(&nmi_active) <= 0)
415 return;
416
417 disable_irq(0);
418 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
419
420 BUG_ON(atomic_read(&nmi_active) != 0);
421}
422
423void enable_timer_nmi_watchdog(void)
424{
425 BUG_ON(nmi_watchdog != NMI_IO_APIC);
426
427 if (atomic_read(&nmi_active) == 0) {
428 touch_nmi_watchdog();
429 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
430 enable_irq(0);
431 }
432}
433 163
434static void __acpi_nmi_disable(void *__unused) 164static void __acpi_nmi_disable(void *__unused)
435{ 165{
@@ -515,275 +245,9 @@ late_initcall(init_lapic_nmi_sysfs);
515 245
516#endif /* CONFIG_PM */ 246#endif /* CONFIG_PM */
517 247
518/*
519 * Activate the NMI watchdog via the local APIC.
520 * Original code written by Keith Owens.
521 */
522
523/* Note that these events don't tick when the CPU idles. This means
524 the frequency varies with CPU load. */
525
526#define K7_EVNTSEL_ENABLE (1 << 22)
527#define K7_EVNTSEL_INT (1 << 20)
528#define K7_EVNTSEL_OS (1 << 17)
529#define K7_EVNTSEL_USR (1 << 16)
530#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
531#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
532
533static int setup_k7_watchdog(void)
534{
535 unsigned int perfctr_msr, evntsel_msr;
536 unsigned int evntsel;
537 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
538
539 perfctr_msr = MSR_K7_PERFCTR0;
540 evntsel_msr = MSR_K7_EVNTSEL0;
541 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
542 goto fail;
543
544 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
545 goto fail1;
546
547 /* Simulator may not support it */
548 if (checking_wrmsrl(evntsel_msr, 0UL))
549 goto fail2;
550 wrmsrl(perfctr_msr, 0UL);
551
552 evntsel = K7_EVNTSEL_INT
553 | K7_EVNTSEL_OS
554 | K7_EVNTSEL_USR
555 | K7_NMI_EVENT;
556
557 /* setup the timer */
558 wrmsr(evntsel_msr, evntsel, 0);
559 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
560 apic_write(APIC_LVTPC, APIC_DM_NMI);
561 evntsel |= K7_EVNTSEL_ENABLE;
562 wrmsr(evntsel_msr, evntsel, 0);
563
564 wd->perfctr_msr = perfctr_msr;
565 wd->evntsel_msr = evntsel_msr;
566 wd->cccr_msr = 0; //unused
567 wd->check_bit = 1ULL<<63;
568 return 1;
569fail2:
570 __release_evntsel_nmi(-1, evntsel_msr);
571fail1:
572 __release_perfctr_nmi(-1, perfctr_msr);
573fail:
574 return 0;
575}
576
577static void stop_k7_watchdog(void)
578{
579 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
580
581 wrmsr(wd->evntsel_msr, 0, 0);
582
583 __release_evntsel_nmi(-1, wd->evntsel_msr);
584 __release_perfctr_nmi(-1, wd->perfctr_msr);
585}
586
587/* Note that these events don't tick when the CPU idles. This means
588 the frequency varies with CPU load. */
589
590#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
591#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
592#define P4_ESCR_OS (1<<3)
593#define P4_ESCR_USR (1<<2)
594#define P4_CCCR_OVF_PMI0 (1<<26)
595#define P4_CCCR_OVF_PMI1 (1<<27)
596#define P4_CCCR_THRESHOLD(N) ((N)<<20)
597#define P4_CCCR_COMPLEMENT (1<<19)
598#define P4_CCCR_COMPARE (1<<18)
599#define P4_CCCR_REQUIRED (3<<16)
600#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
601#define P4_CCCR_ENABLE (1<<12)
602#define P4_CCCR_OVF (1<<31)
603/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
604 CRU_ESCR0 (with any non-null event selector) through a complemented
605 max threshold. [IA32-Vol3, Section 14.9.9] */
606
607static int setup_p4_watchdog(void)
608{
609 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
610 unsigned int evntsel, cccr_val;
611 unsigned int misc_enable, dummy;
612 unsigned int ht_num;
613 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
614
615 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
616 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
617 return 0;
618
619#ifdef CONFIG_SMP
620 /* detect which hyperthread we are on */
621 if (smp_num_siblings == 2) {
622 unsigned int ebx, apicid;
623
624 ebx = cpuid_ebx(1);
625 apicid = (ebx >> 24) & 0xff;
626 ht_num = apicid & 1;
627 } else
628#endif
629 ht_num = 0;
630
631 /* performance counters are shared resources
632 * assign each hyperthread its own set
633 * (re-use the ESCR0 register, seems safe
634 * and keeps the cccr_val the same)
635 */
636 if (!ht_num) {
637 /* logical cpu 0 */
638 perfctr_msr = MSR_P4_IQ_PERFCTR0;
639 evntsel_msr = MSR_P4_CRU_ESCR0;
640 cccr_msr = MSR_P4_IQ_CCCR0;
641 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
642 } else {
643 /* logical cpu 1 */
644 perfctr_msr = MSR_P4_IQ_PERFCTR1;
645 evntsel_msr = MSR_P4_CRU_ESCR0;
646 cccr_msr = MSR_P4_IQ_CCCR1;
647 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
648 }
649
650 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
651 goto fail;
652
653 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
654 goto fail1;
655
656 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
657 | P4_ESCR_OS
658 | P4_ESCR_USR;
659
660 cccr_val |= P4_CCCR_THRESHOLD(15)
661 | P4_CCCR_COMPLEMENT
662 | P4_CCCR_COMPARE
663 | P4_CCCR_REQUIRED;
664
665 wrmsr(evntsel_msr, evntsel, 0);
666 wrmsr(cccr_msr, cccr_val, 0);
667 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
668 apic_write(APIC_LVTPC, APIC_DM_NMI);
669 cccr_val |= P4_CCCR_ENABLE;
670 wrmsr(cccr_msr, cccr_val, 0);
671
672 wd->perfctr_msr = perfctr_msr;
673 wd->evntsel_msr = evntsel_msr;
674 wd->cccr_msr = cccr_msr;
675 wd->check_bit = 1ULL<<39;
676 return 1;
677fail1:
678 __release_perfctr_nmi(-1, perfctr_msr);
679fail:
680 return 0;
681}
682
683static void stop_p4_watchdog(void)
684{
685 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
686
687 wrmsr(wd->cccr_msr, 0, 0);
688 wrmsr(wd->evntsel_msr, 0, 0);
689
690 __release_evntsel_nmi(-1, wd->evntsel_msr);
691 __release_perfctr_nmi(-1, wd->perfctr_msr);
692}
693
694#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
695#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
696
697static int setup_intel_arch_watchdog(void)
698{
699 unsigned int ebx;
700 union cpuid10_eax eax;
701 unsigned int unused;
702 unsigned int perfctr_msr, evntsel_msr;
703 unsigned int evntsel;
704 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
705
706 /*
707 * Check whether the Architectural PerfMon supports
708 * Unhalted Core Cycles Event or not.
709 * NOTE: Corresponding bit = 0 in ebx indicates event present.
710 */
711 cpuid(10, &(eax.full), &ebx, &unused, &unused);
712 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
713 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
714 goto fail;
715
716 perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
717 evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
718
719 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
720 goto fail;
721
722 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
723 goto fail1;
724
725 wrmsrl(perfctr_msr, 0UL);
726
727 evntsel = ARCH_PERFMON_EVENTSEL_INT
728 | ARCH_PERFMON_EVENTSEL_OS
729 | ARCH_PERFMON_EVENTSEL_USR
730 | ARCH_PERFMON_NMI_EVENT_SEL
731 | ARCH_PERFMON_NMI_EVENT_UMASK;
732
733 /* setup the timer */
734 wrmsr(evntsel_msr, evntsel, 0);
735
736 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
737 wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
738
739 apic_write(APIC_LVTPC, APIC_DM_NMI);
740 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
741 wrmsr(evntsel_msr, evntsel, 0);
742
743 wd->perfctr_msr = perfctr_msr;
744 wd->evntsel_msr = evntsel_msr;
745 wd->cccr_msr = 0; //unused
746 wd->check_bit = 1ULL << (eax.split.bit_width - 1);
747 return 1;
748fail1:
749 __release_perfctr_nmi(-1, perfctr_msr);
750fail:
751 return 0;
752}
753
754static void stop_intel_arch_watchdog(void)
755{
756 unsigned int ebx;
757 union cpuid10_eax eax;
758 unsigned int unused;
759 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
760
761 /*
762 * Check whether the Architectural PerfMon supports
763 * Unhalted Core Cycles Event or not.
764 * NOTE: Corresponding bit = 0 in ebx indicates event present.
765 */
766 cpuid(10, &(eax.full), &ebx, &unused, &unused);
767 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
768 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
769 return;
770
771 wrmsr(wd->evntsel_msr, 0, 0);
772
773 __release_evntsel_nmi(-1, wd->evntsel_msr);
774 __release_perfctr_nmi(-1, wd->perfctr_msr);
775}
776
777void setup_apic_nmi_watchdog(void *unused) 248void setup_apic_nmi_watchdog(void *unused)
778{ 249{
779 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 250 if (__get_cpu_var(wd_enabled) == 1)
780
781 /* only support LOCAL and IO APICs for now */
782 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
783 (nmi_watchdog != NMI_IO_APIC))
784 return;
785
786 if (wd->enabled == 1)
787 return; 251 return;
788 252
789 /* cheap hack to support suspend/resume */ 253 /* cheap hack to support suspend/resume */
@@ -791,62 +255,31 @@ void setup_apic_nmi_watchdog(void *unused)
791 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) 255 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
792 return; 256 return;
793 257
794 if (nmi_watchdog == NMI_LOCAL_APIC) { 258 switch (nmi_watchdog) {
795 switch (boot_cpu_data.x86_vendor) { 259 case NMI_LOCAL_APIC:
796 case X86_VENDOR_AMD: 260 __get_cpu_var(wd_enabled) = 1;
797 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) 261 if (lapic_watchdog_init(nmi_hz) < 0) {
798 return; 262 __get_cpu_var(wd_enabled) = 0;
799 if (!setup_k7_watchdog())
800 return;
801 break;
802 case X86_VENDOR_INTEL:
803 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
804 if (!setup_intel_arch_watchdog())
805 return;
806 break;
807 }
808 if (!setup_p4_watchdog())
809 return;
810 break;
811 default:
812 return; 263 return;
813 } 264 }
265 /* FALL THROUGH */
266 case NMI_IO_APIC:
267 __get_cpu_var(wd_enabled) = 1;
268 atomic_inc(&nmi_active);
814 } 269 }
815 wd->enabled = 1;
816 atomic_inc(&nmi_active);
817} 270}
818 271
819void stop_apic_nmi_watchdog(void *unused) 272void stop_apic_nmi_watchdog(void *unused)
820{ 273{
821 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
822
823 /* only support LOCAL and IO APICs for now */ 274 /* only support LOCAL and IO APICs for now */
824 if ((nmi_watchdog != NMI_LOCAL_APIC) && 275 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
825 (nmi_watchdog != NMI_IO_APIC)) 276 (nmi_watchdog != NMI_IO_APIC))
826 return; 277 return;
827 278 if (__get_cpu_var(wd_enabled) == 0)
828 if (wd->enabled == 0)
829 return; 279 return;
830 280 if (nmi_watchdog == NMI_LOCAL_APIC)
831 if (nmi_watchdog == NMI_LOCAL_APIC) { 281 lapic_watchdog_stop();
832 switch (boot_cpu_data.x86_vendor) { 282 __get_cpu_var(wd_enabled) = 0;
833 case X86_VENDOR_AMD:
834 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
835 return;
836 stop_k7_watchdog();
837 break;
838 case X86_VENDOR_INTEL:
839 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
840 stop_intel_arch_watchdog();
841 break;
842 }
843 stop_p4_watchdog();
844 break;
845 default:
846 return;
847 }
848 }
849 wd->enabled = 0;
850 atomic_dec(&nmi_active); 283 atomic_dec(&nmi_active);
851} 284}
852 285
@@ -885,9 +318,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
885 int sum; 318 int sum;
886 int touched = 0; 319 int touched = 0;
887 int cpu = smp_processor_id(); 320 int cpu = smp_processor_id();
888 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 321 int rc = 0;
889 u64 dummy;
890 int rc=0;
891 322
892 /* check for other users first */ 323 /* check for other users first */
893 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 324 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
@@ -934,55 +365,20 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
934 } 365 }
935 366
936 /* see if the nmi watchdog went off */ 367 /* see if the nmi watchdog went off */
937 if (wd->enabled) { 368 if (!__get_cpu_var(wd_enabled))
938 if (nmi_watchdog == NMI_LOCAL_APIC) { 369 return rc;
939 rdmsrl(wd->perfctr_msr, dummy); 370 switch (nmi_watchdog) {
940 if (dummy & wd->check_bit){ 371 case NMI_LOCAL_APIC:
941 /* this wasn't a watchdog timer interrupt */ 372 rc |= lapic_wd_event(nmi_hz);
942 goto done; 373 break;
943 } 374 case NMI_IO_APIC:
944 375 /* don't know how to accurately check for this.
945 /* only Intel uses the cccr msr */ 376 * just assume it was a watchdog timer interrupt
946 if (wd->cccr_msr != 0) { 377 * This matches the old behaviour.
947 /* 378 */
948 * P4 quirks: 379 rc = 1;
949 * - An overflown perfctr will assert its interrupt 380 break;
950 * until the OVF flag in its CCCR is cleared.
951 * - LVTPC is masked on interrupt and must be
952 * unmasked by the LVTPC handler.
953 */
954 rdmsrl(wd->cccr_msr, dummy);
955 dummy &= ~P4_CCCR_OVF;
956 wrmsrl(wd->cccr_msr, dummy);
957 apic_write(APIC_LVTPC, APIC_DM_NMI);
958 /* start the cycle over again */
959 wrmsrl(wd->perfctr_msr,
960 -((u64)cpu_khz * 1000 / nmi_hz));
961 } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
962 /*
963 * ArchPerfom/Core Duo needs to re-unmask
964 * the apic vector
965 */
966 apic_write(APIC_LVTPC, APIC_DM_NMI);
967 /* ARCH_PERFMON has 32 bit counter writes */
968 wrmsr(wd->perfctr_msr,
969 (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
970 } else {
971 /* start the cycle over again */
972 wrmsrl(wd->perfctr_msr,
973 -((u64)cpu_khz * 1000 / nmi_hz));
974 }
975 rc = 1;
976 } else if (nmi_watchdog == NMI_IO_APIC) {
977 /* don't know how to accurately check for this.
978 * just assume it was a watchdog timer interrupt
979 * This matches the old behaviour.
980 */
981 rc = 1;
982 } else
983 printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
984 } 381 }
985done:
986 return rc; 382 return rc;
987} 383}
988 384
@@ -1067,12 +463,4 @@ void __trigger_all_cpu_backtrace(void)
1067 463
1068EXPORT_SYMBOL(nmi_active); 464EXPORT_SYMBOL(nmi_active);
1069EXPORT_SYMBOL(nmi_watchdog); 465EXPORT_SYMBOL(nmi_watchdog);
1070EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1071EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1072EXPORT_SYMBOL(reserve_perfctr_nmi);
1073EXPORT_SYMBOL(release_perfctr_nmi);
1074EXPORT_SYMBOL(reserve_evntsel_nmi);
1075EXPORT_SYMBOL(release_evntsel_nmi);
1076EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1077EXPORT_SYMBOL(enable_timer_nmi_watchdog);
1078EXPORT_SYMBOL(touch_nmi_watchdog); 466EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 04480c3b68f..5bd20b542c1 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -507,7 +507,7 @@ error:
507 return ret; 507 return ret;
508} 508}
509 509
510static struct dma_mapping_ops calgary_dma_ops = { 510static const struct dma_mapping_ops calgary_dma_ops = {
511 .alloc_coherent = calgary_alloc_coherent, 511 .alloc_coherent = calgary_alloc_coherent,
512 .map_single = calgary_map_single, 512 .map_single = calgary_map_single,
513 .unmap_single = calgary_unmap_single, 513 .unmap_single = calgary_unmap_single,
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 0bae862e9a5..0a762e10f2b 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -556,7 +556,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
556 556
557extern int agp_amd64_init(void); 557extern int agp_amd64_init(void);
558 558
559static struct dma_mapping_ops gart_dma_ops = { 559static const struct dma_mapping_ops gart_dma_ops = {
560 .mapping_error = NULL, 560 .mapping_error = NULL,
561 .map_single = gart_map_single, 561 .map_single = gart_map_single,
562 .map_simple = gart_map_simple, 562 .map_simple = gart_map_simple,
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index df09ab05a1b..6dade0c867c 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -79,7 +79,7 @@ void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
79{ 79{
80} 80}
81 81
82struct dma_mapping_ops nommu_dma_ops = { 82const struct dma_mapping_ops nommu_dma_ops = {
83 .map_single = nommu_map_single, 83 .map_single = nommu_map_single,
84 .unmap_single = nommu_unmap_single, 84 .unmap_single = nommu_unmap_single,
85 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
index eb18be5a656..4b4569abc60 100644
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -12,7 +12,7 @@
12int swiotlb __read_mostly; 12int swiotlb __read_mostly;
13EXPORT_SYMBOL(swiotlb); 13EXPORT_SYMBOL(swiotlb);
14 14
15struct dma_mapping_ops swiotlb_dma_ops = { 15const struct dma_mapping_ops swiotlb_dma_ops = {
16 .mapping_error = swiotlb_dma_mapping_error, 16 .mapping_error = swiotlb_dma_mapping_error,
17 .alloc_coherent = swiotlb_alloc_coherent, 17 .alloc_coherent = swiotlb_alloc_coherent,
18 .free_coherent = swiotlb_free_coherent, 18 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index d8d5ccc245c..4f21765078b 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -288,16 +288,18 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
288 288
289static int __init idle_setup (char *str) 289static int __init idle_setup (char *str)
290{ 290{
291 if (!strncmp(str, "poll", 4)) { 291 if (!strcmp(str, "poll")) {
292 printk("using polling idle threads.\n"); 292 printk("using polling idle threads.\n");
293 pm_idle = poll_idle; 293 pm_idle = poll_idle;
294 } 294 } else if (!strcmp(str, "mwait"))
295 force_mwait = 1;
296 else
297 return -1;
295 298
296 boot_option_idle_override = 1; 299 boot_option_idle_override = 1;
297 return 1; 300 return 0;
298} 301}
299 302early_param("idle", idle_setup);
300__setup("idle=", idle_setup);
301 303
302/* Prints also some state that isn't saved in the pt_regs */ 304/* Prints also some state that isn't saved in the pt_regs */
303void __show_regs(struct pt_regs * regs) 305void __show_regs(struct pt_regs * regs)
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 3d98b696881..db30b5bcef6 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -79,6 +79,8 @@ int bootloader_type;
79 79
80unsigned long saved_video_mode; 80unsigned long saved_video_mode;
81 81
82int force_mwait __cpuinitdata;
83
82/* 84/*
83 * Early DMI memory 85 * Early DMI memory
84 */ 86 */
@@ -205,10 +207,10 @@ static void discover_ebda(void)
205 * there is a real-mode segmented pointer pointing to the 207 * there is a real-mode segmented pointer pointing to the
206 * 4K EBDA area at 0x40E 208 * 4K EBDA area at 0x40E
207 */ 209 */
208 ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; 210 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
209 ebda_addr <<= 4; 211 ebda_addr <<= 4;
210 212
211 ebda_size = *(unsigned short *)(unsigned long)ebda_addr; 213 ebda_size = *(unsigned short *)__va(ebda_addr);
212 214
213 /* Round EBDA up to pages */ 215 /* Round EBDA up to pages */
214 if (ebda_size == 0) 216 if (ebda_size == 0)
@@ -243,11 +245,12 @@ void __init setup_arch(char **cmdline_p)
243 init_mm.end_code = (unsigned long) &_etext; 245 init_mm.end_code = (unsigned long) &_etext;
244 init_mm.end_data = (unsigned long) &_edata; 246 init_mm.end_data = (unsigned long) &_edata;
245 init_mm.brk = (unsigned long) &_end; 247 init_mm.brk = (unsigned long) &_end;
248 init_mm.pgd = __va(__pa_symbol(&init_level4_pgt));
246 249
247 code_resource.start = virt_to_phys(&_text); 250 code_resource.start = __pa_symbol(&_text);
248 code_resource.end = virt_to_phys(&_etext)-1; 251 code_resource.end = __pa_symbol(&_etext)-1;
249 data_resource.start = virt_to_phys(&_etext); 252 data_resource.start = __pa_symbol(&_etext);
250 data_resource.end = virt_to_phys(&_edata)-1; 253 data_resource.end = __pa_symbol(&_edata)-1;
251 254
252 early_identify_cpu(&boot_cpu_data); 255 early_identify_cpu(&boot_cpu_data);
253 256
@@ -274,8 +277,6 @@ void __init setup_arch(char **cmdline_p)
274 277
275 dmi_scan_machine(); 278 dmi_scan_machine();
276 279
277 zap_low_mappings(0);
278
279#ifdef CONFIG_ACPI 280#ifdef CONFIG_ACPI
280 /* 281 /*
281 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). 282 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
@@ -329,15 +330,8 @@ void __init setup_arch(char **cmdline_p)
329#endif 330#endif
330 331
331#ifdef CONFIG_SMP 332#ifdef CONFIG_SMP
332 /*
333 * But first pinch a few for the stack/trampoline stuff
334 * FIXME: Don't need the extra page at 4K, but need to fix
335 * trampoline before removing it. (see the GDT stuff)
336 */
337 reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
338
339 /* Reserve SMP trampoline */ 333 /* Reserve SMP trampoline */
340 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); 334 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
341#endif 335#endif
342 336
343#ifdef CONFIG_ACPI_SLEEP 337#ifdef CONFIG_ACPI_SLEEP
@@ -612,6 +606,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
612 606
613 /* RDTSC can be speculated around */ 607 /* RDTSC can be speculated around */
614 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 608 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
609
610 /* Family 10 doesn't support C states in MWAIT so don't use it */
611 if (c->x86 == 0x10 && !force_mwait)
612 clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
615} 613}
616 614
617static void __cpuinit detect_ht(struct cpuinfo_x86 *c) 615static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -987,9 +985,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
987 "stc", 985 "stc",
988 "100mhzsteps", 986 "100mhzsteps",
989 "hwpstate", 987 "hwpstate",
990 NULL, /* tsc invariant mapped to constant_tsc */ 988 "", /* tsc invariant mapped to constant_tsc */
991 NULL, 989 /* nothing */
992 /* nothing */ /* constant_tsc - moved to flags */
993 }; 990 };
994 991
995 992
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 6a70b55f719..64379a80d76 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -103,9 +103,9 @@ void __init setup_per_cpu_areas(void)
103 if (!NODE_DATA(cpu_to_node(i))) { 103 if (!NODE_DATA(cpu_to_node(i))) {
104 printk("cpu with no node %d, num_online_nodes %d\n", 104 printk("cpu with no node %d, num_online_nodes %d\n",
105 i, num_online_nodes()); 105 i, num_online_nodes());
106 ptr = alloc_bootmem(size); 106 ptr = alloc_bootmem_pages(size);
107 } else { 107 } else {
108 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); 108 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
109 } 109 }
110 if (!ptr) 110 if (!ptr)
111 panic("Cannot allocate cpu data for CPU %d\n", i); 111 panic("Cannot allocate cpu data for CPU %d\n", i);
@@ -201,7 +201,6 @@ void __cpuinit cpu_init (void)
201 /* CPU 0 is initialised in head64.c */ 201 /* CPU 0 is initialised in head64.c */
202 if (cpu != 0) { 202 if (cpu != 0) {
203 pda_init(cpu); 203 pda_init(cpu);
204 zap_low_mappings(cpu);
205 } else 204 } else
206 estacks = boot_exception_stacks; 205 estacks = boot_exception_stacks;
207 206
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 49ec324cd14..c819625f331 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -141,7 +141,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
141 goto badframe; 141 goto badframe;
142 142
143#ifdef DEBUG_SIG 143#ifdef DEBUG_SIG
144 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); 144 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
145#endif 145#endif
146 146
147 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) 147 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
@@ -301,7 +301,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
301 if (test_thread_flag(TIF_SINGLESTEP)) 301 if (test_thread_flag(TIF_SINGLESTEP))
302 ptrace_notify(SIGTRAP); 302 ptrace_notify(SIGTRAP);
303#ifdef DEBUG_SIG 303#ifdef DEBUG_SIG
304 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 304 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
305 current->comm, current->pid, frame, regs->rip, frame->pretcode); 305 current->comm, current->pid, frame, regs->rip, frame->pretcode);
306#endif 306#endif
307 307
@@ -463,7 +463,7 @@ void
463do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 463do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
464{ 464{
465#ifdef DEBUG_SIG 465#ifdef DEBUG_SIG
466 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", 466 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
467 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 467 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
468#endif 468#endif
469 469
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index af1ec4d23cf..22abae4e9f3 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -76,7 +76,7 @@ static inline void leave_mm(int cpu)
76 if (read_pda(mmu_state) == TLBSTATE_OK) 76 if (read_pda(mmu_state) == TLBSTATE_OK)
77 BUG(); 77 BUG();
78 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); 78 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
79 load_cr3(swapper_pg_dir); 79 load_cr3(init_mm.pgd);
80} 80}
81 81
82/* 82/*
@@ -452,42 +452,34 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
452} 452}
453EXPORT_SYMBOL(smp_call_function); 453EXPORT_SYMBOL(smp_call_function);
454 454
455void smp_stop_cpu(void) 455static void stop_this_cpu(void *dummy)
456{ 456{
457 unsigned long flags; 457 local_irq_disable();
458 /* 458 /*
459 * Remove this CPU: 459 * Remove this CPU:
460 */ 460 */
461 cpu_clear(smp_processor_id(), cpu_online_map); 461 cpu_clear(smp_processor_id(), cpu_online_map);
462 local_irq_save(flags);
463 disable_local_APIC(); 462 disable_local_APIC();
464 local_irq_restore(flags);
465}
466
467static void smp_really_stop_cpu(void *dummy)
468{
469 smp_stop_cpu();
470 for (;;) 463 for (;;)
471 halt(); 464 halt();
472} 465}
473 466
474void smp_send_stop(void) 467void smp_send_stop(void)
475{ 468{
476 int nolock = 0; 469 int nolock;
470 unsigned long flags;
471
477 if (reboot_force) 472 if (reboot_force)
478 return; 473 return;
474
479 /* Don't deadlock on the call lock in panic */ 475 /* Don't deadlock on the call lock in panic */
480 if (!spin_trylock(&call_lock)) { 476 nolock = !spin_trylock(&call_lock);
481 /* ignore locking because we have panicked anyways */ 477 local_irq_save(flags);
482 nolock = 1; 478 __smp_call_function(stop_this_cpu, NULL, 0, 0);
483 }
484 __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
485 if (!nolock) 479 if (!nolock)
486 spin_unlock(&call_lock); 480 spin_unlock(&call_lock);
487
488 local_irq_disable();
489 disable_local_APIC(); 481 disable_local_APIC();
490 local_irq_enable(); 482 local_irq_restore(flags);
491} 483}
492 484
493/* 485/*
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index cd4643a3702..4d9dacfae57 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -60,7 +60,6 @@
60#include <asm/irq.h> 60#include <asm/irq.h>
61#include <asm/hw_irq.h> 61#include <asm/hw_irq.h>
62#include <asm/numa.h> 62#include <asm/numa.h>
63#include <asm/genapic.h>
64 63
65/* Number of siblings per CPU package */ 64/* Number of siblings per CPU package */
66int smp_num_siblings = 1; 65int smp_num_siblings = 1;
@@ -68,7 +67,6 @@ EXPORT_SYMBOL(smp_num_siblings);
68 67
69/* Last level cache ID of each logical CPU */ 68/* Last level cache ID of each logical CPU */
70u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; 69u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
71EXPORT_SYMBOL(cpu_llc_id);
72 70
73/* Bitmask of currently online CPUs */ 71/* Bitmask of currently online CPUs */
74cpumask_t cpu_online_map __read_mostly; 72cpumask_t cpu_online_map __read_mostly;
@@ -392,7 +390,8 @@ static void inquire_remote_apic(int apicid)
392{ 390{
393 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 391 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
394 char *names[] = { "ID", "VERSION", "SPIV" }; 392 char *names[] = { "ID", "VERSION", "SPIV" };
395 int timeout, status; 393 int timeout;
394 unsigned int status;
396 395
397 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 396 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
398 397
@@ -402,7 +401,9 @@ static void inquire_remote_apic(int apicid)
402 /* 401 /*
403 * Wait for idle. 402 * Wait for idle.
404 */ 403 */
405 apic_wait_icr_idle(); 404 status = safe_apic_wait_icr_idle();
405 if (status)
406 printk("a previous APIC delivery may have failed\n");
406 407
407 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 408 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
408 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); 409 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -430,8 +431,8 @@ static void inquire_remote_apic(int apicid)
430 */ 431 */
431static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) 432static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
432{ 433{
433 unsigned long send_status = 0, accept_status = 0; 434 unsigned long send_status, accept_status = 0;
434 int maxlvt, timeout, num_starts, j; 435 int maxlvt, num_starts, j;
435 436
436 Dprintk("Asserting INIT.\n"); 437 Dprintk("Asserting INIT.\n");
437 438
@@ -447,12 +448,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
447 | APIC_DM_INIT); 448 | APIC_DM_INIT);
448 449
449 Dprintk("Waiting for send to finish...\n"); 450 Dprintk("Waiting for send to finish...\n");
450 timeout = 0; 451 send_status = safe_apic_wait_icr_idle();
451 do {
452 Dprintk("+");
453 udelay(100);
454 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
455 } while (send_status && (timeout++ < 1000));
456 452
457 mdelay(10); 453 mdelay(10);
458 454
@@ -465,12 +461,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
465 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 461 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
466 462
467 Dprintk("Waiting for send to finish...\n"); 463 Dprintk("Waiting for send to finish...\n");
468 timeout = 0; 464 send_status = safe_apic_wait_icr_idle();
469 do {
470 Dprintk("+");
471 udelay(100);
472 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
473 } while (send_status && (timeout++ < 1000));
474 465
475 mb(); 466 mb();
476 atomic_set(&init_deasserted, 1); 467 atomic_set(&init_deasserted, 1);
@@ -509,12 +500,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
509 Dprintk("Startup point 1.\n"); 500 Dprintk("Startup point 1.\n");
510 501
511 Dprintk("Waiting for send to finish...\n"); 502 Dprintk("Waiting for send to finish...\n");
512 timeout = 0; 503 send_status = safe_apic_wait_icr_idle();
513 do {
514 Dprintk("+");
515 udelay(100);
516 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
517 } while (send_status && (timeout++ < 1000));
518 504
519 /* 505 /*
520 * Give the other CPU some time to accept the IPI. 506 * Give the other CPU some time to accept the IPI.
@@ -945,6 +931,12 @@ int __cpuinit __cpu_up(unsigned int cpu)
945 return -ENOSYS; 931 return -ENOSYS;
946 } 932 }
947 933
934 /*
935 * Save current MTRR state in case it was changed since early boot
936 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
937 */
938 mtrr_save_state();
939
948 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 940 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
949 /* Boot it! */ 941 /* Boot it! */
950 err = do_boot_cpu(cpu, apicid); 942 err = do_boot_cpu(cpu, apicid);
@@ -965,13 +957,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
965 957
966 while (!cpu_isset(cpu, cpu_online_map)) 958 while (!cpu_isset(cpu, cpu_online_map))
967 cpu_relax(); 959 cpu_relax();
968
969 if (num_online_cpus() > 8 && genapic == &apic_flat) {
970 printk(KERN_WARNING
971 "flat APIC routing can't be used with > 8 cpus\n");
972 BUG();
973 }
974
975 err = 0; 960 err = 0;
976 961
977 return err; 962 return err;
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index 91f7e678bae..6a5a98f2a75 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -12,6 +12,10 @@
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/page.h> 13#include <asm/page.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/mtrr.h>
16
17/* References to section boundaries */
18extern const void __nosave_begin, __nosave_end;
15 19
16struct saved_context saved_context; 20struct saved_context saved_context;
17 21
@@ -33,7 +37,6 @@ void __save_processor_state(struct saved_context *ctxt)
33 asm volatile ("str %0" : "=m" (ctxt->tr)); 37 asm volatile ("str %0" : "=m" (ctxt->tr));
34 38
35 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ 39 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
36 /* EFER should be constant for kernel version, no need to handle it. */
37 /* 40 /*
38 * segment registers 41 * segment registers
39 */ 42 */
@@ -46,10 +49,12 @@ void __save_processor_state(struct saved_context *ctxt)
46 rdmsrl(MSR_FS_BASE, ctxt->fs_base); 49 rdmsrl(MSR_FS_BASE, ctxt->fs_base);
47 rdmsrl(MSR_GS_BASE, ctxt->gs_base); 50 rdmsrl(MSR_GS_BASE, ctxt->gs_base);
48 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 51 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
52 mtrr_save_fixed_ranges(NULL);
49 53
50 /* 54 /*
51 * control registers 55 * control registers
52 */ 56 */
57 rdmsrl(MSR_EFER, ctxt->efer);
53 asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); 58 asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0));
54 asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); 59 asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
55 asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); 60 asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
@@ -75,6 +80,7 @@ void __restore_processor_state(struct saved_context *ctxt)
75 /* 80 /*
76 * control registers 81 * control registers
77 */ 82 */
83 wrmsrl(MSR_EFER, ctxt->efer);
78 asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); 84 asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
79 asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); 85 asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
80 asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); 86 asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
@@ -219,4 +225,15 @@ int swsusp_arch_resume(void)
219 restore_image(); 225 restore_image();
220 return 0; 226 return 0;
221} 227}
228
229/*
230 * pfn_is_nosave - check if given pfn is in the 'nosave' section
231 */
232
233int pfn_is_nosave(unsigned long pfn)
234{
235 unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
236 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
237 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
238}
222#endif /* CONFIG_SOFTWARE_SUSPEND */ 239#endif /* CONFIG_SOFTWARE_SUSPEND */
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index bfbe00763c6..16d183f67bc 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -71,9 +71,10 @@ loop:
71 jmp loop 71 jmp loop
72done: 72done:
73 /* go back to the original page tables */ 73 /* go back to the original page tables */
74 leaq init_level4_pgt(%rip), %rax 74 movq $(init_level4_pgt - __START_KERNEL_map), %rax
75 subq $__START_KERNEL_map, %rax 75 addq phys_base(%rip), %rax
76 movq %rax, %cr3 76 movq %rax, %cr3
77
77 /* Flush TLB, including "global" things (vmalloc) */ 78 /* Flush TLB, including "global" things (vmalloc) */
78 movq mmu_cr4_features(%rip), %rax 79 movq mmu_cr4_features(%rip), %rax
79 movq %rax, %rdx 80 movq %rax, %rdx
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
index 213fd6ab789..63d592c276c 100644
--- a/arch/x86_64/kernel/syscall.c
+++ b/arch/x86_64/kernel/syscall.c
@@ -3,6 +3,7 @@
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <linux/sys.h> 4#include <linux/sys.h>
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
6 7
7#define __NO_STUBS 8#define __NO_STUBS
8 9
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 75d73a9aa9f..0652e173813 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -39,13 +39,11 @@
39#include <asm/proto.h> 39#include <asm/proto.h>
40#include <asm/hpet.h> 40#include <asm/hpet.h>
41#include <asm/sections.h> 41#include <asm/sections.h>
42#include <linux/cpufreq.h>
43#include <linux/hpet.h> 42#include <linux/hpet.h>
44#include <asm/apic.h> 43#include <asm/apic.h>
45#include <asm/hpet.h> 44#include <asm/hpet.h>
46 45#include <asm/mpspec.h>
47extern void i8254_timer_resume(void); 46#include <asm/nmi.h>
48extern int using_apic_timer;
49 47
50static char *timename = NULL; 48static char *timename = NULL;
51 49
@@ -252,6 +250,51 @@ static unsigned long get_cmos_time(void)
252 return mktime(year, mon, day, hour, min, sec); 250 return mktime(year, mon, day, hour, min, sec);
253} 251}
254 252
253/* calibrate_cpu is used on systems with fixed rate TSCs to determine
254 * processor frequency */
255#define TICK_COUNT 100000000
256static unsigned int __init tsc_calibrate_cpu_khz(void)
257{
258 int tsc_start, tsc_now;
259 int i, no_ctr_free;
260 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
261 unsigned long flags;
262
263 for (i = 0; i < 4; i++)
264 if (avail_to_resrv_perfctr_nmi_bit(i))
265 break;
266 no_ctr_free = (i == 4);
267 if (no_ctr_free) {
268 i = 3;
269 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
270 wrmsrl(MSR_K7_EVNTSEL3, 0);
271 rdmsrl(MSR_K7_PERFCTR3, pmc3);
272 } else {
273 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
274 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
275 }
276 local_irq_save(flags);
277 /* start meauring cycles, incrementing from 0 */
278 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
279 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
280 rdtscl(tsc_start);
281 do {
282 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
283 tsc_now = get_cycles_sync();
284 } while ((tsc_now - tsc_start) < TICK_COUNT);
285
286 local_irq_restore(flags);
287 if (no_ctr_free) {
288 wrmsrl(MSR_K7_EVNTSEL3, 0);
289 wrmsrl(MSR_K7_PERFCTR3, pmc3);
290 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
291 } else {
292 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
293 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
294 }
295
296 return pmc_now * tsc_khz / (tsc_now - tsc_start);
297}
255 298
256/* 299/*
257 * pit_calibrate_tsc() uses the speaker output (channel 2) of 300 * pit_calibrate_tsc() uses the speaker output (channel 2) of
@@ -285,7 +328,7 @@ static unsigned int __init pit_calibrate_tsc(void)
285#define PIT_MODE 0x43 328#define PIT_MODE 0x43
286#define PIT_CH0 0x40 329#define PIT_CH0 0x40
287 330
288static void __init __pit_init(int val, u8 mode) 331static void __pit_init(int val, u8 mode)
289{ 332{
290 unsigned long flags; 333 unsigned long flags;
291 334
@@ -301,12 +344,12 @@ void __init pit_init(void)
301 __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ 344 __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
302} 345}
303 346
304void __init pit_stop_interrupt(void) 347void pit_stop_interrupt(void)
305{ 348{
306 __pit_init(0, 0x30); /* mode 0 */ 349 __pit_init(0, 0x30); /* mode 0 */
307} 350}
308 351
309void __init stop_timer_interrupt(void) 352void stop_timer_interrupt(void)
310{ 353{
311 char *name; 354 char *name;
312 if (hpet_address) { 355 if (hpet_address) {
@@ -339,23 +382,29 @@ void __init time_init(void)
339 if (hpet_use_timer) { 382 if (hpet_use_timer) {
340 /* set tick_nsec to use the proper rate for HPET */ 383 /* set tick_nsec to use the proper rate for HPET */
341 tick_nsec = TICK_NSEC_HPET; 384 tick_nsec = TICK_NSEC_HPET;
342 cpu_khz = hpet_calibrate_tsc(); 385 tsc_khz = hpet_calibrate_tsc();
343 timename = "HPET"; 386 timename = "HPET";
344 } else { 387 } else {
345 pit_init(); 388 pit_init();
346 cpu_khz = pit_calibrate_tsc(); 389 tsc_khz = pit_calibrate_tsc();
347 timename = "PIT"; 390 timename = "PIT";
348 } 391 }
349 392
393 cpu_khz = tsc_khz;
394 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
395 boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
396 boot_cpu_data.x86 == 16)
397 cpu_khz = tsc_calibrate_cpu_khz();
398
350 if (unsynchronized_tsc()) 399 if (unsynchronized_tsc())
351 mark_tsc_unstable(); 400 mark_tsc_unstable("TSCs unsynchronized");
352 401
353 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) 402 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
354 vgetcpu_mode = VGETCPU_RDTSCP; 403 vgetcpu_mode = VGETCPU_RDTSCP;
355 else 404 else
356 vgetcpu_mode = VGETCPU_LSL; 405 vgetcpu_mode = VGETCPU_LSL;
357 406
358 set_cyc2ns_scale(cpu_khz); 407 set_cyc2ns_scale(tsc_khz);
359 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", 408 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
360 cpu_khz / 1000, cpu_khz % 1000); 409 cpu_khz / 1000, cpu_khz % 1000);
361 init_tsc_clocksource(); 410 init_tsc_clocksource();
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
index c79b99a9e2f..e7e2764c461 100644
--- a/arch/x86_64/kernel/trampoline.S
+++ b/arch/x86_64/kernel/trampoline.S
@@ -3,6 +3,7 @@
3 * Trampoline.S Derived from Setup.S by Linus Torvalds 3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 * 4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as. 5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 * 15 Sept 2005 Eric Biederman: 64bit PIC support
6 * 7 *
7 * Entry: CS:IP point to the start of our code, we are 8 * Entry: CS:IP point to the start of our code, we are
8 * in real mode with no stack, but the rest of the 9 * in real mode with no stack, but the rest of the
@@ -17,15 +18,20 @@
17 * and IP is zero. Thus, data addresses need to be absolute 18 * and IP is zero. Thus, data addresses need to be absolute
18 * (no relocation) and are taken with regard to r_base. 19 * (no relocation) and are taken with regard to r_base.
19 * 20 *
21 * With the addition of trampoline_level4_pgt this code can
22 * now enter a 64bit kernel that lives at arbitrary 64bit
23 * physical addresses.
24 *
20 * If you work on this file, check the object module with objdump 25 * If you work on this file, check the object module with objdump
21 * --full-contents --reloc to make sure there are no relocation 26 * --full-contents --reloc to make sure there are no relocation
22 * entries. For the GDT entry we do hand relocation in smpboot.c 27 * entries.
23 * because of 64bit linker limitations.
24 */ 28 */
25 29
26#include <linux/linkage.h> 30#include <linux/linkage.h>
27#include <asm/segment.h> 31#include <asm/pgtable.h>
28#include <asm/page.h> 32#include <asm/page.h>
33#include <asm/msr.h>
34#include <asm/segment.h>
29 35
30.data 36.data
31 37
@@ -33,15 +39,33 @@
33 39
34ENTRY(trampoline_data) 40ENTRY(trampoline_data)
35r_base = . 41r_base = .
42 cli # We should be safe anyway
36 wbinvd 43 wbinvd
37 mov %cs, %ax # Code and data in the same place 44 mov %cs, %ax # Code and data in the same place
38 mov %ax, %ds 45 mov %ax, %ds
46 mov %ax, %es
47 mov %ax, %ss
39 48
40 cli # We should be safe anyway
41 49
42 movl $0xA5A5A5A5, trampoline_data - r_base 50 movl $0xA5A5A5A5, trampoline_data - r_base
43 # write marker for master knows we're running 51 # write marker for master knows we're running
44 52
53 # Setup stack
54 movw $(trampoline_stack_end - r_base), %sp
55
56 call verify_cpu # Verify the cpu supports long mode
57 testl %eax, %eax # Check for return code
58 jnz no_longmode
59
60 mov %cs, %ax
61 movzx %ax, %esi # Find the 32bit trampoline location
62 shll $4, %esi
63
64 # Fixup the vectors
65 addl %esi, startup_32_vector - r_base
66 addl %esi, startup_64_vector - r_base
67 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer
68
45 /* 69 /*
46 * GDT tables in non default location kernel can be beyond 16MB and 70 * GDT tables in non default location kernel can be beyond 16MB and
47 * lgdt will not be able to load the address as in real mode default 71 * lgdt will not be able to load the address as in real mode default
@@ -49,23 +73,94 @@ r_base = .
49 * to 32 bit. 73 * to 32 bit.
50 */ 74 */
51 75
52 lidtl idt_48 - r_base # load idt with 0, 0 76 lidtl tidt - r_base # load idt with 0, 0
53 lgdtl gdt_48 - r_base # load gdt with whatever is appropriate 77 lgdtl tgdt - r_base # load gdt with whatever is appropriate
54 78
55 xor %ax, %ax 79 xor %ax, %ax
56 inc %ax # protected mode (PE) bit 80 inc %ax # protected mode (PE) bit
57 lmsw %ax # into protected mode 81 lmsw %ax # into protected mode
58 # flaush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S 82
59 ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) 83 # flush prefetch and jump to startup_32
84 ljmpl *(startup_32_vector - r_base)
85
86 .code32
87 .balign 4
88startup_32:
89 movl $__KERNEL_DS, %eax # Initialize the %ds segment register
90 movl %eax, %ds
91
92 xorl %eax, %eax
93 btsl $5, %eax # Enable PAE mode
94 movl %eax, %cr4
95
96 # Setup trampoline 4 level pagetables
97 leal (trampoline_level4_pgt - r_base)(%esi), %eax
98 movl %eax, %cr3
99
100 movl $MSR_EFER, %ecx
101 movl $(1 << _EFER_LME), %eax # Enable Long Mode
102 xorl %edx, %edx
103 wrmsr
104
105 xorl %eax, %eax
106 btsl $31, %eax # Enable paging and in turn activate Long Mode
107 btsl $0, %eax # Enable protected mode
108 movl %eax, %cr0
109
110 /*
111 * At this point we're in long mode but in 32bit compatibility mode
112 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
113 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
114 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
115 */
116 ljmp *(startup_64_vector - r_base)(%esi)
117
118 .code64
119 .balign 4
120startup_64:
121 # Now jump into the kernel using virtual addresses
122 movq $secondary_startup_64, %rax
123 jmp *%rax
124
125 .code16
126no_longmode:
127 hlt
128 jmp no_longmode
129#include "verify_cpu.S"
60 130
61 # Careful these need to be in the same 64K segment as the above; 131 # Careful these need to be in the same 64K segment as the above;
62idt_48: 132tidt:
63 .word 0 # idt limit = 0 133 .word 0 # idt limit = 0
64 .word 0, 0 # idt base = 0L 134 .word 0, 0 # idt base = 0L
65 135
66gdt_48: 136 # Duplicate the global descriptor table
67 .short GDT_ENTRIES*8 - 1 # gdt limit 137 # so the kernel can live anywhere
68 .long cpu_gdt_table-__START_KERNEL_map 138 .balign 4
139tgdt:
140 .short tgdt_end - tgdt # gdt limit
141 .long tgdt - r_base
142 .short 0
143 .quad 0x00cf9b000000ffff # __KERNEL32_CS
144 .quad 0x00af9b000000ffff # __KERNEL_CS
145 .quad 0x00cf93000000ffff # __KERNEL_DS
146tgdt_end:
147
148 .balign 4
149startup_32_vector:
150 .long startup_32 - r_base
151 .word __KERNEL32_CS, 0
152
153 .balign 4
154startup_64_vector:
155 .long startup_64 - r_base
156 .word __KERNEL_CS, 0
157
158trampoline_stack:
159 .org 0x1000
160trampoline_stack_end:
161ENTRY(trampoline_level4_pgt)
162 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
163 .fill 510,8,0
164 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
69 165
70.globl trampoline_end 166ENTRY(trampoline_end)
71trampoline_end:
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 09d2e8a10a4..d76fc32d459 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -426,8 +426,7 @@ void show_registers(struct pt_regs *regs)
426 const int cpu = smp_processor_id(); 426 const int cpu = smp_processor_id();
427 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 427 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
428 428
429 rsp = regs->rsp; 429 rsp = regs->rsp;
430
431 printk("CPU %d ", cpu); 430 printk("CPU %d ", cpu);
432 __show_regs(regs); 431 __show_regs(regs);
433 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 432 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -438,7 +437,6 @@ void show_registers(struct pt_regs *regs)
438 * time of the fault.. 437 * time of the fault..
439 */ 438 */
440 if (in_kernel) { 439 if (in_kernel) {
441
442 printk("Stack: "); 440 printk("Stack: ");
443 _show_stack(NULL, regs, (unsigned long*)rsp); 441 _show_stack(NULL, regs, (unsigned long*)rsp);
444 442
@@ -581,10 +579,20 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
581{ 579{
582 struct task_struct *tsk = current; 580 struct task_struct *tsk = current;
583 581
584 tsk->thread.error_code = error_code;
585 tsk->thread.trap_no = trapnr;
586
587 if (user_mode(regs)) { 582 if (user_mode(regs)) {
583 /*
584 * We want error_code and trap_no set for userspace
585 * faults and kernelspace faults which result in
586 * die(), but not kernelspace faults which are fixed
587 * up. die() gives the process no chance to handle
588 * the signal and notice the kernel fault information,
589 * so that won't result in polluting the information
590 * about previously queued, but not yet delivered,
591 * faults. See also do_general_protection below.
592 */
593 tsk->thread.error_code = error_code;
594 tsk->thread.trap_no = trapnr;
595
588 if (exception_trace && unhandled_signal(tsk, signr)) 596 if (exception_trace && unhandled_signal(tsk, signr))
589 printk(KERN_INFO 597 printk(KERN_INFO
590 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", 598 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
@@ -605,8 +613,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
605 fixup = search_exception_tables(regs->rip); 613 fixup = search_exception_tables(regs->rip);
606 if (fixup) 614 if (fixup)
607 regs->rip = fixup->fixup; 615 regs->rip = fixup->fixup;
608 else 616 else {
617 tsk->thread.error_code = error_code;
618 tsk->thread.trap_no = trapnr;
609 die(str, regs, error_code); 619 die(str, regs, error_code);
620 }
610 return; 621 return;
611 } 622 }
612} 623}
@@ -682,10 +693,10 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
682 693
683 conditional_sti(regs); 694 conditional_sti(regs);
684 695
685 tsk->thread.error_code = error_code;
686 tsk->thread.trap_no = 13;
687
688 if (user_mode(regs)) { 696 if (user_mode(regs)) {
697 tsk->thread.error_code = error_code;
698 tsk->thread.trap_no = 13;
699
689 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) 700 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
690 printk(KERN_INFO 701 printk(KERN_INFO
691 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", 702 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
@@ -704,6 +715,9 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
704 regs->rip = fixup->fixup; 715 regs->rip = fixup->fixup;
705 return; 716 return;
706 } 717 }
718
719 tsk->thread.error_code = error_code;
720 tsk->thread.trap_no = 13;
707 if (notify_die(DIE_GPF, "general protection fault", regs, 721 if (notify_die(DIE_GPF, "general protection fault", regs,
708 error_code, 13, SIGSEGV) == NOTIFY_STOP) 722 error_code, 13, SIGSEGV) == NOTIFY_STOP)
709 return; 723 return;
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 1a0edbbffaa..48f9a8e6aa9 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -13,6 +13,8 @@ static int notsc __initdata = 0;
13 13
14unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 14unsigned int cpu_khz; /* TSC clocks / usec, not used here */
15EXPORT_SYMBOL(cpu_khz); 15EXPORT_SYMBOL(cpu_khz);
16unsigned int tsc_khz;
17EXPORT_SYMBOL(tsc_khz);
16 18
17static unsigned int cyc2ns_scale __read_mostly; 19static unsigned int cyc2ns_scale __read_mostly;
18 20
@@ -77,7 +79,7 @@ static void handle_cpufreq_delayed_get(struct work_struct *v)
77static unsigned int ref_freq = 0; 79static unsigned int ref_freq = 0;
78static unsigned long loops_per_jiffy_ref = 0; 80static unsigned long loops_per_jiffy_ref = 0;
79 81
80static unsigned long cpu_khz_ref = 0; 82static unsigned long tsc_khz_ref = 0;
81 83
82static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 84static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
83 void *data) 85 void *data)
@@ -99,7 +101,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
99 if (!ref_freq) { 101 if (!ref_freq) {
100 ref_freq = freq->old; 102 ref_freq = freq->old;
101 loops_per_jiffy_ref = *lpj; 103 loops_per_jiffy_ref = *lpj;
102 cpu_khz_ref = cpu_khz; 104 tsc_khz_ref = tsc_khz;
103 } 105 }
104 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || 106 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
105 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || 107 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
@@ -107,12 +109,12 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
107 *lpj = 109 *lpj =
108 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); 110 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
109 111
110 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); 112 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
111 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) 113 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
112 mark_tsc_unstable(); 114 mark_tsc_unstable("cpufreq changes");
113 } 115 }
114 116
115 set_cyc2ns_scale(cpu_khz_ref); 117 set_cyc2ns_scale(tsc_khz_ref);
116 118
117 return 0; 119 return 0;
118} 120}
@@ -197,10 +199,11 @@ static struct clocksource clocksource_tsc = {
197 .vread = vread_tsc, 199 .vread = vread_tsc,
198}; 200};
199 201
200void mark_tsc_unstable(void) 202void mark_tsc_unstable(char *reason)
201{ 203{
202 if (!tsc_unstable) { 204 if (!tsc_unstable) {
203 tsc_unstable = 1; 205 tsc_unstable = 1;
206 printk("Marking TSC unstable due to %s\n", reason);
204 /* Change only the rating, when not registered */ 207 /* Change only the rating, when not registered */
205 if (clocksource_tsc.mult) 208 if (clocksource_tsc.mult)
206 clocksource_change_rating(&clocksource_tsc, 0); 209 clocksource_change_rating(&clocksource_tsc, 0);
@@ -213,7 +216,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
213void __init init_tsc_clocksource(void) 216void __init init_tsc_clocksource(void)
214{ 217{
215 if (!notsc) { 218 if (!notsc) {
216 clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, 219 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
217 clocksource_tsc.shift); 220 clocksource_tsc.shift);
218 if (check_tsc_unstable()) 221 if (check_tsc_unstable())
219 clocksource_tsc.rating = 0; 222 clocksource_tsc.rating = 0;
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
index 014f0db45df..355f5f506c8 100644
--- a/arch/x86_64/kernel/tsc_sync.c
+++ b/arch/x86_64/kernel/tsc_sync.c
@@ -50,7 +50,7 @@ static __cpuinit void check_tsc_warp(void)
50 /* 50 /*
51 * The measurement runs for 20 msecs: 51 * The measurement runs for 20 msecs:
52 */ 52 */
53 end = start + cpu_khz * 20ULL; 53 end = start + tsc_khz * 20ULL;
54 now = start; 54 now = start;
55 55
56 for (i = 0; ; i++) { 56 for (i = 0; ; i++) {
@@ -138,7 +138,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
138 printk("\n"); 138 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp); 140 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable(); 141 mark_tsc_unstable("check_tsc_sync_source failed");
142 nr_warps = 0; 142 nr_warps = 0;
143 max_warp = 0; 143 max_warp = 0;
144 last_tsc = 0; 144 last_tsc = 0;
diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S
new file mode 100644
index 00000000000..e035f594819
--- /dev/null
+++ b/arch/x86_64/kernel/verify_cpu.S
@@ -0,0 +1,119 @@
1/*
2 *
3 * verify_cpu.S - Code for cpu long mode and SSE verification. This
4 * code has been borrowed from boot/setup.S and was introduced by
5 * Andi Kleen.
6 *
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 *
11 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details.
13 *
14 * This is a common code for verification whether CPU supports
15 * long mode and SSE or not. It is not called directly instead this
16 * file is included at various places and compiled in that context.
17 * Following are the current usage.
18 *
19 * This file is included by both 16bit and 32bit code.
20 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure
28 *
29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt.
31 */
32
33#include <asm/cpufeature.h>
34
35verify_cpu:
36 pushfl # Save caller passed flags
37 pushl $0 # Kill any dangerous flags
38 popfl
39
40 /* minimum CPUID flags for x86-64 as defined by AMD */
41#define M(x) (1<<(x))
42#define M2(a,b) M(a)|M(b)
43#define M4(a,b,c,d) M(a)|M(b)|M(c)|M(d)
44
45#define SSE_MASK \
46 (M2(X86_FEATURE_XMM,X86_FEATURE_XMM2))
47#define REQUIRED_MASK1 \
48 (M4(X86_FEATURE_FPU,X86_FEATURE_PSE,X86_FEATURE_TSC,X86_FEATURE_MSR)|\
49 M4(X86_FEATURE_PAE,X86_FEATURE_CX8,X86_FEATURE_PGE,X86_FEATURE_CMOV)|\
50 M(X86_FEATURE_FXSR))
51#define REQUIRED_MASK2 \
52 (M(X86_FEATURE_LM - 32))
53
54 pushfl # standard way to check for cpuid
55 popl %eax
56 movl %eax,%ebx
57 xorl $0x200000,%eax
58 pushl %eax
59 popfl
60 pushfl
61 popl %eax
62 cmpl %eax,%ebx
63 jz verify_cpu_no_longmode # cpu has no cpuid
64
65 movl $0x0,%eax # See if cpuid 1 is implemented
66 cpuid
67 cmpl $0x1,%eax
68 jb verify_cpu_no_longmode # no cpuid 1
69
70 xor %di,%di
71 cmpl $0x68747541,%ebx # AuthenticAMD
72 jnz verify_cpu_noamd
73 cmpl $0x69746e65,%edx
74 jnz verify_cpu_noamd
75 cmpl $0x444d4163,%ecx
76 jnz verify_cpu_noamd
77 mov $1,%di # cpu is from AMD
78
79verify_cpu_noamd:
80 movl $0x1,%eax # Does the cpu have what it takes
81 cpuid
82 andl $REQUIRED_MASK1,%edx
83 xorl $REQUIRED_MASK1,%edx
84 jnz verify_cpu_no_longmode
85
86 movl $0x80000000,%eax # See if extended cpuid is implemented
87 cpuid
88 cmpl $0x80000001,%eax
89 jb verify_cpu_no_longmode # no extended cpuid
90
91 movl $0x80000001,%eax # Does the cpu have what it takes
92 cpuid
93 andl $REQUIRED_MASK2,%edx
94 xorl $REQUIRED_MASK2,%edx
95 jnz verify_cpu_no_longmode
96
97verify_cpu_sse_test:
98 movl $1,%eax
99 cpuid
100 andl $SSE_MASK,%edx
101 cmpl $SSE_MASK,%edx
102 je verify_cpu_sse_ok
103 test %di,%di
104 jz verify_cpu_no_longmode # only try to force SSE on AMD
105 movl $0xc0010015,%ecx # HWCR
106 rdmsr
107 btr $15,%eax # enable SSE
108 wrmsr
109 xor %di,%di # don't loop
110 jmp verify_cpu_sse_test # try again
111
112verify_cpu_no_longmode:
113 popfl # Restore caller passed flags
114 movl $1,%eax
115 ret
116verify_cpu_sse_ok:
117 popfl # Restore caller passed flags
118 xorl %eax, %eax
119 ret
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 5176ecf006e..88cfa50b424 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -29,9 +29,7 @@ SECTIONS
29 .text : AT(ADDR(.text) - LOAD_OFFSET) { 29 .text : AT(ADDR(.text) - LOAD_OFFSET) {
30 /* First the code that has to be first for bootstrapping */ 30 /* First the code that has to be first for bootstrapping */
31 *(.bootstrap.text) 31 *(.bootstrap.text)
32 /* Then all the functions that are "hot" in profiles, to group them 32 _stext = .;
33 onto the same hugetlb entry */
34 #include "functionlist"
35 /* Then the rest */ 33 /* Then the rest */
36 *(.text) 34 *(.text)
37 SCHED_TEXT 35 SCHED_TEXT
@@ -50,10 +48,10 @@ SECTIONS
50 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 48 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
51 __stop___ex_table = .; 49 __stop___ex_table = .;
52 50
53 RODATA
54
55 BUG_TABLE 51 BUG_TABLE
56 52
53 RODATA
54
57 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ 55 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
58 /* Data */ 56 /* Data */
59 .data : AT(ADDR(.data) - LOAD_OFFSET) { 57 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -94,6 +92,12 @@ SECTIONS
94 { *(.vsyscall_gtod_data) } 92 { *(.vsyscall_gtod_data) }
95 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); 93 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
96 94
95
96 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
97 { *(.vsyscall_1) }
98 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
99 { *(.vsyscall_2) }
100
97 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } 101 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
98 vgetcpu_mode = VVIRT(.vgetcpu_mode); 102 vgetcpu_mode = VVIRT(.vgetcpu_mode);
99 103
@@ -101,10 +105,6 @@ SECTIONS
101 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } 105 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
102 jiffies = VVIRT(.jiffies); 106 jiffies = VVIRT(.jiffies);
103 107
104 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
105 { *(.vsyscall_1) }
106 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
107 { *(.vsyscall_2) }
108 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) 108 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
109 { *(.vsyscall_3) } 109 { *(.vsyscall_3) }
110 110
@@ -194,7 +194,7 @@ SECTIONS
194 __initramfs_end = .; 194 __initramfs_end = .;
195#endif 195#endif
196 196
197 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(4096);
198 __per_cpu_start = .; 198 __per_cpu_start = .;
199 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } 199 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
200 __per_cpu_end = .; 200 __per_cpu_end = .;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index b43c698cf7d..dc32cef9619 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -45,14 +45,34 @@
45 45
46#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 46#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
47#define __syscall_clobber "r11","rcx","memory" 47#define __syscall_clobber "r11","rcx","memory"
48#define __pa_vsymbol(x) \
49 ({unsigned long v; \
50 extern char __vsyscall_0; \
51 asm("" : "=r" (v) : "0" (x)); \
52 ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
48 53
54/*
55 * vsyscall_gtod_data contains data that is :
56 * - readonly from vsyscalls
57 * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
58 * Try to keep this structure as small as possible to avoid cache line ping pongs
59 */
49struct vsyscall_gtod_data_t { 60struct vsyscall_gtod_data_t {
50 seqlock_t lock; 61 seqlock_t lock;
51 int sysctl_enabled; 62
52 struct timeval wall_time_tv; 63 /* open coded 'struct timespec' */
64 time_t wall_time_sec;
65 u32 wall_time_nsec;
66
67 int sysctl_enabled;
53 struct timezone sys_tz; 68 struct timezone sys_tz;
54 cycle_t offset_base; 69 struct { /* extract of a clocksource struct */
55 struct clocksource clock; 70 cycle_t (*vread)(void);
71 cycle_t cycle_last;
72 cycle_t mask;
73 u32 mult;
74 u32 shift;
75 } clock;
56}; 76};
57int __vgetcpu_mode __section_vgetcpu_mode; 77int __vgetcpu_mode __section_vgetcpu_mode;
58 78
@@ -68,9 +88,13 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
68 88
69 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); 89 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
70 /* copy vsyscall data */ 90 /* copy vsyscall data */
71 vsyscall_gtod_data.clock = *clock; 91 vsyscall_gtod_data.clock.vread = clock->vread;
72 vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; 92 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
73 vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; 93 vsyscall_gtod_data.clock.mask = clock->mask;
94 vsyscall_gtod_data.clock.mult = clock->mult;
95 vsyscall_gtod_data.clock.shift = clock->shift;
96 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
97 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
74 vsyscall_gtod_data.sys_tz = sys_tz; 98 vsyscall_gtod_data.sys_tz = sys_tz;
75 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 99 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
76} 100}
@@ -105,7 +129,8 @@ static __always_inline long time_syscall(long *t)
105static __always_inline void do_vgettimeofday(struct timeval * tv) 129static __always_inline void do_vgettimeofday(struct timeval * tv)
106{ 130{
107 cycle_t now, base, mask, cycle_delta; 131 cycle_t now, base, mask, cycle_delta;
108 unsigned long seq, mult, shift, nsec_delta; 132 unsigned seq;
133 unsigned long mult, shift, nsec;
109 cycle_t (*vread)(void); 134 cycle_t (*vread)(void);
110 do { 135 do {
111 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 136 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
@@ -121,21 +146,20 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
121 mult = __vsyscall_gtod_data.clock.mult; 146 mult = __vsyscall_gtod_data.clock.mult;
122 shift = __vsyscall_gtod_data.clock.shift; 147 shift = __vsyscall_gtod_data.clock.shift;
123 148
124 *tv = __vsyscall_gtod_data.wall_time_tv; 149 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
125 150 nsec = __vsyscall_gtod_data.wall_time_nsec;
126 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 151 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
127 152
128 /* calculate interval: */ 153 /* calculate interval: */
129 cycle_delta = (now - base) & mask; 154 cycle_delta = (now - base) & mask;
130 /* convert to nsecs: */ 155 /* convert to nsecs: */
131 nsec_delta = (cycle_delta * mult) >> shift; 156 nsec += (cycle_delta * mult) >> shift;
132 157
133 /* convert to usecs and add to timespec: */ 158 while (nsec >= NSEC_PER_SEC) {
134 tv->tv_usec += nsec_delta / NSEC_PER_USEC;
135 while (tv->tv_usec > USEC_PER_SEC) {
136 tv->tv_sec += 1; 159 tv->tv_sec += 1;
137 tv->tv_usec -= USEC_PER_SEC; 160 nsec -= NSEC_PER_SEC;
138 } 161 }
162 tv->tv_usec = nsec / NSEC_PER_USEC;
139} 163}
140 164
141int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 165int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
@@ -151,11 +175,13 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
151 * unlikely */ 175 * unlikely */
152time_t __vsyscall(1) vtime(time_t *t) 176time_t __vsyscall(1) vtime(time_t *t)
153{ 177{
178 time_t result;
154 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 179 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
155 return time_syscall(t); 180 return time_syscall(t);
156 else if (t) 181 result = __vsyscall_gtod_data.wall_time_sec;
157 *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; 182 if (t)
158 return __vsyscall_gtod_data.wall_time_tv.tv_sec; 183 *t = result;
184 return result;
159} 185}
160 186
161/* Fast way to get current CPU and node. 187/* Fast way to get current CPU and node.
@@ -224,10 +250,10 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
224 return ret; 250 return ret;
225 /* gcc has some trouble with __va(__pa()), so just do it this 251 /* gcc has some trouble with __va(__pa()), so just do it this
226 way. */ 252 way. */
227 map1 = ioremap(__pa_symbol(&vsysc1), 2); 253 map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
228 if (!map1) 254 if (!map1)
229 return -ENOMEM; 255 return -ENOMEM;
230 map2 = ioremap(__pa_symbol(&vsysc2), 2); 256 map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
231 if (!map2) { 257 if (!map2) {
232 ret = -ENOMEM; 258 ret = -ENOMEM;
233 goto out; 259 goto out;
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 6ada7231f3a..de99dba2c51 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -585,7 +585,7 @@ do_sigbus:
585} 585}
586 586
587DEFINE_SPINLOCK(pgd_lock); 587DEFINE_SPINLOCK(pgd_lock);
588struct page *pgd_list; 588LIST_HEAD(pgd_list);
589 589
590void vmalloc_sync_all(void) 590void vmalloc_sync_all(void)
591{ 591{
@@ -605,8 +605,7 @@ void vmalloc_sync_all(void)
605 if (pgd_none(*pgd_ref)) 605 if (pgd_none(*pgd_ref))
606 continue; 606 continue;
607 spin_lock(&pgd_lock); 607 spin_lock(&pgd_lock);
608 for (page = pgd_list; page; 608 list_for_each_entry(page, &pgd_list, lru) {
609 page = (struct page *)page->index) {
610 pgd_t *pgd; 609 pgd_t *pgd;
611 pgd = (pgd_t *)page_address(page) + pgd_index(address); 610 pgd = (pgd_t *)page_address(page) + pgd_index(address);
612 if (pgd_none(*pgd)) 611 if (pgd_none(*pgd))
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index ec31534eb10..282b0a8f00a 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,10 +22,12 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 23#include <linux/proc_fs.h>
24#include <linux/pci.h> 24#include <linux/pci.h>
25#include <linux/pfn.h>
25#include <linux/poison.h> 26#include <linux/poison.h>
26#include <linux/dma-mapping.h> 27#include <linux/dma-mapping.h>
27#include <linux/module.h> 28#include <linux/module.h>
28#include <linux/memory_hotplug.h> 29#include <linux/memory_hotplug.h>
30#include <linux/nmi.h>
29 31
30#include <asm/processor.h> 32#include <asm/processor.h>
31#include <asm/system.h> 33#include <asm/system.h>
@@ -46,7 +48,7 @@
46#define Dprintk(x...) 48#define Dprintk(x...)
47#endif 49#endif
48 50
49struct dma_mapping_ops* dma_ops; 51const struct dma_mapping_ops* dma_ops;
50EXPORT_SYMBOL(dma_ops); 52EXPORT_SYMBOL(dma_ops);
51 53
52static unsigned long dma_reserve __initdata; 54static unsigned long dma_reserve __initdata;
@@ -72,6 +74,11 @@ void show_mem(void)
72 74
73 for_each_online_pgdat(pgdat) { 75 for_each_online_pgdat(pgdat) {
74 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 76 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77 /* this loop can take a while with 256 GB and 4k pages
78 so update the NMI watchdog */
79 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
80 touch_nmi_watchdog();
81 }
75 page = pfn_to_page(pgdat->node_start_pfn + i); 82 page = pfn_to_page(pgdat->node_start_pfn + i);
76 total++; 83 total++;
77 if (PageReserved(page)) 84 if (PageReserved(page))
@@ -167,23 +174,9 @@ __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167 174
168unsigned long __initdata table_start, table_end; 175unsigned long __initdata table_start, table_end;
169 176
170extern pmd_t temp_boot_pmds[]; 177static __meminit void *alloc_low_page(unsigned long *phys)
171
172static struct temp_map {
173 pmd_t *pmd;
174 void *address;
175 int allocated;
176} temp_mappings[] __initdata = {
177 { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
178 { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
179 {}
180};
181
182static __meminit void *alloc_low_page(int *index, unsigned long *phys)
183{ 178{
184 struct temp_map *ti; 179 unsigned long pfn = table_end++;
185 int i;
186 unsigned long pfn = table_end++, paddr;
187 void *adr; 180 void *adr;
188 181
189 if (after_bootmem) { 182 if (after_bootmem) {
@@ -194,57 +187,63 @@ static __meminit void *alloc_low_page(int *index, unsigned long *phys)
194 187
195 if (pfn >= end_pfn) 188 if (pfn >= end_pfn)
196 panic("alloc_low_page: ran out of memory"); 189 panic("alloc_low_page: ran out of memory");
197 for (i = 0; temp_mappings[i].allocated; i++) { 190
198 if (!temp_mappings[i].pmd) 191 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
199 panic("alloc_low_page: ran out of temp mappings");
200 }
201 ti = &temp_mappings[i];
202 paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
203 set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
204 ti->allocated = 1;
205 __flush_tlb();
206 adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
207 memset(adr, 0, PAGE_SIZE); 192 memset(adr, 0, PAGE_SIZE);
208 *index = i; 193 *phys = pfn * PAGE_SIZE;
209 *phys = pfn * PAGE_SIZE; 194 return adr;
210 return adr; 195}
211}
212 196
213static __meminit void unmap_low_page(int i) 197static __meminit void unmap_low_page(void *adr)
214{ 198{
215 struct temp_map *ti;
216 199
217 if (after_bootmem) 200 if (after_bootmem)
218 return; 201 return;
219 202
220 ti = &temp_mappings[i]; 203 early_iounmap(adr, PAGE_SIZE);
221 set_pmd(ti->pmd, __pmd(0));
222 ti->allocated = 0;
223} 204}
224 205
225/* Must run before zap_low_mappings */ 206/* Must run before zap_low_mappings */
226__init void *early_ioremap(unsigned long addr, unsigned long size) 207__init void *early_ioremap(unsigned long addr, unsigned long size)
227{ 208{
228 unsigned long map = round_down(addr, LARGE_PAGE_SIZE); 209 unsigned long vaddr;
229 210 pmd_t *pmd, *last_pmd;
230 /* actually usually some more */ 211 int i, pmds;
231 if (size >= LARGE_PAGE_SIZE) { 212
232 return NULL; 213 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
214 vaddr = __START_KERNEL_map;
215 pmd = level2_kernel_pgt;
216 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
217 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
218 for (i = 0; i < pmds; i++) {
219 if (pmd_present(pmd[i]))
220 goto next;
221 }
222 vaddr += addr & ~PMD_MASK;
223 addr &= PMD_MASK;
224 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
225 set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
226 __flush_tlb();
227 return (void *)vaddr;
228 next:
229 ;
233 } 230 }
234 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); 231 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
235 map += LARGE_PAGE_SIZE; 232 return NULL;
236 set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
237 __flush_tlb();
238 return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
239} 233}
240 234
241/* To avoid virtual aliases later */ 235/* To avoid virtual aliases later */
242__init void early_iounmap(void *addr, unsigned long size) 236__init void early_iounmap(void *addr, unsigned long size)
243{ 237{
244 if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) 238 unsigned long vaddr;
245 printk("early_iounmap: bad address %p\n", addr); 239 pmd_t *pmd;
246 set_pmd(temp_mappings[0].pmd, __pmd(0)); 240 int i, pmds;
247 set_pmd(temp_mappings[1].pmd, __pmd(0)); 241
242 vaddr = (unsigned long)addr;
243 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
244 pmd = level2_kernel_pgt + pmd_index(vaddr);
245 for (i = 0; i < pmds; i++)
246 pmd_clear(pmd + i);
248 __flush_tlb(); 247 __flush_tlb();
249} 248}
250 249
@@ -289,7 +288,6 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
289 288
290 289
291 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { 290 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
292 int map;
293 unsigned long pmd_phys; 291 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr); 292 pud_t *pud = pud_page + pud_index(addr);
295 pmd_t *pmd; 293 pmd_t *pmd;
@@ -307,12 +305,12 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
307 continue; 305 continue;
308 } 306 }
309 307
310 pmd = alloc_low_page(&map, &pmd_phys); 308 pmd = alloc_low_page(&pmd_phys);
311 spin_lock(&init_mm.page_table_lock); 309 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 310 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, addr, end); 311 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock); 312 spin_unlock(&init_mm.page_table_lock);
315 unmap_low_page(map); 313 unmap_low_page(pmd);
316 } 314 }
317 __flush_tlb(); 315 __flush_tlb();
318} 316}
@@ -364,7 +362,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
364 end = (unsigned long)__va(end); 362 end = (unsigned long)__va(end);
365 363
366 for (; start < end; start = next) { 364 for (; start < end; start = next) {
367 int map;
368 unsigned long pud_phys; 365 unsigned long pud_phys;
369 pgd_t *pgd = pgd_offset_k(start); 366 pgd_t *pgd = pgd_offset_k(start);
370 pud_t *pud; 367 pud_t *pud;
@@ -372,7 +369,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
372 if (after_bootmem) 369 if (after_bootmem)
373 pud = pud_offset(pgd, start & PGDIR_MASK); 370 pud = pud_offset(pgd, start & PGDIR_MASK);
374 else 371 else
375 pud = alloc_low_page(&map, &pud_phys); 372 pud = alloc_low_page(&pud_phys);
376 373
377 next = start + PGDIR_SIZE; 374 next = start + PGDIR_SIZE;
378 if (next > end) 375 if (next > end)
@@ -380,7 +377,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
380 phys_pud_init(pud, __pa(start), __pa(next)); 377 phys_pud_init(pud, __pa(start), __pa(next));
381 if (!after_bootmem) 378 if (!after_bootmem)
382 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); 379 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
383 unmap_low_page(map); 380 unmap_low_page(pud);
384 } 381 }
385 382
386 if (!after_bootmem) 383 if (!after_bootmem)
@@ -388,21 +385,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
388 __flush_tlb_all(); 385 __flush_tlb_all();
389} 386}
390 387
391void __cpuinit zap_low_mappings(int cpu)
392{
393 if (cpu == 0) {
394 pgd_t *pgd = pgd_offset_k(0UL);
395 pgd_clear(pgd);
396 } else {
397 /*
398 * For AP's, zap the low identity mappings by changing the cr3
399 * to init_level4_pgt and doing local flush tlb all
400 */
401 asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
402 }
403 __flush_tlb_all();
404}
405
406#ifndef CONFIG_NUMA 388#ifndef CONFIG_NUMA
407void __init paging_init(void) 389void __init paging_init(void)
408{ 390{
@@ -579,15 +561,6 @@ void __init mem_init(void)
579 reservedpages << (PAGE_SHIFT-10), 561 reservedpages << (PAGE_SHIFT-10),
580 datasize >> 10, 562 datasize >> 10,
581 initsize >> 10); 563 initsize >> 10);
582
583#ifdef CONFIG_SMP
584 /*
585 * Sync boot_level4_pgt mappings with the init_level4_pgt
586 * except for the low identity mappings which are already zapped
587 * in init_level4_pgt. This sync-up is essential for AP's bringup
588 */
589 memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
590#endif
591} 564}
592 565
593void free_init_pages(char *what, unsigned long begin, unsigned long end) 566void free_init_pages(char *what, unsigned long begin, unsigned long end)
@@ -597,37 +570,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
597 if (begin >= end) 570 if (begin >= end)
598 return; 571 return;
599 572
600 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); 573 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
601 for (addr = begin; addr < end; addr += PAGE_SIZE) { 574 for (addr = begin; addr < end; addr += PAGE_SIZE) {
602 ClearPageReserved(virt_to_page(addr)); 575 struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
603 init_page_count(virt_to_page(addr)); 576 ClearPageReserved(page);
604 memset((void *)(addr & ~(PAGE_SIZE-1)), 577 init_page_count(page);
605 POISON_FREE_INITMEM, PAGE_SIZE); 578 memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
606 free_page(addr); 579 if (addr >= __START_KERNEL_map)
580 change_page_attr_addr(addr, 1, __pgprot(0));
581 __free_page(page);
607 totalram_pages++; 582 totalram_pages++;
608 } 583 }
584 if (addr > __START_KERNEL_map)
585 global_flush_tlb();
609} 586}
610 587
611void free_initmem(void) 588void free_initmem(void)
612{ 589{
613 memset(__initdata_begin, POISON_FREE_INITDATA,
614 __initdata_end - __initdata_begin);
615 free_init_pages("unused kernel memory", 590 free_init_pages("unused kernel memory",
616 (unsigned long)(&__init_begin), 591 __pa_symbol(&__init_begin),
617 (unsigned long)(&__init_end)); 592 __pa_symbol(&__init_end));
618} 593}
619 594
620#ifdef CONFIG_DEBUG_RODATA 595#ifdef CONFIG_DEBUG_RODATA
621 596
622void mark_rodata_ro(void) 597void mark_rodata_ro(void)
623{ 598{
624 unsigned long addr = (unsigned long)__start_rodata; 599 unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size;
625 600
626 for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) 601#ifdef CONFIG_HOTPLUG_CPU
627 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); 602 /* It must still be possible to apply SMP alternatives. */
603 if (num_possible_cpus() > 1)
604 start = PFN_ALIGN(__va(__pa_symbol(&_etext)));
605#endif
606 size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start;
607 change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO);
628 608
629 printk ("Write protecting the kernel read-only data: %luk\n", 609 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
630 (__end_rodata - __start_rodata) >> 10); 610 size >> 10);
631 611
632 /* 612 /*
633 * change_page_attr_addr() requires a global_flush_tlb() call after it. 613 * change_page_attr_addr() requires a global_flush_tlb() call after it.
@@ -642,7 +622,7 @@ void mark_rodata_ro(void)
642#ifdef CONFIG_BLK_DEV_INITRD 622#ifdef CONFIG_BLK_DEV_INITRD
643void free_initrd_mem(unsigned long start, unsigned long end) 623void free_initrd_mem(unsigned long start, unsigned long end)
644{ 624{
645 free_init_pages("initrd memory", start, end); 625 free_init_pages("initrd memory", __pa(start), __pa(end));
646} 626}
647#endif 627#endif
648 628
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index b5b8dba28b4..f983c75825d 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -49,11 +49,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
49 int found = 0; 49 int found = 0;
50 u32 reg; 50 u32 reg;
51 unsigned numnodes; 51 unsigned numnodes;
52 nodemask_t nodes_parsed;
53 unsigned dualcore = 0; 52 unsigned dualcore = 0;
54 53
55 nodes_clear(nodes_parsed);
56
57 if (!early_pci_allowed()) 54 if (!early_pci_allowed())
58 return -1; 55 return -1;
59 56
@@ -65,6 +62,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
65 62
66 reg = read_pci_config(0, nb, 0, 0x60); 63 reg = read_pci_config(0, nb, 0, 0x60);
67 numnodes = ((reg >> 4) & 0xF) + 1; 64 numnodes = ((reg >> 4) & 0xF) + 1;
65 if (numnodes <= 1)
66 return -1;
68 67
69 printk(KERN_INFO "Number of nodes %d\n", numnodes); 68 printk(KERN_INFO "Number of nodes %d\n", numnodes);
70 69
@@ -102,7 +101,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
102 nodeid, (base>>8)&3, (limit>>8) & 3); 101 nodeid, (base>>8)&3, (limit>>8) & 3);
103 return -1; 102 return -1;
104 } 103 }
105 if (node_isset(nodeid, nodes_parsed)) { 104 if (node_isset(nodeid, node_possible_map)) {
106 printk(KERN_INFO "Node %d already present. Skipping\n", 105 printk(KERN_INFO "Node %d already present. Skipping\n",
107 nodeid); 106 nodeid);
108 continue; 107 continue;
@@ -155,7 +154,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
155 154
156 prevbase = base; 155 prevbase = base;
157 156
158 node_set(nodeid, nodes_parsed); 157 node_set(nodeid, node_possible_map);
159 } 158 }
160 159
161 if (!found) 160 if (!found)
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 41b8fb06992..51548947ad3 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,125 +273,213 @@ void __init numa_init_array(void)
273 273
274#ifdef CONFIG_NUMA_EMU 274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */ 275/* Numa emulation */
276int numa_fake __initdata = 0; 276#define E820_ADDR_HOLE_SIZE(start, end) \
277 (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
278 PAGE_SHIFT)
279char *cmdline __initdata;
277 280
278/* 281/*
279 * This function is used to find out if the start and end correspond to 282 * Setups up nid to range from addr to addr + size. If the end boundary is
280 * different zones. 283 * greater than max_addr, then max_addr is used instead. The return value is 0
284 * if there is additional memory left for allocation past addr and -1 otherwise.
285 * addr is adjusted to be at the end of the node.
281 */ 286 */
282int zone_cross_over(unsigned long start, unsigned long end) 287static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
288 u64 size, u64 max_addr)
283{ 289{
284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && 290 int ret = 0;
285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) 291 nodes[nid].start = *addr;
286 return 1; 292 *addr += size;
287 return 0; 293 if (*addr >= max_addr) {
294 *addr = max_addr;
295 ret = -1;
296 }
297 nodes[nid].end = *addr;
298 node_set(nid, node_possible_map);
299 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
300 nodes[nid].start, nodes[nid].end,
301 (nodes[nid].end - nodes[nid].start) >> 20);
302 return ret;
288} 303}
289 304
290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 305/*
306 * Splits num_nodes nodes up equally starting at node_start. The return value
307 * is the number of nodes split up and addr is adjusted to be at the end of the
308 * last node allocated.
309 */
310static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
311 u64 max_addr, int node_start,
312 int num_nodes)
291{ 313{
292 int i, big; 314 unsigned int big;
293 struct bootnode nodes[MAX_NUMNODES]; 315 u64 size;
294 unsigned long sz, old_sz; 316 int i;
295 unsigned long hole_size;
296 unsigned long start, end;
297 unsigned long max_addr = (end_pfn << PAGE_SHIFT);
298
299 start = (start_pfn << PAGE_SHIFT);
300 hole_size = e820_hole_size(start, max_addr);
301 sz = (max_addr - start - hole_size) / numa_fake;
302
303 /* Kludge needed for the hash function */
304
305 old_sz = sz;
306 /*
307 * Round down to the nearest FAKE_NODE_MIN_SIZE.
308 */
309 sz &= FAKE_NODE_MIN_HASH_MASK;
310 317
318 if (num_nodes <= 0)
319 return -1;
320 if (num_nodes > MAX_NUMNODES)
321 num_nodes = MAX_NUMNODES;
322 size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
323 num_nodes;
311 /* 324 /*
312 * We ensure that each node is at least 64MB big. Smaller than this 325 * Calculate the number of big nodes that can be allocated as a result
313 * size can cause VM hiccups. 326 * of consolidating the leftovers.
314 */ 327 */
315 if (sz == 0) { 328 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing " 329 FAKE_NODE_MIN_SIZE;
317 "the number of nodes\n", numa_fake); 330
318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; 331 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
319 printk(KERN_INFO "Number of fake nodes will be = %d\n", 332 size &= FAKE_NODE_MIN_HASH_MASK;
320 numa_fake); 333 if (!size) {
321 sz = FAKE_NODE_MIN_SIZE; 334 printk(KERN_ERR "Not enough memory for each node. "
335 "NUMA emulation disabled.\n");
336 return -1;
322 } 337 }
323 /* 338
324 * Find out how many nodes can get an extra NODE_MIN_SIZE granule. 339 for (i = node_start; i < num_nodes + node_start; i++) {
325 * This logic ensures the extra memory gets distributed among as many 340 u64 end = *addr + size;
326 * nodes as possible (as compared to one single node getting all that
327 * extra memory.
328 */
329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
331 "%d\n",
332 (sz >> 20), (hole_size >> 20), big);
333 memset(&nodes,0,sizeof(nodes));
334 end = start;
335 for (i = 0; i < numa_fake; i++) {
336 /*
337 * In case we are not able to allocate enough memory for all
338 * the nodes, we reduce the number of fake nodes.
339 */
340 if (end >= max_addr) {
341 numa_fake = i - 1;
342 break;
343 }
344 start = nodes[i].start = end;
345 /*
346 * Final node can have all the remaining memory.
347 */
348 if (i == numa_fake-1)
349 sz = max_addr - start;
350 end = nodes[i].start + sz;
351 /*
352 * Fir "big" number of nodes get extra granule.
353 */
354 if (i < big) 341 if (i < big)
355 end += FAKE_NODE_MIN_SIZE; 342 end += FAKE_NODE_MIN_SIZE;
356 /* 343 /*
357 * Iterate over the range to ensure that this node gets at 344 * The final node can have the remaining system RAM. Other
358 * least sz amount of RAM (excluding holes) 345 * nodes receive roughly the same amount of available pages.
359 */ 346 */
360 while ((end - start - e820_hole_size(start, end)) < sz) { 347 if (i == num_nodes + node_start - 1)
361 end += FAKE_NODE_MIN_SIZE; 348 end = max_addr;
362 if (end >= max_addr) 349 else
363 break; 350 while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
351 size) {
352 end += FAKE_NODE_MIN_SIZE;
353 if (end > max_addr) {
354 end = max_addr;
355 break;
356 }
357 }
358 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
359 break;
360 }
361 return i - node_start + 1;
362}
363
364/*
365 * Splits the remaining system RAM into chunks of size. The remaining memory is
366 * always assigned to a final node and can be asymmetric. Returns the number of
367 * nodes split.
368 */
369static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
370 u64 max_addr, int node_start, u64 size)
371{
372 int i = node_start;
373 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
374 while (!setup_node_range(i++, nodes, addr, size, max_addr))
375 ;
376 return i - node_start;
377}
378
379/*
380 * Sets up the system RAM area from start_pfn to end_pfn according to the
381 * numa=fake command-line option.
382 */
383static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
384{
385 struct bootnode nodes[MAX_NUMNODES];
386 u64 addr = start_pfn << PAGE_SHIFT;
387 u64 max_addr = end_pfn << PAGE_SHIFT;
388 int num_nodes = 0;
389 int coeff_flag;
390 int coeff = -1;
391 int num = 0;
392 u64 size;
393 int i;
394
395 memset(&nodes, 0, sizeof(nodes));
396 /*
397 * If the numa=fake command-line is just a single number N, split the
398 * system RAM into N fake nodes.
399 */
400 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
401 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
402 simple_strtol(cmdline, NULL, 0));
403 if (num_nodes < 0)
404 return num_nodes;
405 goto out;
406 }
407
408 /* Parse the command line. */
409 for (coeff_flag = 0; ; cmdline++) {
410 if (*cmdline && isdigit(*cmdline)) {
411 num = num * 10 + *cmdline - '0';
412 continue;
364 } 413 }
365 /* 414 if (*cmdline == '*') {
366 * Look at the next node to make sure there is some real memory 415 if (num > 0)
367 * to map. Bad things happen when the only memory present 416 coeff = num;
368 * in a zone on a fake node is IO hole. 417 coeff_flag = 1;
369 */ 418 }
370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { 419 if (!*cmdline || *cmdline == ',') {
371 if (zone_cross_over(start, end + sz)) { 420 if (!coeff_flag)
372 end = (MAX_DMA32_PFN << PAGE_SHIFT); 421 coeff = 1;
422 /*
423 * Round down to the nearest FAKE_NODE_MIN_SIZE.
424 * Command-line coefficients are in megabytes.
425 */
426 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
427 if (size)
428 for (i = 0; i < coeff; i++, num_nodes++)
429 if (setup_node_range(num_nodes, nodes,
430 &addr, size, max_addr) < 0)
431 goto done;
432 if (!*cmdline)
373 break; 433 break;
374 } 434 coeff_flag = 0;
375 if (end >= max_addr) 435 coeff = -1;
436 }
437 num = 0;
438 }
439done:
440 if (!num_nodes)
441 return -1;
442 /* Fill remainder of system RAM, if appropriate. */
443 if (addr < max_addr) {
444 if (coeff_flag && coeff < 0) {
445 /* Split remaining nodes into num-sized chunks */
446 num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
447 num_nodes, num);
448 goto out;
449 }
450 switch (*(cmdline - 1)) {
451 case '*':
452 /* Split remaining nodes into coeff chunks */
453 if (coeff <= 0)
376 break; 454 break;
377 end += FAKE_NODE_MIN_SIZE; 455 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
456 num_nodes, coeff);
457 break;
458 case ',':
459 /* Do not allocate remaining system RAM */
460 break;
461 default:
462 /* Give one final node */
463 setup_node_range(num_nodes, nodes, &addr,
464 max_addr - addr, max_addr);
465 num_nodes++;
378 } 466 }
379 if (end > max_addr) 467 }
380 end = max_addr; 468out:
381 nodes[i].end = end; 469 memnode_shift = compute_hash_shift(nodes, num_nodes);
382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 470 if (memnode_shift < 0) {
383 i, 471 memnode_shift = 0;
384 nodes[i].start, nodes[i].end, 472 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
385 (nodes[i].end - nodes[i].start) >> 20); 473 "disabled.\n");
386 node_set_online(i); 474 return -1;
387 } 475 }
388 memnode_shift = compute_hash_shift(nodes, numa_fake); 476
389 if (memnode_shift < 0) { 477 /*
390 memnode_shift = 0; 478 * We need to vacate all active ranges that may have been registered by
391 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 479 * SRAT.
392 return -1; 480 */
393 } 481 remove_all_active_ranges();
394 for_each_online_node(i) { 482 for_each_node_mask(i, node_possible_map) {
395 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 483 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
396 nodes[i].end >> PAGE_SHIFT); 484 nodes[i].end >> PAGE_SHIFT);
397 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 485 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -399,26 +487,32 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
399 numa_init_array(); 487 numa_init_array();
400 return 0; 488 return 0;
401} 489}
402#endif 490#undef E820_ADDR_HOLE_SIZE
491#endif /* CONFIG_NUMA_EMU */
403 492
404void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 493void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
405{ 494{
406 int i; 495 int i;
407 496
497 nodes_clear(node_possible_map);
498
408#ifdef CONFIG_NUMA_EMU 499#ifdef CONFIG_NUMA_EMU
409 if (numa_fake && !numa_emulation(start_pfn, end_pfn)) 500 if (cmdline && !numa_emulation(start_pfn, end_pfn))
410 return; 501 return;
502 nodes_clear(node_possible_map);
411#endif 503#endif
412 504
413#ifdef CONFIG_ACPI_NUMA 505#ifdef CONFIG_ACPI_NUMA
414 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 506 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
415 end_pfn << PAGE_SHIFT)) 507 end_pfn << PAGE_SHIFT))
416 return; 508 return;
509 nodes_clear(node_possible_map);
417#endif 510#endif
418 511
419#ifdef CONFIG_K8_NUMA 512#ifdef CONFIG_K8_NUMA
420 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) 513 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
421 return; 514 return;
515 nodes_clear(node_possible_map);
422#endif 516#endif
423 printk(KERN_INFO "%s\n", 517 printk(KERN_INFO "%s\n",
424 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 518 numa_off ? "NUMA turned off" : "No NUMA configuration found");
@@ -432,6 +526,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
432 memnodemap[0] = 0; 526 memnodemap[0] = 0;
433 nodes_clear(node_online_map); 527 nodes_clear(node_online_map);
434 node_set_online(0); 528 node_set_online(0);
529 node_set(0, node_possible_map);
435 for (i = 0; i < NR_CPUS; i++) 530 for (i = 0; i < NR_CPUS; i++)
436 numa_set_node(i, 0); 531 numa_set_node(i, 0);
437 node_to_cpumask[0] = cpumask_of_cpu(0); 532 node_to_cpumask[0] = cpumask_of_cpu(0);
@@ -486,11 +581,8 @@ static __init int numa_setup(char *opt)
486 if (!strncmp(opt,"off",3)) 581 if (!strncmp(opt,"off",3))
487 numa_off = 1; 582 numa_off = 1;
488#ifdef CONFIG_NUMA_EMU 583#ifdef CONFIG_NUMA_EMU
489 if(!strncmp(opt, "fake=", 5)) { 584 if (!strncmp(opt, "fake=", 5))
490 numa_fake = simple_strtoul(opt+5,NULL,0); ; 585 cmdline = opt + 5;
491 if (numa_fake >= MAX_NUMNODES)
492 numa_fake = MAX_NUMNODES;
493 }
494#endif 586#endif
495#ifdef CONFIG_ACPI_NUMA 587#ifdef CONFIG_ACPI_NUMA
496 if (!strncmp(opt,"noacpi",6)) 588 if (!strncmp(opt,"noacpi",6))
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 081409aa345..bf4aa8dd425 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
51 SetPagePrivate(base); 51 SetPagePrivate(base);
52 page_private(base) = 0; 52 page_private(base) = 0;
53 53
54 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK; 54 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base); 55 pbase = (pte_t *)page_address(base);
57 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { 56 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
@@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage)
101 * No more special protections in this 2/4MB area - revert to a 100 * No more special protections in this 2/4MB area - revert to a
102 * large page again. 101 * large page again.
103 */ 102 */
104static void revert_page(unsigned long address, pgprot_t ref_prot) 103static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot)
105{ 104{
106 pgd_t *pgd; 105 pgd_t *pgd;
107 pud_t *pud; 106 pud_t *pud;
108 pmd_t *pmd; 107 pmd_t *pmd;
109 pte_t large_pte; 108 pte_t large_pte;
110 unsigned long pfn;
111 109
112 pgd = pgd_offset_k(address); 110 pgd = pgd_offset_k(address);
113 BUG_ON(pgd_none(*pgd)); 111 BUG_ON(pgd_none(*pgd));
@@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
115 BUG_ON(pud_none(*pud)); 113 BUG_ON(pud_none(*pud));
116 pmd = pmd_offset(pud, address); 114 pmd = pmd_offset(pud, address);
117 BUG_ON(pmd_val(*pmd) & _PAGE_PSE); 115 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
118 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
119 large_pte = pfn_pte(pfn, ref_prot); 116 large_pte = pfn_pte(pfn, ref_prot);
120 large_pte = pte_mkhuge(large_pte); 117 large_pte = pte_mkhuge(large_pte);
121 set_pte((pte_t *)pmd, large_pte); 118 set_pte((pte_t *)pmd, large_pte);
@@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
141 */ 138 */
142 struct page *split; 139 struct page *split;
143 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); 140 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
144 split = split_large_page(address, prot, ref_prot2); 141 split = split_large_page(pfn << PAGE_SHIFT, prot,
142 ref_prot2);
145 if (!split) 143 if (!split)
146 return -ENOMEM; 144 return -ENOMEM;
147 set_pte(kpte, mk_pte(split, ref_prot2)); 145 set_pte(kpte, mk_pte(split, ref_prot2));
@@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
160 158
161 if (page_private(kpte_page) == 0) { 159 if (page_private(kpte_page) == 0) {
162 save_page(kpte_page); 160 save_page(kpte_page);
163 revert_page(address, ref_prot); 161 revert_page(address, pfn, ref_prot);
164 } 162 }
165 return 0; 163 return 0;
166} 164}
@@ -180,22 +178,32 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
180 */ 178 */
181int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) 179int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
182{ 180{
183 int err = 0; 181 unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT;
182 int err = 0, kernel_map = 0;
184 int i; 183 int i;
185 184
185 if (address >= __START_KERNEL_map
186 && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
187 address = (unsigned long)__va(__pa(address));
188 kernel_map = 1;
189 }
190
186 down_write(&init_mm.mmap_sem); 191 down_write(&init_mm.mmap_sem);
187 for (i = 0; i < numpages; i++, address += PAGE_SIZE) { 192 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
188 unsigned long pfn = __pa(address) >> PAGE_SHIFT; 193 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
189 194
190 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); 195 if (!kernel_map || pte_present(pfn_pte(0, prot))) {
191 if (err) 196 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
192 break; 197 if (err)
198 break;
199 }
193 /* Handle kernel mapping too which aliases part of the 200 /* Handle kernel mapping too which aliases part of the
194 * lowmem */ 201 * lowmem */
195 if (__pa(address) < KERNEL_TEXT_SIZE) { 202 if ((pfn >= phys_base_pfn) &&
203 ((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) {
196 unsigned long addr2; 204 unsigned long addr2;
197 pgprot_t prot2; 205 pgprot_t prot2;
198 addr2 = __START_KERNEL_map + __pa(address); 206 addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT);
199 /* Make sure the kernel mappings stay executable */ 207 /* Make sure the kernel mappings stay executable */
200 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); 208 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
201 err = __change_page_attr(addr2, pfn, prot2, 209 err = __change_page_attr(addr2, pfn, prot2,
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2efe215fc76..1e76bb0a727 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -419,19 +419,21 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
419 return -1; 419 return -1;
420 } 420 }
421 421
422 node_possible_map = nodes_parsed;
423
422 /* Finally register nodes */ 424 /* Finally register nodes */
423 for_each_node_mask(i, nodes_parsed) 425 for_each_node_mask(i, node_possible_map)
424 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 426 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
425 /* Try again in case setup_node_bootmem missed one due 427 /* Try again in case setup_node_bootmem missed one due
426 to missing bootmem */ 428 to missing bootmem */
427 for_each_node_mask(i, nodes_parsed) 429 for_each_node_mask(i, node_possible_map)
428 if (!node_online(i)) 430 if (!node_online(i))
429 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 431 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
430 432
431 for (i = 0; i < NR_CPUS; i++) { 433 for (i = 0; i < NR_CPUS; i++) {
432 if (cpu_to_node[i] == NUMA_NO_NODE) 434 if (cpu_to_node[i] == NUMA_NO_NODE)
433 continue; 435 continue;
434 if (!node_isset(cpu_to_node[i], nodes_parsed)) 436 if (!node_isset(cpu_to_node[i], node_possible_map))
435 numa_set_node(i, NUMA_NO_NODE); 437 numa_set_node(i, NUMA_NO_NODE);
436 } 438 }
437 numa_init_array(); 439 numa_init_array();
diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S
index ab6370054ce..4fbd66a52a8 100644
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -198,7 +198,7 @@ SECTIONS
198 __ftr_fixup : { *(__ftr_fixup) } 198 __ftr_fixup : { *(__ftr_fixup) }
199 __stop___ftr_fixup = .; 199 __stop___ftr_fixup = .;
200 200
201 . = ALIGN(32); 201 . = ALIGN(4096);
202 __per_cpu_start = .; 202 __per_cpu_start = .;
203 .data.percpu : { *(.data.percpu) } 203 .data.percpu : { *(.data.percpu) }
204 __per_cpu_end = .; 204 __per_cpu_end = .;