aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig23
-rw-r--r--arch/x86/Kconfig.cpu17
-rw-r--r--arch/x86/Makefile29
-rw-r--r--arch/x86/boot/Makefile53
-rw-r--r--arch/x86/boot/header.S29
-rw-r--r--arch/x86/boot/pm.c44
-rw-r--r--arch/x86/boot/pmjump.S1
-rw-r--r--arch/x86/boot/setup.ld3
-rw-r--r--arch/x86/boot/tools/build.c9
-rw-r--r--arch/x86/include/asm/apic.h15
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/boot.h4
-rw-r--r--arch/x86/include/asm/cacheflush.h53
-rwxr-xr-xarch/x86/include/asm/cpu_debug.h226
-rw-r--r--arch/x86/include/asm/desc.h3
-rw-r--r--arch/x86/include/asm/dmi.h14
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/include/asm/i387.h8
-rw-r--r--arch/x86/include/asm/init.h18
-rw-r--r--arch/x86/include/asm/io.h3
-rw-r--r--arch/x86/include/asm/io_apic.h5
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h5
-rw-r--r--arch/x86/include/asm/kexec.h13
-rw-r--r--arch/x86/include/asm/linkage.h19
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/mmzone_32.h43
-rw-r--r--arch/x86/include/asm/msidef.h1
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/page_32_types.h5
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h4
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_32.h3
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/sections.h7
-rw-r--r--arch/x86/include/asm/setup.h37
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h4
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/include/asm/xen/page.h1
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c17
-rw-r--r--arch/x86/kernel/apic/apic.c35
-rw-r--r--arch/x86/kernel/apic/io_apic.c268
-rw-r--r--arch/x86/kernel/apic/probe_64.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c6
-rw-r--r--arch/x86/kernel/check.c8
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c54
-rw-r--r--arch/x86/kernel/cpu/centaur.c36
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c37
-rw-r--r--arch/x86/kernel/cpu/common.c38
-rw-r--r--arch/x86/kernel/cpu/cpu.h11
-rwxr-xr-xarch/x86/kernel/cpu/cpu_debug.c901
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cyrix.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c32
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c62
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c1101
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c202
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c1069
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h4
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/umc.c2
-rw-r--r--arch/x86/kernel/ds.c3
-rw-r--r--arch/x86/kernel/e820.c132
-rw-r--r--arch/x86/kernel/early_printk.c20
-rw-r--r--arch/x86/kernel/efi.c7
-rw-r--r--arch/x86/kernel/efi_64.c21
-rw-r--r--arch/x86/kernel/entry_32.S18
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/head32.c5
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S76
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/irq.c67
-rw-r--r--arch/x86/kernel/irq_32.c29
-rw-r--r--arch/x86/kernel/irqinit_32.c3
-rw-r--r--arch/x86/kernel/irqinit_64.c3
-rw-r--r--arch/x86/kernel/kprobes.c3
-rw-r--r--arch/x86/kernel/machine_kexec_32.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c99
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/mpparse.c31
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/quirks.c3
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S24
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S189
-rw-r--r--arch/x86/kernel/setup.c63
-rw-r--r--arch/x86/kernel/setup_percpu.c355
-rw-r--r--arch/x86/kernel/smpboot.c78
-rw-r--r--arch/x86/kernel/tlb_uv.c5
-rw-r--r--arch/x86/kernel/tsc.c110
-rw-r--r--arch/x86/kernel/uv_time.c393
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S21
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S101
-rw-r--r--arch/x86/lguest/boot.c29
-rw-r--r--arch/x86/lib/memcpy_64.S143
-rw-r--r--arch/x86/math-emu/fpu_aux.c31
-rw-r--r--arch/x86/mm/highmem_32.c24
-rw-r--r--arch/x86/mm/init.c344
-rw-r--r--arch/x86/mm/init_32.c273
-rw-r--r--arch/x86/mm/init_64.c352
-rw-r--r--arch/x86/mm/iomap_32.c26
-rw-r--r--arch/x86/mm/ioremap.c71
-rw-r--r--arch/x86/mm/kmmio.c162
-rw-r--r--arch/x86/mm/memtest.c3
-rw-r--r--arch/x86/mm/numa_32.c5
-rw-r--r--arch/x86/mm/pageattr.c16
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/testmmiotrace.c70
-rw-r--r--arch/x86/mm/tlb.c5
-rw-r--r--arch/x86/pci/common.c4
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/xen/enlighten.c10
-rw-r--r--arch/x86/xen/mmu.c13
-rw-r--r--arch/x86/xen/smp.c6
137 files changed, 5994 insertions, 3001 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 469f3450bf8..34bc3a89228 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
138config HAVE_SETUP_PER_CPU_AREA 138config HAVE_SETUP_PER_CPU_AREA
139 def_bool y 139 def_bool y
140 140
141config HAVE_DYNAMIC_PER_CPU_AREA
142 def_bool y
143
141config HAVE_CPUMASK_OF_CPU_MAP 144config HAVE_CPUMASK_OF_CPU_MAP
142 def_bool X86_64_SMP 145 def_bool X86_64_SMP
143 146
@@ -166,6 +169,9 @@ config GENERIC_HARDIRQS
166 bool 169 bool
167 default y 170 default y
168 171
172config GENERIC_HARDIRQS_NO__DO_IRQ
173 def_bool y
174
169config GENERIC_IRQ_PROBE 175config GENERIC_IRQ_PROBE
170 bool 176 bool
171 default y 177 default y
@@ -780,6 +786,11 @@ config X86_MCE_AMD
780 Additional support for AMD specific MCE features such as 786 Additional support for AMD specific MCE features such as
781 the DRAM Error Threshold. 787 the DRAM Error Threshold.
782 788
789config X86_MCE_THRESHOLD
790 depends on X86_MCE_AMD || X86_MCE_INTEL
791 bool
792 default y
793
783config X86_MCE_NONFATAL 794config X86_MCE_NONFATAL
784 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 795 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
785 depends on X86_32 && X86_MCE 796 depends on X86_32 && X86_MCE
@@ -923,6 +934,12 @@ config X86_CPUID
923 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to 934 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
924 /dev/cpu/31/cpuid. 935 /dev/cpu/31/cpuid.
925 936
937config X86_CPU_DEBUG
938 tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
939 ---help---
940 If you select this option, this will provide various x86 CPUs
941 information through debugfs.
942
926choice 943choice
927 prompt "High Memory Support" 944 prompt "High Memory Support"
928 default HIGHMEM4G if !X86_NUMAQ 945 default HIGHMEM4G if !X86_NUMAQ
@@ -1115,7 +1132,7 @@ config NUMA_EMU
1115 1132
1116config NODES_SHIFT 1133config NODES_SHIFT
1117 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP 1134 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
1118 range 1 9 if X86_64 1135 range 1 9
1119 default "9" if MAXSMP 1136 default "9" if MAXSMP
1120 default "6" if X86_64 1137 default "6" if X86_64
1121 default "4" if X86_NUMAQ 1138 default "4" if X86_NUMAQ
@@ -1125,7 +1142,7 @@ config NODES_SHIFT
1125 Specify the maximum number of NUMA Nodes available on the target 1142 Specify the maximum number of NUMA Nodes available on the target
1126 system. Increases memory reserved to accomodate various tables. 1143 system. Increases memory reserved to accomodate various tables.
1127 1144
1128config HAVE_ARCH_BOOTMEM_NODE 1145config HAVE_ARCH_BOOTMEM
1129 def_bool y 1146 def_bool y
1130 depends on X86_32 && NUMA 1147 depends on X86_32 && NUMA
1131 1148
@@ -1423,7 +1440,7 @@ config CRASH_DUMP
1423config KEXEC_JUMP 1440config KEXEC_JUMP
1424 bool "kexec jump (EXPERIMENTAL)" 1441 bool "kexec jump (EXPERIMENTAL)"
1425 depends on EXPERIMENTAL 1442 depends on EXPERIMENTAL
1426 depends on KEXEC && HIBERNATION && X86_32 1443 depends on KEXEC && HIBERNATION
1427 ---help--- 1444 ---help---
1428 Jump between original kernel and kexeced kernel and invoke 1445 Jump between original kernel and kexeced kernel and invoke
1429 code in physical address mode via KEXEC 1446 code in physical address mode via KEXEC
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a95eaf0e582..924e156a85a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -456,24 +456,9 @@ config CPU_SUP_AMD
456 456
457 If unsure, say N. 457 If unsure, say N.
458 458
459config CPU_SUP_CENTAUR_32 459config CPU_SUP_CENTAUR
460 default y 460 default y
461 bool "Support Centaur processors" if PROCESSOR_SELECT 461 bool "Support Centaur processors" if PROCESSOR_SELECT
462 depends on !64BIT
463 ---help---
464 This enables detection, tunings and quirks for Centaur processors
465
466 You need this enabled if you want your kernel to run on a
467 Centaur CPU. Disabling this option on other types of CPUs
468 makes the kernel a tiny bit smaller. Disabling it on a Centaur
469 CPU might render the kernel unbootable.
470
471 If unsure, say N.
472
473config CPU_SUP_CENTAUR_64
474 default y
475 bool "Support Centaur processors" if PROCESSOR_SELECT
476 depends on 64BIT
477 ---help--- 462 ---help---
478 This enables detection, tunings and quirks for Centaur processors 463 This enables detection, tunings and quirks for Centaur processors
479 464
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1836191839e..f05d8c91d9e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,34 +153,23 @@ endif
153 153
154boot := arch/x86/boot 154boot := arch/x86/boot
155 155
156PHONY += zImage bzImage compressed zlilo bzlilo \ 156BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
157 zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install 157
158PHONY += bzImage $(BOOT_TARGETS)
158 159
159# Default kernel to build 160# Default kernel to build
160all: bzImage 161all: bzImage
161 162
162# KBUILD_IMAGE specify target image being built 163# KBUILD_IMAGE specify target image being built
163 KBUILD_IMAGE := $(boot)/bzImage 164KBUILD_IMAGE := $(boot)/bzImage
164zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
165 165
166zImage bzImage: vmlinux 166bzImage: vmlinux
167 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 167 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
168 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 168 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
169 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ 169 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
170 170
171compressed: zImage 171$(BOOT_TARGETS): vmlinux
172 172 $(Q)$(MAKE) $(build)=$(boot) $@
173zlilo bzlilo: vmlinux
174 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
175
176zdisk bzdisk: vmlinux
177 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
178
179fdimage fdimage144 fdimage288 isoimage: vmlinux
180 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
181
182install:
183 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
184 173
185PHONY += vdso_install 174PHONY += vdso_install
186vdso_install: 175vdso_install:
@@ -205,7 +194,3 @@ define archhelp
205 echo ' FDARGS="..." arguments for the booted kernel' 194 echo ' FDARGS="..." arguments for the booted kernel'
206 echo ' FDINITRD=file initrd for the booted kernel' 195 echo ' FDINITRD=file initrd for the booted kernel'
207endef 196endef
208
209CLEAN_FILES += arch/x86/boot/fdimage \
210 arch/x86/boot/image.iso \
211 arch/x86/boot/mtools.conf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index c70eff69a1f..fb737ce5888 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -6,26 +6,24 @@
6# for more details. 6# for more details.
7# 7#
8# Copyright (C) 1994 by Linus Torvalds 8# Copyright (C) 1994 by Linus Torvalds
9# Changed by many, many contributors over the years.
9# 10#
10 11
11# ROOT_DEV specifies the default root-device when making the image. 12# ROOT_DEV specifies the default root-device when making the image.
12# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case 13# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
13# the default of FLOPPY is used by 'build'. 14# the default of FLOPPY is used by 'build'.
14 15
15ROOT_DEV := CURRENT 16ROOT_DEV := CURRENT
16 17
17# If you want to preset the SVGA mode, uncomment the next line and 18# If you want to preset the SVGA mode, uncomment the next line and
18# set SVGA_MODE to whatever number you want. 19# set SVGA_MODE to whatever number you want.
19# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 20# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
20# The number is the same as you would ordinarily press at bootup. 21# The number is the same as you would ordinarily press at bootup.
21 22
22SVGA_MODE := -DSVGA_MODE=NORMAL_VGA 23SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
23 24
24# If you want the RAM disk device, define this to be the size in blocks. 25targets := vmlinux.bin setup.bin setup.elf bzImage
25 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
26#RAMDISK := -DRAMDISK=512
27
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed 27subdir- := compressed
30 28
31setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
@@ -71,17 +69,13 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
71KBUILD_CFLAGS += $(call cc-option,-m32) 69KBUILD_CFLAGS += $(call cc-option,-m32)
72KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 70KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
73 71
74$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 72$(obj)/bzImage: asflags-y := $(SVGA_MODE)
75$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
76$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
77$(obj)/bzImage: BUILDFLAGS := -b
78 73
79quiet_cmd_image = BUILD $@ 74quiet_cmd_image = BUILD $@
80cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \ 75cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
81 $(obj)/vmlinux.bin $(ROOT_DEV) > $@ 76 $(ROOT_DEV) > $@
82 77
83$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ 78$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
84 $(obj)/vmlinux.bin $(obj)/tools/build FORCE
85 $(call if_changed,image) 79 $(call if_changed,image)
86 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' 80 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
87 81
@@ -116,9 +110,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
116$(obj)/compressed/vmlinux: FORCE 110$(obj)/compressed/vmlinux: FORCE
117 $(Q)$(MAKE) $(build)=$(obj)/compressed $@ 111 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
118 112
119# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 113# Set this if you want to pass append arguments to the
114# bzdisk/fdimage/isoimage kernel
120FDARGS = 115FDARGS =
121# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel 116# Set this if you want an initrd included with the
117# bzdisk/fdimage/isoimage kernel
122FDINITRD = 118FDINITRD =
123 119
124image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) 120image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
@@ -127,7 +123,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.in
127 sed -e 's|@OBJ@|$(obj)|g' < $< > $@ 123 sed -e 's|@OBJ@|$(obj)|g' < $< > $@
128 124
129# This requires write access to /dev/fd0 125# This requires write access to /dev/fd0
130zdisk: $(BOOTIMAGE) $(obj)/mtools.conf 126bzdisk: $(obj)/bzImage $(obj)/mtools.conf
131 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync 127 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
132 syslinux /dev/fd0 ; sync 128 syslinux /dev/fd0 ; sync
133 echo '$(image_cmdline)' | \ 129 echo '$(image_cmdline)' | \
@@ -135,10 +131,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
135 if [ -f '$(FDINITRD)' ] ; then \ 131 if [ -f '$(FDINITRD)' ] ; then \
136 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ 132 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
137 fi 133 fi
138 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync 134 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
139 135
140# These require being root or having syslinux 2.02 or higher installed 136# These require being root or having syslinux 2.02 or higher installed
141fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf 137fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
142 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 138 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
143 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync 139 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
144 syslinux $(obj)/fdimage ; sync 140 syslinux $(obj)/fdimage ; sync
@@ -147,9 +143,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
147 if [ -f '$(FDINITRD)' ] ; then \ 143 if [ -f '$(FDINITRD)' ] ; then \
148 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ 144 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
149 fi 145 fi
150 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync 146 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
151 147
152fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf 148fdimage288: $(obj)/bzImage $(obj)/mtools.conf
153 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 149 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
154 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync 150 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
155 syslinux $(obj)/fdimage ; sync 151 syslinux $(obj)/fdimage ; sync
@@ -158,9 +154,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
158 if [ -f '$(FDINITRD)' ] ; then \ 154 if [ -f '$(FDINITRD)' ] ; then \
159 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ 155 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
160 fi 156 fi
161 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync 157 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
162 158
163isoimage: $(BOOTIMAGE) 159isoimage: $(obj)/bzImage
164 -rm -rf $(obj)/isoimage 160 -rm -rf $(obj)/isoimage
165 mkdir $(obj)/isoimage 161 mkdir $(obj)/isoimage
166 for i in lib lib64 share end ; do \ 162 for i in lib lib64 share end ; do \
@@ -170,7 +166,7 @@ isoimage: $(BOOTIMAGE)
170 fi ; \ 166 fi ; \
171 if [ $$i = end ] ; then exit 1 ; fi ; \ 167 if [ $$i = end ] ; then exit 1 ; fi ; \
172 done 168 done
173 cp $(BOOTIMAGE) $(obj)/isoimage/linux 169 cp $(obj)/bzImage $(obj)/isoimage/linux
174 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg 170 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
175 if [ -f '$(FDINITRD)' ] ; then \ 171 if [ -f '$(FDINITRD)' ] ; then \
176 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ 172 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
@@ -181,12 +177,13 @@ isoimage: $(BOOTIMAGE)
181 isohybrid $(obj)/image.iso 2>/dev/null || true 177 isohybrid $(obj)/image.iso 2>/dev/null || true
182 rm -rf $(obj)/isoimage 178 rm -rf $(obj)/isoimage
183 179
184zlilo: $(BOOTIMAGE) 180bzlilo: $(obj)/bzImage
185 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi 181 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
186 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi 182 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
187 cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz 183 cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
188 cp System.map $(INSTALL_PATH)/ 184 cp System.map $(INSTALL_PATH)/
189 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi 185 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
190 186
191install: 187install:
192 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" 188 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
189 System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 7ccff4884a2..5d84d1c74e4 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -24,12 +24,8 @@
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "offsets.h"
26 26
27SETUPSECTS = 4 /* default nr of setup-sectors */
28BOOTSEG = 0x07C0 /* original address of boot-sector */ 27BOOTSEG = 0x07C0 /* original address of boot-sector */
29SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ 28SYSSEG = 0x1000 /* historical load address >> 4 */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33 29
34#ifndef SVGA_MODE 30#ifndef SVGA_MODE
35#define SVGA_MODE ASK_VGA 31#define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
97 .section ".header", "a" 93 .section ".header", "a"
98 .globl hdr 94 .globl hdr
99hdr: 95hdr:
100setup_sects: .byte SETUPSECTS 96setup_sects: .byte 0 /* Filled in by build.c */
101root_flags: .word ROOT_RDONLY 97root_flags: .word ROOT_RDONLY
102syssize: .long SYSSIZE 98syssize: .long 0 /* Filled in by build.c */
103ram_size: .word RAMDISK 99ram_size: .word 0 /* Obsolete */
104vid_mode: .word SVGA_MODE 100vid_mode: .word SVGA_MODE
105root_dev: .word ROOT_DEV 101root_dev: .word 0 /* Filled in by build.c */
106boot_flag: .word 0xAA55 102boot_flag: .word 0xAA55
107 103
108 # offset 512, entry point 104 # offset 512, entry point
@@ -123,14 +119,15 @@ _start:
123 # or else old loadlin-1.5 will fail) 119 # or else old loadlin-1.5 will fail)
124 .globl realmode_swtch 120 .globl realmode_swtch
125realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
126start_sys_seg: .word SYSSEG 122start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
123 # in case something decided to "use" it
127 .word kernel_version-512 # pointing to kernel version string 124 .word kernel_version-512 # pointing to kernel version string
128 # above section of header is compatible 125 # above section of header is compatible
129 # with loadlin-1.5 (header v1.5). Don't 126 # with loadlin-1.5 (header v1.5). Don't
130 # change it. 127 # change it.
131 128
132type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, 129type_of_loader: .byte 0 # 0 means ancient bootloader, newer
133 # Bootlin, SYSLX, bootsect...) 130 # bootloaders know to change this.
134 # See Documentation/i386/boot.txt for 131 # See Documentation/i386/boot.txt for
135 # assigned ids 132 # assigned ids
136 133
@@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the loader also has set
142 # space behind setup.S can be used for 139 # space behind setup.S can be used for
143 # heap purposes. 140 # heap purposes.
144 # Only the loader knows what is free 141 # Only the loader knows what is free
145#ifndef __BIG_KERNEL__
146 .byte 0
147#else
148 .byte LOADED_HIGH 142 .byte LOADED_HIGH
149#endif
150 143
151setup_move_size: .word 0x8000 # size to move, when setup is not 144setup_move_size: .word 0x8000 # size to move, when setup is not
152 # loaded at 0x90000. We will move setup 145 # loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size to move, when setup is not
157 150
158code32_start: # here loaders can put a different 151code32_start: # here loaders can put a different
159 # start address for 32-bit code. 152 # start address for 32-bit code.
160#ifndef __BIG_KERNEL__
161 .long 0x1000 # 0x1000 = default for zImage
162#else
163 .long 0x100000 # 0x100000 = default for big kernel 153 .long 0x100000 # 0x100000 = default for big kernel
164#endif
165 154
166ramdisk_image: .long 0 # address of loaded ramdisk image 155ramdisk_image: .long 0 # address of loaded ramdisk image
167 # Here the loader puts the 32-bit 156 # Here the loader puts the 32-bit
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 85a1cd8a8ff..8062f891525 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -33,47 +33,6 @@ static void realmode_switch_hook(void)
33} 33}
34 34
35/* 35/*
36 * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
37 * A bzImage kernel is loaded and runs at 0x100000.
38 */
39static void move_kernel_around(void)
40{
41 /* Note: rely on the compile-time option here rather than
42 the LOADED_HIGH flag. The Qemu kernel loader unconditionally
43 sets the loadflags to zero. */
44#ifndef __BIG_KERNEL__
45 u16 dst_seg, src_seg;
46 u32 syssize;
47
48 dst_seg = 0x1000 >> 4;
49 src_seg = 0x10000 >> 4;
50 syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
51
52 while (syssize) {
53 int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
54 int dwords = paras << 2;
55
56 asm volatile("pushw %%es ; "
57 "pushw %%ds ; "
58 "movw %1,%%es ; "
59 "movw %2,%%ds ; "
60 "xorw %%di,%%di ; "
61 "xorw %%si,%%si ; "
62 "rep;movsl ; "
63 "popw %%ds ; "
64 "popw %%es"
65 : "+c" (dwords)
66 : "r" (dst_seg), "r" (src_seg)
67 : "esi", "edi");
68
69 syssize -= paras;
70 dst_seg += paras;
71 src_seg += paras;
72 }
73#endif
74}
75
76/*
77 * Disable all interrupts at the legacy PIC. 36 * Disable all interrupts at the legacy PIC.
78 */ 37 */
79static void mask_all_interrupts(void) 38static void mask_all_interrupts(void)
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
147 /* Hook before leaving real mode, also disables interrupts */ 106 /* Hook before leaving real mode, also disables interrupts */
148 realmode_switch_hook(); 107 realmode_switch_hook();
149 108
150 /* Move the kernel/setup to their final resting places */
151 move_kernel_around();
152
153 /* Enable the A20 gate */ 109 /* Enable the A20 gate */
154 if (enable_a20()) { 110 if (enable_a20()) {
155 puts("A20 gate not responding, unable to boot...\n"); 111 puts("A20 gate not responding, unable to boot...\n");
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index 019c17a7585..3e0edc6d2a2 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -47,6 +47,7 @@ GLOBAL(protected_mode_jump)
47ENDPROC(protected_mode_jump) 47ENDPROC(protected_mode_jump)
48 48
49 .code32 49 .code32
50 .section ".text32","ax"
50GLOBAL(in_pm32) 51GLOBAL(in_pm32)
51 # Set up data segments for flat 32-bit mode 52 # Set up data segments for flat 32-bit mode
52 movl %ecx, %ds 53 movl %ecx, %ds
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index df9234b3a5e..bb8dc2de796 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -17,7 +17,8 @@ SECTIONS
17 .header : { *(.header) } 17 .header : { *(.header) }
18 .inittext : { *(.inittext) } 18 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) } 19 .initdata : { *(.initdata) }
20 .text : { *(.text*) } 20 .text : { *(.text) }
21 .text32 : { *(.text32) }
21 22
22 . = ALIGN(16); 23 . = ALIGN(16);
23 .rodata : { *(.rodata*) } 24 .rodata : { *(.rodata*) }
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 44dc1923c0e..ee3a4ea923a 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
130 130
131static void usage(void) 131static void usage(void)
132{ 132{
133 die("Usage: build [-b] setup system [rootdev] [> image]"); 133 die("Usage: build setup system [rootdev] [> image]");
134} 134}
135 135
136int main(int argc, char ** argv) 136int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
145 void *kernel; 145 void *kernel;
146 u32 crc = 0xffffffffUL; 146 u32 crc = 0xffffffffUL;
147 147
148 if (argc > 2 && !strcmp(argv[1], "-b"))
149 {
150 is_big_kernel = 1;
151 argc--, argv++;
152 }
153 if ((argc < 3) || (argc > 4)) 148 if ((argc < 3) || (argc > 4))
154 usage(); 149 usage();
155 if (argc > 3) { 150 if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
216 die("Unable to mmap '%s': %m", argv[2]); 211 die("Unable to mmap '%s': %m", argv[2]);
217 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ 212 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
218 sys_size = (sz + 15 + 4) / 16; 213 sys_size = (sz + 15 + 4) / 16;
219 if (!is_big_kernel && sys_size > DEF_SYSSIZE)
220 die("System is too big. Try using bzImage or modules.");
221 214
222 /* Patch the setup code with the appropriate size parameters */ 215 /* Patch the setup code with the appropriate size parameters */
223 buf[0x1f1] = setup_sectors-1; 216 buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4ef949c1972..00f5962d82d 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -108,6 +108,16 @@ extern void native_apic_icr_write(u32 low, u32 id);
108extern u64 native_apic_icr_read(void); 108extern u64 native_apic_icr_read(void);
109 109
110#ifdef CONFIG_X86_X2APIC 110#ifdef CONFIG_X86_X2APIC
111/*
112 * Make previous memory operations globally visible before
113 * sending the IPI through x2apic wrmsr. We need a serializing instruction or
114 * mfence for this.
115 */
116static inline void x2apic_wrmsr_fence(void)
117{
118 asm volatile("mfence" : : : "memory");
119}
120
111static inline void native_apic_msr_write(u32 reg, u32 v) 121static inline void native_apic_msr_write(u32 reg, u32 v)
112{ 122{
113 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || 123 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -184,6 +194,9 @@ static inline int x2apic_enabled(void)
184{ 194{
185 return 0; 195 return 0;
186} 196}
197
198#define x2apic 0
199
187#endif 200#endif
188 201
189extern int get_physical_broadcast(void); 202extern int get_physical_broadcast(void);
@@ -379,6 +392,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
379 392
380static inline void ack_APIC_irq(void) 393static inline void ack_APIC_irq(void)
381{ 394{
395#ifdef CONFIG_X86_LOCAL_APIC
382 /* 396 /*
383 * ack_APIC_irq() actually gets compiled as a single instruction 397 * ack_APIC_irq() actually gets compiled as a single instruction
384 * ... yummie. 398 * ... yummie.
@@ -386,6 +400,7 @@ static inline void ack_APIC_irq(void)
386 400
387 /* Docs say use 0 for future compatibility */ 401 /* Docs say use 0 for future compatibility */
388 apic_write(APIC_EOI, 0); 402 apic_write(APIC_EOI, 0);
403#endif
389} 404}
390 405
391static inline unsigned default_get_apic_id(unsigned long x) 406static inline unsigned default_get_apic_id(unsigned long x)
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b..bc9514fb3b1 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
53#define APIC_ESR_SENDILL 0x00020 53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040 54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080 55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_LVTCMCI 0x2f0
56#define APIC_ICR 0x300 57#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000 58#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000 59#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6526cf08b0e..6ba23dd9fc9 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,10 +1,6 @@
1#ifndef _ASM_X86_BOOT_H 1#ifndef _ASM_X86_BOOT_H
2#define _ASM_X86_BOOT_H 2#define _ASM_X86_BOOT_H
3 3
4/* Don't touch these, unless you really know what you're doing. */
5#define DEF_SYSSEG 0x1000
6#define DEF_SYSSIZE 0x7F00
7
8/* Internal svga startup constants */ 4/* Internal svga startup constants */
9#define NORMAL_VGA 0xffff /* 80x25 mode */ 5#define NORMAL_VGA 0xffff /* 80x25 mode */
10#define EXTENDED_VGA 0xfffe /* 80x50 mode */ 6#define EXTENDED_VGA 0xfffe /* 80x50 mode */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb..5b301b7ff5f 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6 6
7/* Caches aren't brain-dead on the intel. */ 7/* Caches aren't brain-dead on the intel. */
8#define flush_cache_all() do { } while (0) 8static inline void flush_cache_all(void) { }
9#define flush_cache_mm(mm) do { } while (0) 9static inline void flush_cache_mm(struct mm_struct *mm) { }
10#define flush_cache_dup_mm(mm) do { } while (0) 10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11#define flush_cache_range(vma, start, end) do { } while (0) 11static inline void flush_cache_range(struct vm_area_struct *vma,
12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 12 unsigned long start, unsigned long end) { }
13#define flush_dcache_page(page) do { } while (0) 13static inline void flush_cache_page(struct vm_area_struct *vma,
14#define flush_dcache_mmap_lock(mapping) do { } while (0) 14 unsigned long vmaddr, unsigned long pfn) { }
15#define flush_dcache_mmap_unlock(mapping) do { } while (0) 15static inline void flush_dcache_page(struct page *page) { }
16#define flush_icache_range(start, end) do { } while (0) 16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17#define flush_icache_page(vma, pg) do { } while (0) 17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
18#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) 18static inline void flush_icache_range(unsigned long start,
19#define flush_cache_vmap(start, end) do { } while (0) 19 unsigned long end) { }
20#define flush_cache_vunmap(start, end) do { } while (0) 20static inline void flush_icache_page(struct vm_area_struct *vma,
21 struct page *page) { }
22static inline void flush_icache_user_range(struct vm_area_struct *vma,
23 struct page *page,
24 unsigned long addr,
25 unsigned long len) { }
26static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
27static inline void flush_cache_vunmap(unsigned long start,
28 unsigned long end) { }
21 29
22#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ 30static inline void copy_to_user_page(struct vm_area_struct *vma,
23 memcpy((dst), (src), (len)) 31 struct page *page, unsigned long vaddr,
24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ 32 void *dst, const void *src,
25 memcpy((dst), (src), (len)) 33 unsigned long len)
34{
35 memcpy(dst, src, len);
36}
37
38static inline void copy_from_user_page(struct vm_area_struct *vma,
39 struct page *page, unsigned long vaddr,
40 void *dst, const void *src,
41 unsigned long len)
42{
43 memcpy(dst, src, len);
44}
26 45
27#define PG_non_WB PG_arch_1 46#define PG_non_WB PG_arch_1
28PAGEFLAG(NonWB, non_WB) 47PAGEFLAG(NonWB, non_WB)
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755
index 00000000000..222802029fa
--- /dev/null
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -0,0 +1,226 @@
1#ifndef _ASM_X86_CPU_DEBUG_H
2#define _ASM_X86_CPU_DEBUG_H
3
4/*
5 * CPU x86 architecture debug
6 *
7 * Copyright(C) 2009 Jaswinder Singh Rajput
8 */
9
10/* Register flags */
11enum cpu_debug_bit {
12/* Model Specific Registers (MSRs) */
13 CPU_MC_BIT, /* Machine Check */
14 CPU_MONITOR_BIT, /* Monitor */
15 CPU_TIME_BIT, /* Time */
16 CPU_PMC_BIT, /* Performance Monitor */
17 CPU_PLATFORM_BIT, /* Platform */
18 CPU_APIC_BIT, /* APIC */
19 CPU_POWERON_BIT, /* Power-on */
20 CPU_CONTROL_BIT, /* Control */
21 CPU_FEATURES_BIT, /* Features control */
22 CPU_LBRANCH_BIT, /* Last Branch */
23 CPU_BIOS_BIT, /* BIOS */
24 CPU_FREQ_BIT, /* Frequency */
25 CPU_MTTR_BIT, /* MTRR */
26 CPU_PERF_BIT, /* Performance */
27 CPU_CACHE_BIT, /* Cache */
28 CPU_SYSENTER_BIT, /* Sysenter */
29 CPU_THERM_BIT, /* Thermal */
30 CPU_MISC_BIT, /* Miscellaneous */
31 CPU_DEBUG_BIT, /* Debug */
32 CPU_PAT_BIT, /* PAT */
33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */
36 CPU_VER_BIT, /* Version ID */
37 CPU_CONF_BIT, /* Configuration */
38 CPU_SMM_BIT, /* System mgmt mode */
39 CPU_SVM_BIT, /*Secure Virtual Machine*/
40 CPU_OSVM_BIT, /* OS-Visible Workaround*/
41/* Standard Registers */
42 CPU_TSS_BIT, /* Task Stack Segment */
43 CPU_CR_BIT, /* Control Registers */
44 CPU_DT_BIT, /* Descriptor Table */
45/* End of Registers flags */
46 CPU_REG_ALL_BIT, /* Select all Registers */
47};
48
49#define CPU_REG_ALL (~0) /* Select all Registers */
50
51#define CPU_MC (1 << CPU_MC_BIT)
52#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
53#define CPU_TIME (1 << CPU_TIME_BIT)
54#define CPU_PMC (1 << CPU_PMC_BIT)
55#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
56#define CPU_APIC (1 << CPU_APIC_BIT)
57#define CPU_POWERON (1 << CPU_POWERON_BIT)
58#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
59#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
60#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
61#define CPU_BIOS (1 << CPU_BIOS_BIT)
62#define CPU_FREQ (1 << CPU_FREQ_BIT)
63#define CPU_MTRR (1 << CPU_MTTR_BIT)
64#define CPU_PERF (1 << CPU_PERF_BIT)
65#define CPU_CACHE (1 << CPU_CACHE_BIT)
66#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
67#define CPU_THERM (1 << CPU_THERM_BIT)
68#define CPU_MISC (1 << CPU_MISC_BIT)
69#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
70#define CPU_PAT (1 << CPU_PAT_BIT)
71#define CPU_VMX (1 << CPU_VMX_BIT)
72#define CPU_CALL (1 << CPU_CALL_BIT)
73#define CPU_BASE (1 << CPU_BASE_BIT)
74#define CPU_VER (1 << CPU_VER_BIT)
75#define CPU_CONF (1 << CPU_CONF_BIT)
76#define CPU_SMM (1 << CPU_SMM_BIT)
77#define CPU_SVM (1 << CPU_SVM_BIT)
78#define CPU_OSVM (1 << CPU_OSVM_BIT)
79#define CPU_TSS (1 << CPU_TSS_BIT)
80#define CPU_CR (1 << CPU_CR_BIT)
81#define CPU_DT (1 << CPU_DT_BIT)
82
83/* Register file flags */
84enum cpu_file_bit {
85 CPU_INDEX_BIT, /* index */
86 CPU_VALUE_BIT, /* value */
87};
88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91/*
92 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
93 * -------------------------- ------------------------------------------
94 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
95 *
96 * 06_01 Pentium Pro
97 * 06_03, 06_05 Pentium II Xeon, Pentium II
98 * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
99 *
100 * 06_09, 060D Pentium M
101 *
102 * 06_0E Core Duo, Core Solo
103 *
104 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
105 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
106 * Pentium dual-core
107 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
108 *
109 * 06_1C Atom
110 *
111 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
112 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
113 *
114 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
115 * Pentium 4, Pentium D
116 */
117
118/* Register processors bits */
119enum cpu_processor_bit {
120 CPU_NONE,
121/* Intel */
122 CPU_INTEL_PENTIUM_BIT,
123 CPU_INTEL_P6_BIT,
124 CPU_INTEL_PENTIUM_M_BIT,
125 CPU_INTEL_CORE_BIT,
126 CPU_INTEL_CORE2_BIT,
127 CPU_INTEL_ATOM_BIT,
128 CPU_INTEL_XEON_P4_BIT,
129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
137};
138
139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
142#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
143#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
144#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
145#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
146#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
147
148#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
149#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
150#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
151#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
152#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
153#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
154#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
155#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
156#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
157#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
158#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
159#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
160#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
161#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
162#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
163#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
164#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
167
168/* Select all supported Intel CPUs */
169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188
189#define MAX_CPU_FILES 512
190
191struct cpu_private {
192 unsigned cpu;
193 unsigned type;
194 unsigned reg;
195 unsigned file;
196};
197
198struct cpu_debug_base {
199 char *name; /* Register name */
200 unsigned flag; /* Register flag */
201 unsigned write; /* Register write flag */
202};
203
204/*
205 * Currently it looks similar to cpu_debug_base but once we add more files
206 * cpu_file_base will go in different direction
207 */
208struct cpu_file_base {
209 char *name; /* Register file name */
210 unsigned flag; /* Register file flag */
211 unsigned write; /* Register write flag */
212};
213
214struct cpu_cpuX_base {
215 struct dentry *dentry; /* Register dentry */
216 int init; /* Register index file */
217};
218
219struct cpu_debug_range {
220 unsigned min; /* Register range min */
221 unsigned max; /* Register range max */
222 unsigned flag; /* Supported flags */
223 unsigned model; /* Supported models */
224};
225
226#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705f544..5623c50d67b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
91#define store_gdt(dtr) native_store_gdt(dtr) 91#define store_gdt(dtr) native_store_gdt(dtr)
92#define store_idt(dtr) native_store_idt(dtr) 92#define store_idt(dtr) native_store_idt(dtr)
93#define store_tr(tr) (tr = native_store_tr()) 93#define store_tr(tr) (tr = native_store_tr())
94#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
95 94
96#define load_TLS(t, cpu) native_load_tls(t, cpu) 95#define load_TLS(t, cpu) native_load_tls(t, cpu)
97#define set_ldt native_set_ldt 96#define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
112} 111}
113#endif /* CONFIG_PARAVIRT */ 112#endif /* CONFIG_PARAVIRT */
114 113
114#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
115
115static inline void native_write_idt_entry(gate_desc *idt, int entry, 116static inline void native_write_idt_entry(gate_desc *idt, int entry,
116 const gate_desc *gate) 117 const gate_desc *gate)
117{ 118{
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index bc68212c6bc..aa32f7e6c19 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -2,21 +2,11 @@
2#define _ASM_X86_DMI_H 2#define _ASM_X86_DMI_H
3 3
4#include <asm/io.h> 4#include <asm/io.h>
5#include <asm/setup.h>
5 6
6#define DMI_MAX_DATA 2048
7
8extern int dmi_alloc_index;
9extern char dmi_alloc_data[DMI_MAX_DATA];
10
11/* This is so early that there is no good way to allocate dynamic memory.
12 Allocate data in an BSS array. */
13static inline void *dmi_alloc(unsigned len) 7static inline void *dmi_alloc(unsigned len)
14{ 8{
15 int idx = dmi_alloc_index; 9 return extend_brk(len, sizeof(int));
16 if ((dmi_alloc_index + len) > DMI_MAX_DATA)
17 return NULL;
18 dmi_alloc_index += len;
19 return dmi_alloc_data + idx;
20} 10}
21 11
22/* Use early IO mappings for DMI because it's initialized early */ 12/* Use early IO mappings for DMI because it's initialized early */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b..edc90f23e70 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
37 37
38#else /* !CONFIG_X86_32 */ 38#else /* !CONFIG_X86_32 */
39 39
40#define MAX_EFI_IO_PAGES 100
41
42extern u64 efi_call0(void *fp); 40extern u64 efi_call0(void *fp);
43extern u64 efi_call1(void *fp, u64 arg1); 41extern u64 efi_call1(void *fp, u64 arg1);
44extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); 42extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae85..c2e6bedaf25 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
33 smp_invalidate_interrupt) 33 smp_invalidate_interrupt)
34#endif 34#endif
35 35
36BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
37
36/* 38/*
37 * every pentium local APIC has two 'local interrupts', with a 39 * every pentium local APIC has two 'local interrupts', with a
38 * soft-definable vector attached to both interrupts, one of 40 * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dca8f03da5b..63a79c77d22 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -24,9 +24,6 @@
24#include <asm/kmap_types.h> 24#include <asm/kmap_types.h>
25#else 25#else
26#include <asm/vsyscall.h> 26#include <asm/vsyscall.h>
27#ifdef CONFIG_EFI
28#include <asm/efi.h>
29#endif
30#endif 27#endif
31 28
32/* 29/*
@@ -92,13 +89,6 @@ enum fixed_addresses {
92 FIX_IO_APIC_BASE_0, 89 FIX_IO_APIC_BASE_0,
93 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, 90 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
94#endif 91#endif
95#ifdef CONFIG_X86_64
96#ifdef CONFIG_EFI
97 FIX_EFI_IO_MAP_LAST_PAGE,
98 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
99 + MAX_EFI_IO_PAGES - 1,
100#endif
101#endif
102#ifdef CONFIG_X86_VISWS_APIC 92#ifdef CONFIG_X86_VISWS_APIC
103 FIX_CO_CPU, /* Cobalt timer */ 93 FIX_CO_CPU, /* Cobalt timer */
104 FIX_CO_APIC, /* Cobalt APIC Redirection Table */ 94 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 176f058e715..039db6aa8e0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */
15#ifdef CONFIG_SMP 16#ifdef CONFIG_SMP
16 unsigned int irq_resched_count; 17 unsigned int irq_resched_count;
17 unsigned int irq_call_count; 18 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index bf9276bea66..014c2b85ae4 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic(struct page *page, enum km_type type);
64void kunmap_atomic(void *kvaddr, enum km_type type); 64void kunmap_atomic(void *kvaddr, enum km_type type);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
66struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
67 68
68#ifndef CONFIG_PARAVIRT 69#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 370e1c83bb4..b762ea49bd7 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void);
30extern void error_interrupt(void); 31extern void error_interrupt(void);
31extern void spurious_interrupt(void); 32extern void spurious_interrupt(void);
32extern void thermal_interrupt(void); 33extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c..71c9e518398 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
172 172
173#else /* CONFIG_X86_32 */ 173#else /* CONFIG_X86_32 */
174 174
175extern void finit(void); 175#ifdef CONFIG_MATH_EMULATION
176extern void finit_task(struct task_struct *tsk);
177#else
178static inline void finit_task(struct task_struct *tsk)
179{
180}
181#endif
176 182
177static inline void tolerant_fwait(void) 183static inline void tolerant_fwait(void)
178{ 184{
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 00000000000..36fb1a6a510
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_INIT_32_H
2#define _ASM_X86_INIT_32_H
3
4#ifdef CONFIG_X86_32
5extern void __init early_ioremap_page_table_range_init(void);
6#endif
7
8extern unsigned long __init
9kernel_physical_mapping_init(unsigned long start,
10 unsigned long end,
11 unsigned long page_size_mask);
12
13
14extern unsigned long __initdata e820_table_start;
15extern unsigned long __meminitdata e820_table_end;
16extern unsigned long __meminitdata e820_table_top;
17
18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00f..e5383e3d2f8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
172 172
173extern void iounmap(volatile void __iomem *addr); 173extern void iounmap(volatile void __iomem *addr);
174 174
175extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
176
177 175
178#ifdef CONFIG_X86_32 176#ifdef CONFIG_X86_32
179# include "io_32.h" 177# include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
198extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); 196extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
199extern void __iomem *early_memremap(unsigned long offset, unsigned long size); 197extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
200extern void early_iounmap(void __iomem *addr, unsigned long size); 198extern void early_iounmap(void __iomem *addr, unsigned long size);
201extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
202 199
203#define IO_SPACE_LIMIT 0xffff 200#define IO_SPACE_LIMIT 0xffff
204 201
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 59cb4a1317b..373cc2bbcad 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -162,7 +162,8 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq);
162extern void ioapic_init_mappings(void); 162extern void ioapic_init_mappings(void);
163 163
164#ifdef CONFIG_X86_64 164#ifdef CONFIG_X86_64
165extern int save_mask_IO_APIC_setup(void); 165extern int save_IO_APIC_setup(void);
166extern void mask_IO_APIC_setup(void);
166extern void restore_IO_APIC_setup(void); 167extern void restore_IO_APIC_setup(void);
167extern void reinit_intr_remapped_IO_APIC(int); 168extern void reinit_intr_remapped_IO_APIC(int);
168#endif 169#endif
@@ -172,7 +173,7 @@ extern void probe_nr_irqs_gsi(void);
172extern int setup_ioapic_entry(int apic, int irq, 173extern int setup_ioapic_entry(int apic, int irq,
173 struct IO_APIC_route_entry *entry, 174 struct IO_APIC_route_entry *entry,
174 unsigned int destination, int trigger, 175 unsigned int destination, int trigger,
175 int polarity, int vector); 176 int polarity, int vector, int pin);
176extern void ioapic_write_entry(int apic, int pin, 177extern void ioapic_write_entry(int apic, int pin,
177 struct IO_APIC_route_entry e); 178 struct IO_APIC_route_entry e);
178#else /* !CONFIG_X86_IO_APIC */ 179#else /* !CONFIG_X86_IO_APIC */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb219669..f38481bcd45 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
36extern void fixup_irqs(void); 36extern void fixup_irqs(void);
37#endif 37#endif
38 38
39extern void (*generic_interrupt_extension)(void);
39extern void init_IRQ(void); 40extern void init_IRQ(void);
40extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
41extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 20e1fd588db..0396760fccb 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H 1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H 2#define _ASM_X86_IRQ_REMAPPING_H
3 3
4extern int x2apic;
5
6#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
7 5
8#endif /* _ASM_X86_IRQ_REMAPPING_H */ 6#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8..3cbd79bbb47 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
112#define LOCAL_PERF_VECTOR 0xee 112#define LOCAL_PERF_VECTOR 0xee
113 113
114/* 114/*
115 * Generic system vector for platform specific use
116 */
117#define GENERIC_INTERRUPT_VECTOR 0xed
118
119/*
115 * First APIC vector available to drivers: (vectors 0x30-0xee) we 120 * First APIC vector available to drivers: (vectors 0x30-0xee) we
116 * start at 0x31(0x41) to spread out vectors evenly between priority 121 * start at 0x31(0x41) to spread out vectors evenly between priority
117 * levels. (0x80 is the syscall vector) 122 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed3..317ff1703d0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
9# define PAGES_NR 4 9# define PAGES_NR 4
10#else 10#else
11# define PA_CONTROL_PAGE 0 11# define PA_CONTROL_PAGE 0
12# define PA_TABLE_PAGE 1 12# define VA_CONTROL_PAGE 1
13# define PAGES_NR 2 13# define PA_TABLE_PAGE 2
14# define PA_SWAP_PAGE 3
15# define PAGES_NR 4
14#endif 16#endif
15 17
16#ifdef CONFIG_X86_32
17# define KEXEC_CONTROL_CODE_MAX_SIZE 2048 18# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
18#endif
19 19
20#ifndef __ASSEMBLY__ 20#ifndef __ASSEMBLY__
21 21
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
136 unsigned int has_pae, 136 unsigned int has_pae,
137 unsigned int preserve_context); 137 unsigned int preserve_context);
138#else 138#else
139NORET_TYPE void 139unsigned long
140relocate_kernel(unsigned long indirection_page, 140relocate_kernel(unsigned long indirection_page,
141 unsigned long page_list, 141 unsigned long page_list,
142 unsigned long start_address) ATTRIB_NORET; 142 unsigned long start_address,
143 unsigned int preserve_context);
143#endif 144#endif
144 145
145#define ARCH_HAS_KIMAGE_ARCH 146#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26..12d55e773eb 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,14 +1,11 @@
1#ifndef _ASM_X86_LINKAGE_H 1#ifndef _ASM_X86_LINKAGE_H
2#define _ASM_X86_LINKAGE_H 2#define _ASM_X86_LINKAGE_H
3 3
4#include <linux/stringify.h>
5
4#undef notrace 6#undef notrace
5#define notrace __attribute__((no_instrument_function)) 7#define notrace __attribute__((no_instrument_function))
6 8
7#ifdef CONFIG_X86_64
8#define __ALIGN .p2align 4,,15
9#define __ALIGN_STR ".p2align 4,,15"
10#endif
11
12#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
13#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
14/* 11/*
@@ -50,16 +47,20 @@
50 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ 47 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
51 "g" (arg4), "g" (arg5), "g" (arg6)) 48 "g" (arg4), "g" (arg5), "g" (arg6))
52 49
53#endif 50#endif /* CONFIG_X86_32 */
51
52#ifdef __ASSEMBLY__
54 53
55#define GLOBAL(name) \ 54#define GLOBAL(name) \
56 .globl name; \ 55 .globl name; \
57 name: 56 name:
58 57
59#ifdef CONFIG_X86_ALIGNMENT_16 58#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
60#define __ALIGN .align 16,0x90 59#define __ALIGN .p2align 4, 0x90
61#define __ALIGN_STR ".align 16,0x90" 60#define __ALIGN_STR __stringify(__ALIGN)
62#endif 61#endif
63 62
63#endif /* __ASSEMBLY__ */
64
64#endif /* _ASM_X86_LINKAGE_H */ 65#endif /* _ASM_X86_LINKAGE_H */
65 66
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960..563933e06a3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
11 */ 11 */
12 12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
14 16
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
90 92
91#include <asm/atomic.h> 93#include <asm/atomic.h>
92 94
95void mce_setup(struct mce *m);
93void mce_log(struct mce *m); 96void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce); 97DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96 99
100/*
101 * To support more than 128 would need to escape the predefined
102 * Linux defined extended banks first.
103 */
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105
97#ifdef CONFIG_X86_MCE_INTEL 106#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c); 107void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void);
109void cmci_reenable(void);
110void cmci_rediscover(int dying);
111void cmci_recheck(void);
99#else 112#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } 113static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
114static inline void cmci_clear(void) {}
115static inline void cmci_reenable(void) {}
116static inline void cmci_rediscover(int dying) {}
117static inline void cmci_recheck(void) {}
101#endif 118#endif
102 119
103#ifdef CONFIG_X86_MCE_AMD 120#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif 124#endif
108 125
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status); 126extern int mce_available(struct cpuinfo_x86 *c);
127
128void mce_log_therm_throt_event(__u64 status);
110 129
111extern atomic_t mce_entry; 130extern atomic_t mce_entry;
112 131
113extern void do_machine_check(struct pt_regs *, long); 132extern void do_machine_check(struct pt_regs *, long);
133
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
136
137enum mcp_flags {
138 MCP_TIMESTAMP = (1 << 0), /* log time stamp */
139 MCP_UC = (1 << 1), /* log uncorrected errors */
140};
141extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
142
114extern int mce_notify_user(void); 143extern int mce_notify_user(void);
115 144
116#endif /* !CONFIG_X86_32 */ 145#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
120#else 149#else
121#define mcheck_init(c) do { } while (0) 150#define mcheck_init(c) do { } while (0)
122#endif 151#endif
123extern void stop_mce(void); 152
124extern void restart_mce(void); 153extern void (*mce_threshold_vector)(void);
125 154
126#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
127#endif /* _ASM_X86_MCE_H */ 156#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a063..ede6998bd92 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
91#endif /* CONFIG_DISCONTIGMEM */ 91#endif /* CONFIG_DISCONTIGMEM */
92 92
93#ifdef CONFIG_NEED_MULTIPLE_NODES 93#ifdef CONFIG_NEED_MULTIPLE_NODES
94 94/* always use node 0 for bootmem on this numa platform */
95/* 95#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \
96 * Following are macros that are specific to this numa platform. 96 (NODE_DATA(0)->bdata)
97 */
98#define reserve_bootmem(addr, size, flags) \
99 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
100#define alloc_bootmem(x) \
101 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_nopanic(x) \
103 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
104 __pa(MAX_DMA_ADDRESS))
105#define alloc_bootmem_low(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
107#define alloc_bootmem_pages(x) \
108 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
109#define alloc_bootmem_pages_nopanic(x) \
110 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
111 __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_low_pages(x) \
113 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
114#define alloc_bootmem_node(pgdat, x) \
115({ \
116 struct pglist_data __maybe_unused \
117 *__alloc_bootmem_node__pgdat = (pgdat); \
118 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
119 __pa(MAX_DMA_ADDRESS)); \
120})
121#define alloc_bootmem_pages_node(pgdat, x) \
122({ \
123 struct pglist_data __maybe_unused \
124 *__alloc_bootmem_node__pgdat = (pgdat); \
125 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
126 __pa(MAX_DMA_ADDRESS)); \
127})
128#define alloc_bootmem_low_pages_node(pgdat, x) \
129({ \
130 struct pglist_data __maybe_unused \
131 *__alloc_bootmem_node__pgdat = (pgdat); \
132 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
133})
134#endif /* CONFIG_NEED_MULTIPLE_NODES */ 97#endif /* CONFIG_NEED_MULTIPLE_NODES */
135 98
136#endif /* _ASM_X86_MMZONE_32_H */ 99#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index 6706b3006f1..4cc48af23fe 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -47,6 +47,7 @@
47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ 48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
49 MSI_ADDR_DEST_ID_MASK) 49 MSI_ADDR_DEST_ID_MASK)
50#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
50 51
51#define MSI_ADDR_IR_EXT_INT (1 << 4) 52#define MSI_ADDR_IR_EXT_INT (1 << 4)
52#define MSI_ADDR_IR_SHV (1 << 3) 53#define MSI_ADDR_IR_SHV (1 << 3)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae0..2dbd2314139 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
77#define MSR_IA32_MC0_ADDR 0x00000402 77#define MSR_IA32_MC0_ADDR 0x00000402
78#define MSR_IA32_MC0_MISC 0x00000403 78#define MSR_IA32_MC0_MISC 0x00000403
79 79
80/* These are consecutive and not in the normal 4er MCE bank block */
81#define MSR_IA32_MC0_CTL2 0x00000280
82#define CMCI_EN (1ULL << 30)
83#define CMCI_THRESHOLD_MASK 0xffffULL
84
80#define MSR_P6_PERFCTR0 0x000000c1 85#define MSR_P6_PERFCTR0 0x000000c1
81#define MSR_P6_PERFCTR1 0x000000c2 86#define MSR_P6_PERFCTR1 0x000000c2
82#define MSR_P6_EVNTSEL0 0x00000186 87#define MSR_P6_EVNTSEL0 0x00000186
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f1e4a79a6e4..0f915ae649a 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
39#define __VIRTUAL_MASK_SHIFT 32 39#define __VIRTUAL_MASK_SHIFT 32
40#endif /* CONFIG_X86_PAE */ 40#endif /* CONFIG_X86_PAE */
41 41
42/*
43 * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
44 */
45#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
46
42#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
43 48
44/* 49/*
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603..826ad37006a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43struct pgprot;
44
45extern int page_is_ram(unsigned long pagenr); 43extern int page_is_ram(unsigned long pagenr);
46extern int devmem_is_allowed(unsigned long pagenr); 44extern int devmem_is_allowed(unsigned long pagenr);
47extern void map_devmem(unsigned long pfn, unsigned long size,
48 struct pgprot vma_prot);
49extern void unmap_devmem(unsigned long pfn, unsigned long size,
50 struct pgprot vma_prot);
51 45
52extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
53extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5cc971..31fe83b10a4 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -389,7 +389,7 @@ extern struct pv_lock_ops pv_lock_ops;
389 389
390#define paravirt_type(op) \ 390#define paravirt_type(op) \
391 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ 391 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
392 [paravirt_opptr] "m" (op) 392 [paravirt_opptr] "i" (&(op))
393#define paravirt_clobber(clobber) \ 393#define paravirt_clobber(clobber) \
394 [paravirt_clobber] "i" (clobber) 394 [paravirt_clobber] "i" (clobber)
395 395
@@ -443,7 +443,7 @@ int paravirt_disable_iospace(void);
443 * offset into the paravirt_patch_template structure, and can therefore be 443 * offset into the paravirt_patch_template structure, and can therefore be
444 * freely converted back into a structure offset. 444 * freely converted back into a structure offset.
445 */ 445 */
446#define PARAVIRT_CALL "call *%[paravirt_opptr];" 446#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
447 447
448/* 448/*
449 * These macros are intended to wrap calls through one of the paravirt 449 * These macros are intended to wrap calls through one of the paravirt
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838..2cd07b9422f 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAT_H 2#define _ASM_X86_PAT_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/pgtable_types.h>
5 6
6#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
7extern int pat_enabled; 8extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
17 18
18extern int kernel_map_sync_memtype(u64 base, unsigned long size, 19extern int kernel_map_sync_memtype(u64 base, unsigned long size,
19 unsigned long flag); 20 unsigned long flag);
21extern void map_devmem(unsigned long pfn, unsigned long size,
22 struct pgprot vma_prot);
23extern void unmap_devmem(unsigned long pfn, unsigned long size,
24 struct pgprot vma_prot);
20 25
21#endif /* _ASM_X86_PAT_H */ 26#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a666..d0812e155f1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
288 return 1; 288 return 1;
289} 289}
290 290
291pmd_t *populate_extra_pmd(unsigned long vaddr);
292pte_t *populate_extra_pte(unsigned long vaddr);
291#endif /* __ASSEMBLY__ */ 293#endif /* __ASSEMBLY__ */
292 294
293#ifdef CONFIG_X86_32 295#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 97612fc7632..31bd120cf2a 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
42 */ 42 */
43#undef TEST_ACCESS_OK 43#undef TEST_ACCESS_OK
44 44
45/* The boot page tables (all created as a single array) */
46extern unsigned long pg0[];
47
48#ifdef CONFIG_X86_PAE 45#ifdef CONFIG_X86_PAE
49# include <asm/pgtable-3level.h> 46# include <asm/pgtable-3level.h>
50#else 47#else
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe0..2733fad45f9 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
25 * area for the same reason. ;) 25 * area for the same reason. ;)
26 */ 26 */
27#define VMALLOC_OFFSET (8 * 1024 * 1024) 27#define VMALLOC_OFFSET (8 * 1024 * 1024)
28
29#ifndef __ASSEMBLER__
30extern bool __vmalloc_start_set; /* set once high_memory is set */
31#endif
32
28#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) 33#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
29#ifdef CONFIG_X86_PAE 34#ifdef CONFIG_X86_PAE
30#define LAST_PKMAP 512 35#define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0..b8238dc8786 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
273 273
274extern pteval_t __supported_pte_mask; 274extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 275extern int nx_enabled;
276extern void set_nx(void);
276 277
277#define pgprot_writecombine pgprot_writecombine 278#define pgprot_writecombine pgprot_writecombine
278extern pgprot_t pgprot_writecombine(pgprot_t prot); 279extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index dccef5be0fc..ae85a8d66a3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,9 +75,9 @@ struct cpuinfo_x86 {
75#else 75#else
76 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ 76 /* Number of 4K pages in DTLB/ITLB combined(in pages): */
77 int x86_tlbsize; 77 int x86_tlbsize;
78#endif
78 __u8 x86_virt_bits; 79 __u8 x86_virt_bits;
79 __u8 x86_phys_bits; 80 __u8 x86_phys_bits;
80#endif
81 /* CPUID returned core id bits: */ 81 /* CPUID returned core id bits: */
82 __u8 x86_coreid_bits; 82 __u8 x86_coreid_bits;
83 /* Max extended CPUID function supported: */ 83 /* Max extended CPUID function supported: */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 2b8c5160388..1b7ee5d673c 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -1 +1,8 @@
1#ifndef _ASM_X86_SECTIONS_H
2#define _ASM_X86_SECTIONS_H
3
1#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5
6extern char __brk_base[], __brk_limit[];
7
8#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 05c6f6b11fd..fbf0521eeed 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,20 +100,51 @@ extern struct boot_params boot_params;
100 */ 100 */
101#define LOWMEMSIZE() (0x9f000) 101#define LOWMEMSIZE() (0x9f000)
102 102
103/* exceedingly early brk-like allocator */
104extern unsigned long _brk_end;
105void *extend_brk(size_t size, size_t align);
106
107/*
108 * Reserve space in the brk section. The name must be unique within
109 * the file, and somewhat descriptive. The size is in bytes. Must be
110 * used at file scope.
111 *
112 * (This uses a temp function to wrap the asm so we can pass it the
113 * size parameter; otherwise we wouldn't be able to. We can't use a
114 * "section" attribute on a normal variable because it always ends up
115 * being @progbits, which ends up allocating space in the vmlinux
116 * executable.)
117 */
118#define RESERVE_BRK(name,sz) \
119 static void __section(.discard) __used \
120 __brk_reservation_fn_##name##__(void) { \
121 asm volatile ( \
122 ".pushsection .brk_reservation,\"aw\",@nobits;" \
123 ".brk." #name ":" \
124 " 1:.skip %c0;" \
125 " .size .brk." #name ", . - 1b;" \
126 " .popsection" \
127 : : "i" (sz)); \
128 }
129
103#ifdef __i386__ 130#ifdef __i386__
104 131
105void __init i386_start_kernel(void); 132void __init i386_start_kernel(void);
106extern void probe_roms(void); 133extern void probe_roms(void);
107 134
108extern unsigned long init_pg_tables_start;
109extern unsigned long init_pg_tables_end;
110
111#else 135#else
112void __init x86_64_start_kernel(char *real_mode); 136void __init x86_64_start_kernel(char *real_mode);
113void __init x86_64_start_reservations(char *real_mode_data); 137void __init x86_64_start_reservations(char *real_mode_data);
114 138
115#endif /* __i386__ */ 139#endif /* __i386__ */
116#endif /* _SETUP */ 140#endif /* _SETUP */
141#else
142#define RESERVE_BRK(name,sz) \
143 .pushsection .brk_reservation,"aw",@nobits; \
144.brk.name: \
1451: .skip sz; \
146 .size .brk.name,.-1b; \
147 .popsection
117#endif /* __ASSEMBLY__ */ 148#endif /* __ASSEMBLY__ */
118#endif /* __KERNEL__ */ 149#endif /* __KERNEL__ */
119 150
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c..9f4dfba33b2 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ 199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ 200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
201 201
202/* Loop through all installed blades */
203#define for_each_possible_blade(bid) \
204 for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
205
202/* 206/*
203 * Macros for converting between kernel virtual addresses, socket local physical 207 * Macros for converting between kernel virtual addresses, socket local physical
204 * addresses, and UV global physical addresses. 208 * addresses, and UV global physical addresses.
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5e79ca69432..9c371e4a9fa 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg)
296static inline int 296static inline int
297HYPERVISOR_update_descriptor(u64 ma, u64 desc) 297HYPERVISOR_update_descriptor(u64 ma, u64 desc)
298{ 298{
299 if (sizeof(u64) == sizeof(long))
300 return _hypercall2(int, update_descriptor, ma, desc);
299 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); 301 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
300} 302}
301 303
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43d..1a918dde46b 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
164 164
165 165
166xmaddr_t arbitrary_virt_to_machine(void *address); 166xmaddr_t arbitrary_virt_to_machine(void *address);
167unsigned long arbitrary_virt_to_mfn(void *vaddr);
167void make_lowmem_page_readonly(void *vaddr); 168void make_lowmem_page_readonly(void *vaddr);
168void make_lowmem_page_readwrite(void *vaddr); 169void make_lowmem_page_readwrite(void *vaddr);
169 170
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216bbfaf..339ce35648e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
111### 111###
112# 64 bit specific files 112# 64 bit specific files
113ifeq ($(CONFIG_X86_64),y) 113ifeq ($(CONFIG_X86_64),y)
114 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o 114 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
115 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 115 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
116 obj-$(CONFIG_AUDIT) += audit_64.o 116 obj-$(CONFIG_AUDIT) += audit_64.o
117 117
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d5..4c80f155743 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
414 that might execute the to be patched code. 414 that might execute the to be patched code.
415 Other CPUs are not running. */ 415 Other CPUs are not running. */
416 stop_nmi(); 416 stop_nmi();
417#ifdef CONFIG_X86_MCE 417
418 stop_mce(); 418 /*
419#endif 419 * Don't stop machine check exceptions while patching.
420 * MCEs only happen when something got corrupted and in this
421 * case we must do something about the corruption.
422 * Ignoring it is worse than a unlikely patching race.
423 * Also machine checks tend to be broadcast and if one CPU
424 * goes into machine check the others follow quickly, so we don't
425 * expect a machine check to cause undue problems during to code
426 * patching.
427 */
420 428
421 apply_alternatives(__alt_instructions, __alt_instructions_end); 429 apply_alternatives(__alt_instructions, __alt_instructions_end);
422 430
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
456 (unsigned long)__smp_locks_end); 464 (unsigned long)__smp_locks_end);
457 465
458 restart_nmi(); 466 restart_nmi();
459#ifdef CONFIG_X86_MCE
460 restart_mce();
461#endif
462} 467}
463 468
464/** 469/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c..85eb8e10081 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/mtrr.h> 47#include <asm/mtrr.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/mce.h>
49 50
50unsigned int num_processors; 51unsigned int num_processors;
51 52
@@ -808,7 +809,7 @@ void clear_local_APIC(void)
808 u32 v; 809 u32 v;
809 810
810 /* APIC hasn't been mapped yet */ 811 /* APIC hasn't been mapped yet */
811 if (!apic_phys) 812 if (!x2apic && !apic_phys)
812 return; 813 return;
813 814
814 maxlvt = lapic_get_maxlvt(); 815 maxlvt = lapic_get_maxlvt();
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
842 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 843 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
843 } 844 }
844#endif 845#endif
846#ifdef CONFIG_X86_MCE_INTEL
847 if (maxlvt >= 6) {
848 v = apic_read(APIC_LVTCMCI);
849 if (!(v & APIC_LVT_MASKED))
850 apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
851 }
852#endif
853
845 /* 854 /*
846 * Clean APIC state for other OSs: 855 * Clean APIC state for other OSs:
847 */ 856 */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
1241 apic_write(APIC_LVT1, value); 1250 apic_write(APIC_LVT1, value);
1242 1251
1243 preempt_enable(); 1252 preempt_enable();
1253
1254#ifdef CONFIG_X86_MCE_INTEL
1255 /* Recheck CMCI information after local APIC is up on CPU #0 */
1256 if (smp_processor_id() == 0)
1257 cmci_recheck();
1258#endif
1244} 1259}
1245 1260
1246void __cpuinit end_local_APIC_setup(void) 1261void __cpuinit end_local_APIC_setup(void)
@@ -1319,15 +1334,16 @@ void __init enable_IR_x2apic(void)
1319 return; 1334 return;
1320 } 1335 }
1321 1336
1322 local_irq_save(flags); 1337 ret = save_IO_APIC_setup();
1323 mask_8259A();
1324
1325 ret = save_mask_IO_APIC_setup();
1326 if (ret) { 1338 if (ret) {
1327 pr_info("Saving IO-APIC state failed: %d\n", ret); 1339 pr_info("Saving IO-APIC state failed: %d\n", ret);
1328 goto end; 1340 goto end;
1329 } 1341 }
1330 1342
1343 local_irq_save(flags);
1344 mask_IO_APIC_setup();
1345 mask_8259A();
1346
1331 ret = enable_intr_remapping(1); 1347 ret = enable_intr_remapping(1);
1332 1348
1333 if (ret && x2apic_preenabled) { 1349 if (ret && x2apic_preenabled) {
@@ -1352,10 +1368,10 @@ end_restore:
1352 else 1368 else
1353 reinit_intr_remapped_IO_APIC(x2apic_preenabled); 1369 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1354 1370
1355end:
1356 unmask_8259A(); 1371 unmask_8259A();
1357 local_irq_restore(flags); 1372 local_irq_restore(flags);
1358 1373
1374end:
1359 if (!ret) { 1375 if (!ret) {
1360 if (!x2apic_preenabled) 1376 if (!x2apic_preenabled)
1361 pr_info("Enabled x2apic and interrupt-remapping\n"); 1377 pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1508,12 +1524,10 @@ void __init early_init_lapic_mapping(void)
1508 */ 1524 */
1509void __init init_apic_mappings(void) 1525void __init init_apic_mappings(void)
1510{ 1526{
1511#ifdef CONFIG_X86_X2APIC
1512 if (x2apic) { 1527 if (x2apic) {
1513 boot_cpu_physical_apicid = read_apic_id(); 1528 boot_cpu_physical_apicid = read_apic_id();
1514 return; 1529 return;
1515 } 1530 }
1516#endif
1517 1531
1518 /* 1532 /*
1519 * If no local APIC can be found then set up a fake all 1533 * If no local APIC can be found then set up a fake all
@@ -1957,12 +1971,9 @@ static int lapic_resume(struct sys_device *dev)
1957 1971
1958 local_irq_save(flags); 1972 local_irq_save(flags);
1959 1973
1960#ifdef CONFIG_X86_X2APIC
1961 if (x2apic) 1974 if (x2apic)
1962 enable_x2apic(); 1975 enable_x2apic();
1963 else 1976 else {
1964#endif
1965 {
1966 /* 1977 /*
1967 * Make sure the APICBASE points to the right address 1978 * Make sure the APICBASE points to the right address
1968 * 1979 *
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00e6071cefc..42cdc78427a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -389,6 +389,8 @@ struct io_apic {
389 unsigned int index; 389 unsigned int index;
390 unsigned int unused[3]; 390 unsigned int unused[3];
391 unsigned int data; 391 unsigned int data;
392 unsigned int unused2[11];
393 unsigned int eoi;
392}; 394};
393 395
394static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 396static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
397 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 399 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
398} 400}
399 401
402static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
403{
404 struct io_apic __iomem *io_apic = io_apic_base(apic);
405 writel(vector, &io_apic->eoi);
406}
407
400static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 408static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
401{ 409{
402 struct io_apic __iomem *io_apic = io_apic_base(apic); 410 struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -546,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
546 554
547 apic = entry->apic; 555 apic = entry->apic;
548 pin = entry->pin; 556 pin = entry->pin;
549#ifdef CONFIG_INTR_REMAP
550 /* 557 /*
551 * With interrupt-remapping, destination information comes 558 * With interrupt-remapping, destination information comes
552 * from interrupt-remapping table entry. 559 * from interrupt-remapping table entry.
553 */ 560 */
554 if (!irq_remapped(irq)) 561 if (!irq_remapped(irq))
555 io_apic_write(apic, 0x11 + pin*2, dest); 562 io_apic_write(apic, 0x11 + pin*2, dest);
556#else
557 io_apic_write(apic, 0x11 + pin*2, dest);
558#endif
559 reg = io_apic_read(apic, 0x10 + pin*2); 563 reg = io_apic_read(apic, 0x10 + pin*2);
560 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
561 reg |= vector; 565 reg |= vector;
@@ -849,9 +853,9 @@ __setup("pirq=", ioapic_pirq_setup);
849static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; 853static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
850 854
851/* 855/*
852 * Saves and masks all the unmasked IO-APIC RTE's 856 * Saves all the IO-APIC RTE's
853 */ 857 */
854int save_mask_IO_APIC_setup(void) 858int save_IO_APIC_setup(void)
855{ 859{
856 union IO_APIC_reg_01 reg_01; 860 union IO_APIC_reg_01 reg_01;
857 unsigned long flags; 861 unsigned long flags;
@@ -876,16 +880,9 @@ int save_mask_IO_APIC_setup(void)
876 } 880 }
877 881
878 for (apic = 0; apic < nr_ioapics; apic++) 882 for (apic = 0; apic < nr_ioapics; apic++)
879 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 883 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
880 struct IO_APIC_route_entry entry; 884 early_ioapic_entries[apic][pin] =
881
882 entry = early_ioapic_entries[apic][pin] =
883 ioapic_read_entry(apic, pin); 885 ioapic_read_entry(apic, pin);
884 if (!entry.mask) {
885 entry.mask = 1;
886 ioapic_write_entry(apic, pin, entry);
887 }
888 }
889 886
890 return 0; 887 return 0;
891 888
@@ -898,6 +895,25 @@ nomem:
898 return -ENOMEM; 895 return -ENOMEM;
899} 896}
900 897
898void mask_IO_APIC_setup(void)
899{
900 int apic, pin;
901
902 for (apic = 0; apic < nr_ioapics; apic++) {
903 if (!early_ioapic_entries[apic])
904 break;
905 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
906 struct IO_APIC_route_entry entry;
907
908 entry = early_ioapic_entries[apic][pin];
909 if (!entry.mask) {
910 entry.mask = 1;
911 ioapic_write_entry(apic, pin, entry);
912 }
913 }
914 }
915}
916
901void restore_IO_APIC_setup(void) 917void restore_IO_APIC_setup(void)
902{ 918{
903 int apic, pin; 919 int apic, pin;
@@ -1411,9 +1427,8 @@ void __setup_vector_irq(int cpu)
1411} 1427}
1412 1428
1413static struct irq_chip ioapic_chip; 1429static struct irq_chip ioapic_chip;
1414#ifdef CONFIG_INTR_REMAP
1415static struct irq_chip ir_ioapic_chip; 1430static struct irq_chip ir_ioapic_chip;
1416#endif 1431static struct irq_chip msi_ir_chip;
1417 1432
1418#define IOAPIC_AUTO -1 1433#define IOAPIC_AUTO -1
1419#define IOAPIC_EDGE 0 1434#define IOAPIC_EDGE 0
@@ -1452,7 +1467,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1452 else 1467 else
1453 desc->status &= ~IRQ_LEVEL; 1468 desc->status &= ~IRQ_LEVEL;
1454 1469
1455#ifdef CONFIG_INTR_REMAP
1456 if (irq_remapped(irq)) { 1470 if (irq_remapped(irq)) {
1457 desc->status |= IRQ_MOVE_PCNTXT; 1471 desc->status |= IRQ_MOVE_PCNTXT;
1458 if (trigger) 1472 if (trigger)
@@ -1464,7 +1478,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1464 handle_edge_irq, "edge"); 1478 handle_edge_irq, "edge");
1465 return; 1479 return;
1466 } 1480 }
1467#endif 1481
1468 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1482 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1469 trigger == IOAPIC_LEVEL) 1483 trigger == IOAPIC_LEVEL)
1470 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1484 set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1478,14 +1492,13 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1478int setup_ioapic_entry(int apic_id, int irq, 1492int setup_ioapic_entry(int apic_id, int irq,
1479 struct IO_APIC_route_entry *entry, 1493 struct IO_APIC_route_entry *entry,
1480 unsigned int destination, int trigger, 1494 unsigned int destination, int trigger,
1481 int polarity, int vector) 1495 int polarity, int vector, int pin)
1482{ 1496{
1483 /* 1497 /*
1484 * add it to the IO-APIC irq-routing table: 1498 * add it to the IO-APIC irq-routing table:
1485 */ 1499 */
1486 memset(entry,0,sizeof(*entry)); 1500 memset(entry,0,sizeof(*entry));
1487 1501
1488#ifdef CONFIG_INTR_REMAP
1489 if (intr_remapping_enabled) { 1502 if (intr_remapping_enabled) {
1490 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); 1503 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1491 struct irte irte; 1504 struct irte irte;
@@ -1504,7 +1517,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1504 1517
1505 irte.present = 1; 1518 irte.present = 1;
1506 irte.dst_mode = apic->irq_dest_mode; 1519 irte.dst_mode = apic->irq_dest_mode;
1507 irte.trigger_mode = trigger; 1520 /*
1521 * Trigger mode in the IRTE will always be edge, and the
1522 * actual level or edge trigger will be setup in the IO-APIC
1523 * RTE. This will help simplify level triggered irq migration.
1524 * For more details, see the comments above explainig IO-APIC
1525 * irq migration in the presence of interrupt-remapping.
1526 */
1527 irte.trigger_mode = 0;
1508 irte.dlvry_mode = apic->irq_delivery_mode; 1528 irte.dlvry_mode = apic->irq_delivery_mode;
1509 irte.vector = vector; 1529 irte.vector = vector;
1510 irte.dest_id = IRTE_DEST(destination); 1530 irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1535,21 @@ int setup_ioapic_entry(int apic_id, int irq,
1515 ir_entry->zero = 0; 1535 ir_entry->zero = 0;
1516 ir_entry->format = 1; 1536 ir_entry->format = 1;
1517 ir_entry->index = (index & 0x7fff); 1537 ir_entry->index = (index & 0x7fff);
1518 } else 1538 /*
1519#endif 1539 * IO-APIC RTE will be configured with virtual vector.
1520 { 1540 * irq handler will do the explicit EOI to the io-apic.
1541 */
1542 ir_entry->vector = pin;
1543 } else {
1521 entry->delivery_mode = apic->irq_delivery_mode; 1544 entry->delivery_mode = apic->irq_delivery_mode;
1522 entry->dest_mode = apic->irq_dest_mode; 1545 entry->dest_mode = apic->irq_dest_mode;
1523 entry->dest = destination; 1546 entry->dest = destination;
1547 entry->vector = vector;
1524 } 1548 }
1525 1549
1526 entry->mask = 0; /* enable IRQ */ 1550 entry->mask = 0; /* enable IRQ */
1527 entry->trigger = trigger; 1551 entry->trigger = trigger;
1528 entry->polarity = polarity; 1552 entry->polarity = polarity;
1529 entry->vector = vector;
1530 1553
1531 /* Mask level triggered irqs. 1554 /* Mask level triggered irqs.
1532 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1555 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1584,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1561 1584
1562 1585
1563 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1586 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
1564 dest, trigger, polarity, cfg->vector)) { 1587 dest, trigger, polarity, cfg->vector, pin)) {
1565 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1588 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1566 mp_ioapics[apic_id].apicid, pin); 1589 mp_ioapics[apic_id].apicid, pin);
1567 __clear_irq_vector(irq, cfg); 1590 __clear_irq_vector(irq, cfg);
@@ -1642,10 +1665,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1642{ 1665{
1643 struct IO_APIC_route_entry entry; 1666 struct IO_APIC_route_entry entry;
1644 1667
1645#ifdef CONFIG_INTR_REMAP
1646 if (intr_remapping_enabled) 1668 if (intr_remapping_enabled)
1647 return; 1669 return;
1648#endif
1649 1670
1650 memset(&entry, 0, sizeof(entry)); 1671 memset(&entry, 0, sizeof(entry));
1651 1672
@@ -2040,8 +2061,13 @@ void disable_IO_APIC(void)
2040 * If the i8259 is routed through an IOAPIC 2061 * If the i8259 is routed through an IOAPIC
2041 * Put that IOAPIC in virtual wire mode 2062 * Put that IOAPIC in virtual wire mode
2042 * so legacy interrupts can be delivered. 2063 * so legacy interrupts can be delivered.
2064 *
2065 * With interrupt-remapping, for now we will use virtual wire A mode,
2066 * as virtual wire B is little complex (need to configure both
2067 * IOAPIC RTE aswell as interrupt-remapping table entry).
2068 * As this gets called during crash dump, keep this simple for now.
2043 */ 2069 */
2044 if (ioapic_i8259.pin != -1) { 2070 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
2045 struct IO_APIC_route_entry entry; 2071 struct IO_APIC_route_entry entry;
2046 2072
2047 memset(&entry, 0, sizeof(entry)); 2073 memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2087,10 @@ void disable_IO_APIC(void)
2061 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); 2087 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2062 } 2088 }
2063 2089
2064 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 2090 /*
2091 * Use virtual wire A mode when interrupt remapping is enabled.
2092 */
2093 disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
2065} 2094}
2066 2095
2067#ifdef CONFIG_X86_32 2096#ifdef CONFIG_X86_32
@@ -2303,37 +2332,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
2303#ifdef CONFIG_SMP 2332#ifdef CONFIG_SMP
2304 2333
2305#ifdef CONFIG_INTR_REMAP 2334#ifdef CONFIG_INTR_REMAP
2306static void ir_irq_migration(struct work_struct *work);
2307
2308static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2309 2335
2310/* 2336/*
2311 * Migrate the IO-APIC irq in the presence of intr-remapping. 2337 * Migrate the IO-APIC irq in the presence of intr-remapping.
2312 * 2338 *
2313 * For edge triggered, irq migration is a simple atomic update(of vector 2339 * For both level and edge triggered, irq migration is a simple atomic
2314 * and cpu destination) of IRTE and flush the hardware cache. 2340 * update(of vector and cpu destination) of IRTE and flush the hardware cache.
2315 *
2316 * For level triggered, we need to modify the io-apic RTE aswell with the update
2317 * vector information, along with modifying IRTE with vector and destination.
2318 * So irq migration for level triggered is little bit more complex compared to
2319 * edge triggered migration. But the good news is, we use the same algorithm
2320 * for level triggered migration as we have today, only difference being,
2321 * we now initiate the irq migration from process context instead of the
2322 * interrupt context.
2323 * 2341 *
2324 * In future, when we do a directed EOI (combined with cpu EOI broadcast 2342 * For level triggered, we eliminate the io-apic RTE modification (with the
2325 * suppression) to the IO-APIC, level triggered irq migration will also be 2343 * updated vector information), by using a virtual vector (io-apic pin number).
2326 * as simple as edge triggered migration and we can do the irq migration 2344 * Real vector that is used for interrupting cpu will be coming from
2327 * with a simple atomic update to IO-APIC RTE. 2345 * the interrupt-remapping table entry.
2328 */ 2346 */
2329static void 2347static void
2330migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2348migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2331{ 2349{
2332 struct irq_cfg *cfg; 2350 struct irq_cfg *cfg;
2333 struct irte irte; 2351 struct irte irte;
2334 int modify_ioapic_rte;
2335 unsigned int dest; 2352 unsigned int dest;
2336 unsigned long flags;
2337 unsigned int irq; 2353 unsigned int irq;
2338 2354
2339 if (!cpumask_intersects(mask, cpu_online_mask)) 2355 if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2351,13 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2351 2367
2352 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2368 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2353 2369
2354 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2355 if (modify_ioapic_rte) {
2356 spin_lock_irqsave(&ioapic_lock, flags);
2357 __target_IO_APIC_irq(irq, dest, cfg);
2358 spin_unlock_irqrestore(&ioapic_lock, flags);
2359 }
2360
2361 irte.vector = cfg->vector; 2370 irte.vector = cfg->vector;
2362 irte.dest_id = IRTE_DEST(dest); 2371 irte.dest_id = IRTE_DEST(dest);
2363 2372
@@ -2372,73 +2381,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2372 cpumask_copy(desc->affinity, mask); 2381 cpumask_copy(desc->affinity, mask);
2373} 2382}
2374 2383
2375static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2376{
2377 int ret = -1;
2378 struct irq_cfg *cfg = desc->chip_data;
2379
2380 mask_IO_APIC_irq_desc(desc);
2381
2382 if (io_apic_level_ack_pending(cfg)) {
2383 /*
2384 * Interrupt in progress. Migrating irq now will change the
2385 * vector information in the IO-APIC RTE and that will confuse
2386 * the EOI broadcast performed by cpu.
2387 * So, delay the irq migration to the next instance.
2388 */
2389 schedule_delayed_work(&ir_migration_work, 1);
2390 goto unmask;
2391 }
2392
2393 /* everthing is clear. we have right of way */
2394 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2395
2396 ret = 0;
2397 desc->status &= ~IRQ_MOVE_PENDING;
2398 cpumask_clear(desc->pending_mask);
2399
2400unmask:
2401 unmask_IO_APIC_irq_desc(desc);
2402
2403 return ret;
2404}
2405
2406static void ir_irq_migration(struct work_struct *work)
2407{
2408 unsigned int irq;
2409 struct irq_desc *desc;
2410
2411 for_each_irq_desc(irq, desc) {
2412 if (desc->status & IRQ_MOVE_PENDING) {
2413 unsigned long flags;
2414
2415 spin_lock_irqsave(&desc->lock, flags);
2416 if (!desc->chip->set_affinity ||
2417 !(desc->status & IRQ_MOVE_PENDING)) {
2418 desc->status &= ~IRQ_MOVE_PENDING;
2419 spin_unlock_irqrestore(&desc->lock, flags);
2420 continue;
2421 }
2422
2423 desc->chip->set_affinity(irq, desc->pending_mask);
2424 spin_unlock_irqrestore(&desc->lock, flags);
2425 }
2426 }
2427}
2428
2429/* 2384/*
2430 * Migrates the IRQ destination in the process context. 2385 * Migrates the IRQ destination in the process context.
2431 */ 2386 */
2432static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2387static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2433 const struct cpumask *mask) 2388 const struct cpumask *mask)
2434{ 2389{
2435 if (desc->status & IRQ_LEVEL) {
2436 desc->status |= IRQ_MOVE_PENDING;
2437 cpumask_copy(desc->pending_mask, mask);
2438 migrate_irq_remapped_level_desc(desc);
2439 return;
2440 }
2441
2442 migrate_ioapic_irq_desc(desc, mask); 2390 migrate_ioapic_irq_desc(desc, mask);
2443} 2391}
2444static void set_ir_ioapic_affinity_irq(unsigned int irq, 2392static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2448,6 +2396,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
2448 2396
2449 set_ir_ioapic_affinity_irq_desc(desc, mask); 2397 set_ir_ioapic_affinity_irq_desc(desc, mask);
2450} 2398}
2399#else
2400static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2401 const struct cpumask *mask)
2402{
2403}
2451#endif 2404#endif
2452 2405
2453asmlinkage void smp_irq_move_cleanup_interrupt(void) 2406asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2461,6 +2414,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2461 me = smp_processor_id(); 2414 me = smp_processor_id();
2462 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2415 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
2463 unsigned int irq; 2416 unsigned int irq;
2417 unsigned int irr;
2464 struct irq_desc *desc; 2418 struct irq_desc *desc;
2465 struct irq_cfg *cfg; 2419 struct irq_cfg *cfg;
2466 irq = __get_cpu_var(vector_irq)[vector]; 2420 irq = __get_cpu_var(vector_irq)[vector];
@@ -2480,6 +2434,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2480 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2434 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2481 goto unlock; 2435 goto unlock;
2482 2436
2437 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
2438 /*
2439 * Check if the vector that needs to be cleanedup is
2440 * registered at the cpu's IRR. If so, then this is not
2441 * the best time to clean it up. Lets clean it up in the
2442 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
2443 * to myself.
2444 */
2445 if (irr & (1 << (vector % 32))) {
2446 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2447 goto unlock;
2448 }
2483 __get_cpu_var(vector_irq)[vector] = -1; 2449 __get_cpu_var(vector_irq)[vector] = -1;
2484 cfg->move_cleanup_count--; 2450 cfg->move_cleanup_count--;
2485unlock: 2451unlock:
@@ -2529,9 +2495,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
2529#endif 2495#endif
2530 2496
2531#ifdef CONFIG_INTR_REMAP 2497#ifdef CONFIG_INTR_REMAP
2498static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2499{
2500 int apic, pin;
2501 struct irq_pin_list *entry;
2502
2503 entry = cfg->irq_2_pin;
2504 for (;;) {
2505
2506 if (!entry)
2507 break;
2508
2509 apic = entry->apic;
2510 pin = entry->pin;
2511 io_apic_eoi(apic, pin);
2512 entry = entry->next;
2513 }
2514}
2515
2516static void
2517eoi_ioapic_irq(struct irq_desc *desc)
2518{
2519 struct irq_cfg *cfg;
2520 unsigned long flags;
2521 unsigned int irq;
2522
2523 irq = desc->irq;
2524 cfg = desc->chip_data;
2525
2526 spin_lock_irqsave(&ioapic_lock, flags);
2527 __eoi_ioapic_irq(irq, cfg);
2528 spin_unlock_irqrestore(&ioapic_lock, flags);
2529}
2530
2532static void ack_x2apic_level(unsigned int irq) 2531static void ack_x2apic_level(unsigned int irq)
2533{ 2532{
2533 struct irq_desc *desc = irq_to_desc(irq);
2534 ack_x2APIC_irq(); 2534 ack_x2APIC_irq();
2535 eoi_ioapic_irq(desc);
2535} 2536}
2536 2537
2537static void ack_x2apic_edge(unsigned int irq) 2538static void ack_x2apic_edge(unsigned int irq)
@@ -2901,10 +2902,8 @@ static inline void __init check_timer(void)
2901 * 8259A. 2902 * 8259A.
2902 */ 2903 */
2903 if (pin1 == -1) { 2904 if (pin1 == -1) {
2904#ifdef CONFIG_INTR_REMAP
2905 if (intr_remapping_enabled) 2905 if (intr_remapping_enabled)
2906 panic("BIOS bug: timer not connected to IO-APIC"); 2906 panic("BIOS bug: timer not connected to IO-APIC");
2907#endif
2908 pin1 = pin2; 2907 pin1 = pin2;
2909 apic1 = apic2; 2908 apic1 = apic2;
2910 no_pin1 = 1; 2909 no_pin1 = 1;
@@ -2940,10 +2939,8 @@ static inline void __init check_timer(void)
2940 clear_IO_APIC_pin(0, pin1); 2939 clear_IO_APIC_pin(0, pin1);
2941 goto out; 2940 goto out;
2942 } 2941 }
2943#ifdef CONFIG_INTR_REMAP
2944 if (intr_remapping_enabled) 2942 if (intr_remapping_enabled)
2945 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2943 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2946#endif
2947 local_irq_disable(); 2944 local_irq_disable();
2948 clear_IO_APIC_pin(apic1, pin1); 2945 clear_IO_APIC_pin(apic1, pin1);
2949 if (!no_pin1) 2946 if (!no_pin1)
@@ -3237,9 +3234,7 @@ void destroy_irq(unsigned int irq)
3237 if (desc) 3234 if (desc)
3238 desc->chip_data = cfg; 3235 desc->chip_data = cfg;
3239 3236
3240#ifdef CONFIG_INTR_REMAP
3241 free_irte(irq); 3237 free_irte(irq);
3242#endif
3243 spin_lock_irqsave(&vector_lock, flags); 3238 spin_lock_irqsave(&vector_lock, flags);
3244 __clear_irq_vector(irq, cfg); 3239 __clear_irq_vector(irq, cfg);
3245 spin_unlock_irqrestore(&vector_lock, flags); 3240 spin_unlock_irqrestore(&vector_lock, flags);
@@ -3265,7 +3260,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3265 3260
3266 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3261 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3267 3262
3268#ifdef CONFIG_INTR_REMAP
3269 if (irq_remapped(irq)) { 3263 if (irq_remapped(irq)) {
3270 struct irte irte; 3264 struct irte irte;
3271 int ir_index; 3265 int ir_index;
@@ -3291,10 +3285,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3291 MSI_ADDR_IR_SHV | 3285 MSI_ADDR_IR_SHV |
3292 MSI_ADDR_IR_INDEX1(ir_index) | 3286 MSI_ADDR_IR_INDEX1(ir_index) |
3293 MSI_ADDR_IR_INDEX2(ir_index); 3287 MSI_ADDR_IR_INDEX2(ir_index);
3294 } else 3288 } else {
3295#endif 3289 if (x2apic_enabled())
3296 { 3290 msg->address_hi = MSI_ADDR_BASE_HI |
3297 msg->address_hi = MSI_ADDR_BASE_HI; 3291 MSI_ADDR_EXT_DEST_ID(dest);
3292 else
3293 msg->address_hi = MSI_ADDR_BASE_HI;
3294
3298 msg->address_lo = 3295 msg->address_lo =
3299 MSI_ADDR_BASE_LO | 3296 MSI_ADDR_BASE_LO |
3300 ((apic->irq_dest_mode == 0) ? 3297 ((apic->irq_dest_mode == 0) ?
@@ -3405,6 +3402,7 @@ static struct irq_chip msi_ir_chip = {
3405#endif 3402#endif
3406 .retrigger = ioapic_retrigger_irq, 3403 .retrigger = ioapic_retrigger_irq,
3407}; 3404};
3405#endif
3408 3406
3409/* 3407/*
3410 * Map the PCI dev to the corresponding remapping hardware unit 3408 * Map the PCI dev to the corresponding remapping hardware unit
@@ -3432,7 +3430,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3432 } 3430 }
3433 return index; 3431 return index;
3434} 3432}
3435#endif
3436 3433
3437static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3434static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3438{ 3435{
@@ -3446,7 +3443,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3446 set_irq_msi(irq, msidesc); 3443 set_irq_msi(irq, msidesc);
3447 write_msi_msg(irq, &msg); 3444 write_msi_msg(irq, &msg);
3448 3445
3449#ifdef CONFIG_INTR_REMAP
3450 if (irq_remapped(irq)) { 3446 if (irq_remapped(irq)) {
3451 struct irq_desc *desc = irq_to_desc(irq); 3447 struct irq_desc *desc = irq_to_desc(irq);
3452 /* 3448 /*
@@ -3455,7 +3451,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3455 desc->status |= IRQ_MOVE_PCNTXT; 3451 desc->status |= IRQ_MOVE_PCNTXT;
3456 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3452 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3457 } else 3453 } else
3458#endif
3459 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3454 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3460 3455
3461 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3456 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3469,11 +3464,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3469 int ret, sub_handle; 3464 int ret, sub_handle;
3470 struct msi_desc *msidesc; 3465 struct msi_desc *msidesc;
3471 unsigned int irq_want; 3466 unsigned int irq_want;
3472
3473#ifdef CONFIG_INTR_REMAP
3474 struct intel_iommu *iommu = 0; 3467 struct intel_iommu *iommu = 0;
3475 int index = 0; 3468 int index = 0;
3476#endif
3477 3469
3478 irq_want = nr_irqs_gsi; 3470 irq_want = nr_irqs_gsi;
3479 sub_handle = 0; 3471 sub_handle = 0;
@@ -3482,7 +3474,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3482 if (irq == 0) 3474 if (irq == 0)
3483 return -1; 3475 return -1;
3484 irq_want = irq + 1; 3476 irq_want = irq + 1;
3485#ifdef CONFIG_INTR_REMAP
3486 if (!intr_remapping_enabled) 3477 if (!intr_remapping_enabled)
3487 goto no_ir; 3478 goto no_ir;
3488 3479
@@ -3510,7 +3501,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3510 set_irte_irq(irq, iommu, index, sub_handle); 3501 set_irte_irq(irq, iommu, index, sub_handle);
3511 } 3502 }
3512no_ir: 3503no_ir:
3513#endif
3514 ret = setup_msi_irq(dev, msidesc, irq); 3504 ret = setup_msi_irq(dev, msidesc, irq);
3515 if (ret < 0) 3505 if (ret < 0)
3516 goto error; 3506 goto error;
@@ -3528,7 +3518,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3528 destroy_irq(irq); 3518 destroy_irq(irq);
3529} 3519}
3530 3520
3531#ifdef CONFIG_DMAR 3521#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3532#ifdef CONFIG_SMP 3522#ifdef CONFIG_SMP
3533static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3523static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3534{ 3524{
@@ -4045,11 +4035,9 @@ void __init setup_ioapic_dest(void)
4045 else 4035 else
4046 mask = apic->target_cpus(); 4036 mask = apic->target_cpus();
4047 4037
4048#ifdef CONFIG_INTR_REMAP
4049 if (intr_remapping_enabled) 4038 if (intr_remapping_enabled)
4050 set_ir_ioapic_affinity_irq_desc(desc, mask); 4039 set_ir_ioapic_affinity_irq_desc(desc, mask);
4051 else 4040 else
4052#endif
4053 set_ioapic_affinity_irq_desc(desc, mask); 4041 set_ioapic_affinity_irq_desc(desc, mask);
4054 } 4042 }
4055 4043
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 8d7748efe6a..1783652bb0e 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -68,6 +68,13 @@ void __init default_setup_apic_routing(void)
68 apic = &apic_physflat; 68 apic = &apic_physflat;
69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
70 } 70 }
71
72 /*
73 * Now that apic routing model is selected, configure the
74 * fault handling for intr remapping.
75 */
76 if (intr_remapping_enabled)
77 enable_drhd_fault_handling();
71} 78}
72 79
73/* Same for both flat and physical. */ 80/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8fb87b6dd63..4a903e2f0d1 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
57 unsigned long query_cpu; 57 unsigned long query_cpu;
58 unsigned long flags; 58 unsigned long flags;
59 59
60 x2apic_wrmsr_fence();
61
60 local_irq_save(flags); 62 local_irq_save(flags);
61 for_each_cpu(query_cpu, mask) { 63 for_each_cpu(query_cpu, mask) {
62 __x2apic_send_IPI_dest( 64 __x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu == this_cpu) 82 if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
90 unsigned long query_cpu; 94 unsigned long query_cpu;
91 unsigned long flags; 95 unsigned long flags;
92 96
97 x2apic_wrmsr_fence();
98
93 local_irq_save(flags); 99 local_irq_save(flags);
94 for_each_online_cpu(query_cpu) { 100 for_each_online_cpu(query_cpu) {
95 if (query_cpu == this_cpu) 101 if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 23625b9f98b..a284359627e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58 unsigned long query_cpu; 58 unsigned long query_cpu;
59 unsigned long flags; 59 unsigned long flags;
60 60
61 x2apic_wrmsr_fence();
62
61 local_irq_save(flags); 63 local_irq_save(flags);
62 for_each_cpu(query_cpu, mask) { 64 for_each_cpu(query_cpu, mask) {
63 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 65 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu != this_cpu) 82 if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
89 unsigned long query_cpu; 93 unsigned long query_cpu;
90 unsigned long flags; 94 unsigned long flags;
91 95
96 x2apic_wrmsr_fence();
97
92 local_irq_save(flags); 98 local_irq_save(flags);
93 for_each_online_cpu(query_cpu) { 99 for_each_online_cpu(query_cpu) {
94 if (query_cpu == this_cpu) 100 if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412..fc999e6fc46 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
83 u64 size; 83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85 85
86 if (addr == 0) 86 if (!(addr + 1))
87 break;
88
89 if (addr >= corruption_check_size)
87 break; 90 break;
88 91
89 if ((addr + size) > corruption_check_size) 92 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr; 93 size = corruption_check_size - addr;
91 94
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr; 96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 97 scan_areas[num_scan_areas].size = size;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2d..4e242f9a06e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,11 +14,12 @@ obj-y += vmware.o hypervisor.o
14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
15obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
16 16
17obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
18
17obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 19obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
18obj-$(CONFIG_CPU_SUP_AMD) += amd.o 20obj-$(CONFIG_CPU_SUP_AMD) += amd.o
19obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 21obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
20obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 22obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
21obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 25
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 6882a735d9c..8220ae69849 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
29 u32 regs[4]; 29 u32 regs[4];
30 const struct cpuid_bit *cb; 30 const struct cpuid_bit *cb;
31 31
32 static const struct cpuid_bit cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { 0, 0, 0, 0 } 34 { 0, 0, 0, 0 }
35 }; 35 };
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80e..7e4a459daa6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
10# include <asm/numa_64.h> 11# include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
141 } 142 }
142} 143}
143 144
145static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
146{
147#ifdef CONFIG_SMP
148 /* calling is from identify_secondary_cpu() ? */
149 if (c->cpu_index == boot_cpu_id)
150 return;
151
152 /*
153 * Certain Athlons might work (for various values of 'work') in SMP
154 * but they are not certified as MP capable.
155 */
156 /* Athlon 660/661 is valid. */
157 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
158 (c->x86_mask == 1)))
159 goto valid_k7;
160
161 /* Duron 670 is valid */
162 if ((c->x86_model == 7) && (c->x86_mask == 0))
163 goto valid_k7;
164
165 /*
166 * Athlon 662, Duron 671, and Athlon >model 7 have capability
167 * bit. It's worth noting that the A5 stepping (662) of some
168 * Athlon XP's have the MP bit set.
169 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
170 * more.
171 */
172 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
173 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
174 (c->x86_model > 7))
175 if (cpu_has_mp)
176 goto valid_k7;
177
178 /* If we get here, not a certified SMP capable AMD system. */
179
180 /*
181 * Don't taint if we are running SMP kernel on a single non-MP
182 * approved Athlon
183 */
184 WARN_ONCE(1, "WARNING: This combination of AMD"
185 "processors is not suitable for SMP.\n");
186 if (!test_taint(TAINT_UNSAFE_SMP))
187 add_taint(TAINT_UNSAFE_SMP);
188
189valid_k7:
190 ;
191#endif
192}
193
144static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 194static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
145{ 195{
146 u32 l, h; 196 u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
175 } 225 }
176 226
177 set_cpu_cap(c, X86_FEATURE_K7); 227 set_cpu_cap(c, X86_FEATURE_K7);
228
229 amd_k7_smp_check(c);
178} 230}
179#endif 231#endif
180 232
@@ -450,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
450} 502}
451#endif 503#endif
452 504
453static struct cpu_dev amd_cpu_dev __cpuinitdata = { 505static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
454 .c_vendor = "AMD", 506 .c_vendor = "AMD",
455 .c_ident = { "AuthenticAMD" }, 507 .c_ident = { "AuthenticAMD" },
456#ifdef CONFIG_X86_32 508#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc..c95e831bb09 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
1#include <linux/bitops.h>
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/init.h> 3#include <linux/init.h>
3#include <linux/bitops.h>
4 4
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/msr.h>
7#include <asm/e820.h> 6#include <asm/e820.h>
8#include <asm/mtrr.h> 7#include <asm/mtrr.h>
8#include <asm/msr.h>
9 9
10#include "cpu.h" 10#include "cpu.h"
11 11
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
276 */ 276 */
277 c->x86_capability[5] = cpuid_edx(0xC0000001); 277 c->x86_capability[5] = cpuid_edx(0xC0000001);
278 } 278 }
279 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 9) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
288 /* Before Nehemiah, the C3's had 3dNOW! */ 288 /* Before Nehemiah, the C3's had 3dNOW! */
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291#endif
292 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
293 c->x86_cache_alignment = c->x86_clflush_size * 2;
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 }
291 296
292 display_cacheinfo(c); 297 display_cacheinfo(c);
293} 298}
@@ -316,16 +321,25 @@ enum {
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) 321static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{ 322{
318 switch (c->x86) { 323 switch (c->x86) {
324#ifdef CONFIG_X86_32
319 case 5: 325 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */ 326 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); 327 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break; 328 break;
329#endif
330 case 6:
331 if (c->x86_model >= 0xf)
332 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
333 break;
323 } 334 }
335#ifdef CONFIG_X86_64
336 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
337#endif
324} 338}
325 339
326static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 340static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
327{ 341{
328 342#ifdef CONFIG_X86_32
329 char *name; 343 char *name;
330 u32 fcr_set = 0; 344 u32 fcr_set = 0;
331 u32 fcr_clr = 0; 345 u32 fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
337 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 351 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
338 */ 352 */
339 clear_cpu_cap(c, 0*32+31); 353 clear_cpu_cap(c, 0*32+31);
340 354#endif
355 early_init_centaur(c);
341 switch (c->x86) { 356 switch (c->x86) {
357#ifdef CONFIG_X86_32
342 case 5: 358 case 5:
343 switch (c->x86_model) { 359 switch (c->x86_model) {
344 case 4: 360 case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
442 } 458 }
443 sprintf(c->x86_model_id, "WinChip %s", name); 459 sprintf(c->x86_model_id, "WinChip %s", name);
444 break; 460 break;
445 461#endif
446 case 6: 462 case 6:
447 init_c3(c); 463 init_c3(c);
448 break; 464 break;
449 } 465 }
466#ifdef CONFIG_X86_64
467 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
468#endif
450} 469}
451 470
452static unsigned int __cpuinit 471static unsigned int __cpuinit
453centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) 472centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
454{ 473{
474#ifdef CONFIG_X86_32
455 /* VIA C3 CPUs (670-68F) need further shifting. */ 475 /* VIA C3 CPUs (670-68F) need further shifting. */
456 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) 476 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
457 size >>= 8; 477 size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
464 if ((c->x86 == 6) && (c->x86_model == 9) && 484 if ((c->x86 == 6) && (c->x86_model == 9) &&
465 (c->x86_mask == 1) && (size == 65)) 485 (c->x86_mask == 1) && (size == 65))
466 size -= 1; 486 size -= 1;
467 487#endif
468 return size; 488 return size;
469} 489}
470 490
471static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 491static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
472 .c_vendor = "Centaur", 492 .c_vendor = "Centaur",
473 .c_ident = { "CentaurHauls" }, 493 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur, 494 .c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index a1625f5a1e7..00000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3
4#include <asm/cpufeature.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15}
16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{
19 early_init_centaur(c);
20
21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
22 c->x86_cache_alignment = c->x86_clflush_size * 2;
23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
24 }
25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
26}
27
28static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_vendor = "Centaur",
30 .c_ident = { "CentaurHauls" },
31 .c_early_init = early_init_centaur,
32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
34};
35
36cpu_dev_register(centaur_cpu_dev);
37
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a9e3791ca09..e2962cc1e27 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -70,7 +70,7 @@ cpumask_t cpu_callin_map;
70#endif /* CONFIG_X86_32 */ 70#endif /* CONFIG_X86_32 */
71 71
72 72
73static struct cpu_dev *this_cpu __cpuinitdata; 73static const struct cpu_dev *this_cpu __cpuinitdata;
74 74
75DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 75DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
76#ifdef CONFIG_X86_64 76#ifdef CONFIG_X86_64
@@ -284,9 +284,9 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
284 */ 284 */
285 285
286/* Look up CPU names by table lookup. */ 286/* Look up CPU names by table lookup. */
287static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) 287static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
288{ 288{
289 struct cpu_model_info *info; 289 const struct cpu_model_info *info;
290 290
291 if (c->x86_model >= 16) 291 if (c->x86_model >= 16)
292 return NULL; /* Range check */ 292 return NULL; /* Range check */
@@ -333,7 +333,7 @@ void switch_to_new_gdt(int cpu)
333 load_percpu_segment(cpu); 333 load_percpu_segment(cpu);
334} 334}
335 335
336static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 336static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
337 337
338static void __cpuinit default_init(struct cpuinfo_x86 *c) 338static void __cpuinit default_init(struct cpuinfo_x86 *c)
339{ 339{
@@ -352,7 +352,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
352#endif 352#endif
353} 353}
354 354
355static struct cpu_dev __cpuinitdata default_cpu = { 355static const struct cpu_dev __cpuinitconst default_cpu = {
356 .c_init = default_init, 356 .c_init = default_init,
357 .c_vendor = "Unknown", 357 .c_vendor = "Unknown",
358 .c_x86_vendor = X86_VENDOR_UNKNOWN, 358 .c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -574,13 +574,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
574 } 574 }
575 } 575 }
576 576
577#ifdef CONFIG_X86_64
578 if (c->extended_cpuid_level >= 0x80000008) { 577 if (c->extended_cpuid_level >= 0x80000008) {
579 u32 eax = cpuid_eax(0x80000008); 578 u32 eax = cpuid_eax(0x80000008);
580 579
581 c->x86_virt_bits = (eax >> 8) & 0xff; 580 c->x86_virt_bits = (eax >> 8) & 0xff;
582 c->x86_phys_bits = eax & 0xff; 581 c->x86_phys_bits = eax & 0xff;
583 } 582 }
583#ifdef CONFIG_X86_32
584 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
585 c->x86_phys_bits = 36;
584#endif 586#endif
585 587
586 if (c->extended_cpuid_level >= 0x80000007) 588 if (c->extended_cpuid_level >= 0x80000007)
@@ -627,8 +629,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
627{ 629{
628#ifdef CONFIG_X86_64 630#ifdef CONFIG_X86_64
629 c->x86_clflush_size = 64; 631 c->x86_clflush_size = 64;
632 c->x86_phys_bits = 36;
633 c->x86_virt_bits = 48;
630#else 634#else
631 c->x86_clflush_size = 32; 635 c->x86_clflush_size = 32;
636 c->x86_phys_bits = 32;
637 c->x86_virt_bits = 32;
632#endif 638#endif
633 c->x86_cache_alignment = c->x86_clflush_size; 639 c->x86_cache_alignment = c->x86_clflush_size;
634 640
@@ -659,12 +665,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
659 665
660void __init early_cpu_init(void) 666void __init early_cpu_init(void)
661{ 667{
662 struct cpu_dev **cdev; 668 const struct cpu_dev *const *cdev;
663 int count = 0; 669 int count = 0;
664 670
665 printk(KERN_INFO "KERNEL supported cpus:\n"); 671 printk(KERN_INFO "KERNEL supported cpus:\n");
666 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 672 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
667 struct cpu_dev *cpudev = *cdev; 673 const struct cpu_dev *cpudev = *cdev;
668 unsigned int j; 674 unsigned int j;
669 675
670 if (count >= X86_VENDOR_NUM) 676 if (count >= X86_VENDOR_NUM)
@@ -751,9 +757,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
751 c->x86_coreid_bits = 0; 757 c->x86_coreid_bits = 0;
752#ifdef CONFIG_X86_64 758#ifdef CONFIG_X86_64
753 c->x86_clflush_size = 64; 759 c->x86_clflush_size = 64;
760 c->x86_phys_bits = 36;
761 c->x86_virt_bits = 48;
754#else 762#else
755 c->cpuid_level = -1; /* CPUID not detected */ 763 c->cpuid_level = -1; /* CPUID not detected */
756 c->x86_clflush_size = 32; 764 c->x86_clflush_size = 32;
765 c->x86_phys_bits = 32;
766 c->x86_virt_bits = 32;
757#endif 767#endif
758 c->x86_cache_alignment = c->x86_clflush_size; 768 c->x86_cache_alignment = c->x86_clflush_size;
759 memset(&c->x86_capability, 0, sizeof c->x86_capability); 769 memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -793,7 +803,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
793 803
794 /* If the model name is still unset, do table lookup. */ 804 /* If the model name is still unset, do table lookup. */
795 if (!c->x86_model_id[0]) { 805 if (!c->x86_model_id[0]) {
796 char *p; 806 const char *p;
797 p = table_lookup_model(c); 807 p = table_lookup_model(c);
798 if (p) 808 if (p)
799 strcpy(c->x86_model_id, p); 809 strcpy(c->x86_model_id, p);
@@ -872,7 +882,7 @@ struct msr_range {
872 unsigned max; 882 unsigned max;
873}; 883};
874 884
875static struct msr_range msr_range_array[] __cpuinitdata = { 885static const struct msr_range msr_range_array[] __cpuinitconst = {
876 { 0x00000000, 0x00000418}, 886 { 0x00000000, 0x00000418},
877 { 0xc0000000, 0xc000040b}, 887 { 0xc0000000, 0xc000040b},
878 { 0xc0010000, 0xc0010142}, 888 { 0xc0010000, 0xc0010142},
@@ -921,7 +931,7 @@ __setup("noclflush", setup_noclflush);
921 931
922void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 932void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
923{ 933{
924 char *vendor = NULL; 934 const char *vendor = NULL;
925 935
926 if (c->x86_vendor < X86_VENDOR_NUM) { 936 if (c->x86_vendor < X86_VENDOR_NUM) {
927 vendor = this_cpu->c_vendor; 937 vendor = this_cpu->c_vendor;
@@ -1139,8 +1149,7 @@ void __cpuinit cpu_init(void)
1139 1149
1140 atomic_inc(&init_mm.mm_count); 1150 atomic_inc(&init_mm.mm_count);
1141 me->active_mm = &init_mm; 1151 me->active_mm = &init_mm;
1142 if (me->mm) 1152 BUG_ON(me->mm);
1143 BUG();
1144 enter_lazy_tlb(&init_mm, me); 1153 enter_lazy_tlb(&init_mm, me);
1145 1154
1146 load_sp0(t, &current->thread); 1155 load_sp0(t, &current->thread);
@@ -1197,8 +1206,7 @@ void __cpuinit cpu_init(void)
1197 */ 1206 */
1198 atomic_inc(&init_mm.mm_count); 1207 atomic_inc(&init_mm.mm_count);
1199 curr->active_mm = &init_mm; 1208 curr->active_mm = &init_mm;
1200 if (curr->mm) 1209 BUG_ON(curr->mm);
1201 BUG();
1202 enter_lazy_tlb(&init_mm, curr); 1210 enter_lazy_tlb(&init_mm, curr);
1203 1211
1204 load_sp0(t, thread); 1212 load_sp0(t, thread);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a3921..9469ecb5aeb 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -5,15 +5,15 @@
5struct cpu_model_info { 5struct cpu_model_info {
6 int vendor; 6 int vendor;
7 int family; 7 int family;
8 char *model_names[16]; 8 const char *model_names[16];
9}; 9};
10 10
11/* attempt to consolidate cpu attributes */ 11/* attempt to consolidate cpu attributes */
12struct cpu_dev { 12struct cpu_dev {
13 char * c_vendor; 13 const char * c_vendor;
14 14
15 /* some have two possibilities for cpuid string */ 15 /* some have two possibilities for cpuid string */
16 char * c_ident[2]; 16 const char * c_ident[2];
17 17
18 struct cpu_model_info c_models[4]; 18 struct cpu_model_info c_models[4];
19 19
@@ -25,11 +25,12 @@ struct cpu_dev {
25}; 25};
26 26
27#define cpu_dev_register(cpu_devX) \ 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \ 28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX; 30 &cpu_devX;
31 31
32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[];
33 34
34extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void display_cacheinfo(struct cpuinfo_x86 *c);
35 36
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 00000000000..46e29ab96c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,901 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38
39static DEFINE_MUTEX(cpu_debug_lock);
40
41static struct dentry *cpu_debugfs_dir;
42
43static struct cpu_debug_base cpu_base[] = {
44 { "mc", CPU_MC, 0 },
45 { "monitor", CPU_MONITOR, 0 },
46 { "time", CPU_TIME, 0 },
47 { "pmc", CPU_PMC, 1 },
48 { "platform", CPU_PLATFORM, 0 },
49 { "apic", CPU_APIC, 0 },
50 { "poweron", CPU_POWERON, 0 },
51 { "control", CPU_CONTROL, 0 },
52 { "features", CPU_FEATURES, 0 },
53 { "lastbranch", CPU_LBRANCH, 0 },
54 { "bios", CPU_BIOS, 0 },
55 { "freq", CPU_FREQ, 0 },
56 { "mtrr", CPU_MTRR, 0 },
57 { "perf", CPU_PERF, 0 },
58 { "cache", CPU_CACHE, 0 },
59 { "sysenter", CPU_SYSENTER, 0 },
60 { "therm", CPU_THERM, 0 },
61 { "misc", CPU_MISC, 0 },
62 { "debug", CPU_DEBUG, 0 },
63 { "pat", CPU_PAT, 0 },
64 { "vmx", CPU_VMX, 0 },
65 { "call", CPU_CALL, 0 },
66 { "base", CPU_BASE, 0 },
67 { "ver", CPU_VER, 0 },
68 { "conf", CPU_CONF, 0 },
69 { "smm", CPU_SMM, 0 },
70 { "svm", CPU_SVM, 0 },
71 { "osvm", CPU_OSVM, 0 },
72 { "tss", CPU_TSS, 0 },
73 { "cr", CPU_CR, 0 },
74 { "dt", CPU_DT, 0 },
75 { "registers", CPU_REG_ALL, 0 },
76};
77
78static struct cpu_file_base cpu_file[] = {
79 { "index", CPU_REG_ALL, 0 },
80 { "value", CPU_REG_ALL, 1 },
81};
82
83/* Intel Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
91
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
96
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
101
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
106
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
111
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
117
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
126
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
135
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
142
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
155
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178};
179
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{
355 unsigned vendor, modelflag;
356 int i, index;
357
358 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS)
360 return 1;
361
362 modelflag = per_cpu(cpu_modelflag, cpu);
363 vendor = per_cpu(cpu_model, cpu) >> 16;
364 index = get_cpu_range_count(cpu);
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 }
380
381 /* Invalid */
382 return 0;
383}
384
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag)
387{
388 unsigned modelflag;
389
390 modelflag = per_cpu(cpu_modelflag, cpu);
391 *max = 0;
392 switch (per_cpu(cpu_model, cpu) >> 16) {
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408
409 return *max;
410}
411
412/* This function can also be called with seq = NULL for printk */
413static void print_cpu_data(struct seq_file *seq, unsigned type,
414 u32 low, u32 high)
415{
416 struct cpu_private *priv;
417 u64 val = high;
418
419 if (seq) {
420 priv = seq->private;
421 if (priv->file) {
422 val = (val << 32) | low;
423 seq_printf(seq, "0x%llx\n", val);
424 } else
425 seq_printf(seq, " %08x: %08x_%08x\n",
426 type, high, low);
427 } else
428 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
429}
430
431/* This function can also be called with seq = NULL for printk */
432static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
433{
434 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv;
436 u32 low, high;
437 int i, range;
438
439 if (seq) {
440 priv = seq->private;
441 if (priv->file) {
442 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
443 &low, &high))
444 print_cpu_data(seq, priv->reg, low, high);
445 return;
446 }
447 }
448
449 range = get_cpu_range_count(cpu);
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue;
454
455 for (msr = msr_min; msr <= msr_max; msr++) {
456 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
457 continue;
458 print_cpu_data(seq, msr, low, high);
459 }
460 }
461}
462
463static void print_tss(void *arg)
464{
465 struct pt_regs *regs = task_pt_regs(current);
466 struct seq_file *seq = arg;
467 unsigned int seg;
468
469 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
470 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
471 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
472 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
473
474 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
475 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
476 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
477 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
478
479#ifdef CONFIG_X86_64
480 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
481 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
482 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
483 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
484 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
485 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
486 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
487 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
488#endif
489
490 asm("movl %%cs,%0" : "=r" (seg));
491 seq_printf(seq, " CS\t: %04x\n", seg);
492 asm("movl %%ds,%0" : "=r" (seg));
493 seq_printf(seq, " DS\t: %04x\n", seg);
494 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
495 asm("movl %%es,%0" : "=r" (seg));
496 seq_printf(seq, " ES\t: %04x\n", seg);
497 asm("movl %%fs,%0" : "=r" (seg));
498 seq_printf(seq, " FS\t: %04x\n", seg);
499 asm("movl %%gs,%0" : "=r" (seg));
500 seq_printf(seq, " GS\t: %04x\n", seg);
501
502 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
503
504 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
505}
506
507static void print_cr(void *arg)
508{
509 struct seq_file *seq = arg;
510
511 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
512 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
513 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
514 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
515#ifdef CONFIG_X86_64
516 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
517#endif
518}
519
520static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
521{
522 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
523}
524
525static void print_dt(void *seq)
526{
527 struct desc_ptr dt;
528 unsigned long ldt;
529
530 /* IDT */
531 store_idt((struct desc_ptr *)&dt);
532 print_desc_ptr("IDT", seq, dt);
533
534 /* GDT */
535 store_gdt((struct desc_ptr *)&dt);
536 print_desc_ptr("GDT", seq, dt);
537
538 /* LDT */
539 store_ldt(ldt);
540 seq_printf(seq, " LDT\t: %016lx\n", ldt);
541
542 /* TR */
543 store_tr(ldt);
544 seq_printf(seq, " TR\t: %016lx\n", ldt);
545}
546
547static void print_dr(void *arg)
548{
549 struct seq_file *seq = arg;
550 unsigned long dr;
551 int i;
552
553 for (i = 0; i < 8; i++) {
554 /* Ignore db4, db5 */
555 if ((i == 4) || (i == 5))
556 continue;
557 get_debugreg(dr, i);
558 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
559 }
560
561 seq_printf(seq, "\n MSR\t:\n");
562}
563
564static void print_apic(void *arg)
565{
566 struct seq_file *seq = arg;
567
568#ifdef CONFIG_X86_LOCAL_APIC
569 seq_printf(seq, " LAPIC\t:\n");
570 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
571 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
572 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
573 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
574 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
575 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
576 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
577 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
578 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
579 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
580 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
581 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
582 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
583 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
584 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
585 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
586 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
587 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */
592
593 seq_printf(seq, "\n MSR\t:\n");
594}
595
596static int cpu_seq_show(struct seq_file *seq, void *v)
597{
598 struct cpu_private *priv = seq->private;
599
600 if (priv == NULL)
601 return -EINVAL;
602
603 switch (cpu_base[priv->type].flag) {
604 case CPU_TSS:
605 smp_call_function_single(priv->cpu, print_tss, seq, 1);
606 break;
607 case CPU_CR:
608 smp_call_function_single(priv->cpu, print_cr, seq, 1);
609 break;
610 case CPU_DT:
611 smp_call_function_single(priv->cpu, print_dt, seq, 1);
612 break;
613 case CPU_DEBUG:
614 if (priv->file == CPU_INDEX_BIT)
615 smp_call_function_single(priv->cpu, print_dr, seq, 1);
616 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
617 break;
618 case CPU_APIC:
619 if (priv->file == CPU_INDEX_BIT)
620 smp_call_function_single(priv->cpu, print_apic, seq, 1);
621 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
622 break;
623
624 default:
625 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
626 break;
627 }
628 seq_printf(seq, "\n");
629
630 return 0;
631}
632
633static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
634{
635 if (*pos == 0) /* One time is enough ;-) */
636 return seq;
637
638 return NULL;
639}
640
641static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
642{
643 (*pos)++;
644
645 return cpu_seq_start(seq, pos);
646}
647
648static void cpu_seq_stop(struct seq_file *seq, void *v)
649{
650}
651
652static const struct seq_operations cpu_seq_ops = {
653 .start = cpu_seq_start,
654 .next = cpu_seq_next,
655 .stop = cpu_seq_stop,
656 .show = cpu_seq_show,
657};
658
659static int cpu_seq_open(struct inode *inode, struct file *file)
660{
661 struct cpu_private *priv = inode->i_private;
662 struct seq_file *seq;
663 int err;
664
665 err = seq_open(file, &cpu_seq_ops);
666 if (!err) {
667 seq = file->private_data;
668 seq->private = priv;
669 }
670
671 return err;
672}
673
674static int write_msr(struct cpu_private *priv, u64 val)
675{
676 u32 low, high;
677
678 high = (val >> 32) & 0xffffffff;
679 low = val & 0xffffffff;
680
681 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
682 return 0;
683
684 return -EPERM;
685}
686
687static int write_cpu_register(struct cpu_private *priv, const char *buf)
688{
689 int ret = -EPERM;
690 u64 val;
691
692 ret = strict_strtoull(buf, 0, &val);
693 if (ret < 0)
694 return ret;
695
696 /* Supporting only MSRs */
697 if (priv->type < CPU_TSS_BIT)
698 return write_msr(priv, val);
699
700 return ret;
701}
702
703static ssize_t cpu_write(struct file *file, const char __user *ubuf,
704 size_t count, loff_t *off)
705{
706 struct seq_file *seq = file->private_data;
707 struct cpu_private *priv = seq->private;
708 char buf[19];
709
710 if ((priv == NULL) || (count >= sizeof(buf)))
711 return -EINVAL;
712
713 if (copy_from_user(&buf, ubuf, count))
714 return -EFAULT;
715
716 buf[count] = 0;
717
718 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
719 if (!write_cpu_register(priv, buf))
720 return count;
721
722 return -EACCES;
723}
724
725static const struct file_operations cpu_fops = {
726 .owner = THIS_MODULE,
727 .open = cpu_seq_open,
728 .read = seq_read,
729 .write = cpu_write,
730 .llseek = seq_lseek,
731 .release = seq_release,
732};
733
734static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
735 unsigned file, struct dentry *dentry)
736{
737 struct cpu_private *priv = NULL;
738
739 /* Already intialized */
740 if (file == CPU_INDEX_BIT)
741 if (per_cpu(cpu_arr[type].init, cpu))
742 return 0;
743
744 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
745 if (priv == NULL)
746 return -ENOMEM;
747
748 priv->cpu = cpu;
749 priv->type = type;
750 priv->reg = reg;
751 priv->file = file;
752 mutex_lock(&cpu_debug_lock);
753 per_cpu(priv_arr[type], cpu) = priv;
754 per_cpu(cpu_priv_count, cpu)++;
755 mutex_unlock(&cpu_debug_lock);
756
757 if (file)
758 debugfs_create_file(cpu_file[file].name, S_IRUGO,
759 dentry, (void *)priv, &cpu_fops);
760 else {
761 debugfs_create_file(cpu_base[type].name, S_IRUGO,
762 per_cpu(cpu_arr[type].dentry, cpu),
763 (void *)priv, &cpu_fops);
764 mutex_lock(&cpu_debug_lock);
765 per_cpu(cpu_arr[type].init, cpu) = 1;
766 mutex_unlock(&cpu_debug_lock);
767 }
768
769 return 0;
770}
771
772static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
773 struct dentry *dentry)
774{
775 unsigned file;
776 int err = 0;
777
778 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
779 err = cpu_create_file(cpu, type, reg, file, dentry);
780 if (err)
781 return err;
782 }
783
784 return err;
785}
786
787static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{
789 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0;
792 char reg_dir[12];
793 u32 low, high;
794
795 range = get_cpu_range_count(cpu);
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag))
800 continue;
801
802 for (reg = reg_min; reg <= reg_max; reg++) {
803 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
804 continue;
805
806 sprintf(reg_dir, "0x%x", reg);
807 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
808 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
809 if (err)
810 return err;
811 }
812 }
813
814 return err;
815}
816
817static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
818{
819 struct dentry *cpu_dentry = NULL;
820 unsigned type;
821 int err = 0;
822
823 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
824 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
825 continue;
826 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
827 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
828
829 if (type < CPU_TSS_BIT)
830 err = cpu_init_msr(cpu, type, cpu_dentry);
831 else
832 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
833 cpu_dentry);
834 if (err)
835 return err;
836 }
837
838 return err;
839}
840
841static int cpu_init_cpu(void)
842{
843 struct dentry *cpu_dentry = NULL;
844 struct cpuinfo_x86 *cpui;
845 char cpu_dir[12];
846 unsigned cpu;
847 int err = 0;
848
849 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
850 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857
858 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
860 err = cpu_init_allreg(cpu, cpu_dentry);
861
862 pr_info("cpu%d(%d) debug files %d\n",
863 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
864 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
865 pr_err("Register files count %d exceeds limit %d\n",
866 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
867 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
868 err = -ENFILE;
869 }
870 if (err)
871 return err;
872 }
873
874 return err;
875}
876
877static int __init cpu_debug_init(void)
878{
879 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
880
881 return cpu_init_cpu();
882}
883
884static void __exit cpu_debug_exit(void)
885{
886 int i, cpu;
887
888 if (cpu_debugfs_dir)
889 debugfs_remove_recursive(cpu_debugfs_dir);
890
891 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
892 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
893 kfree(per_cpu(priv_arr[i], cpu));
894}
895
896module_init(cpu_debug_init);
897module_exit(cpu_debug_exit);
898
899MODULE_AUTHOR("Jaswinder Singh Rajput");
900MODULE_DESCRIPTION("CPU Debug module");
901MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c..22590cf688a 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
601 if (!data) 601 if (!data)
602 return -ENOMEM; 602 return -ENOMEM;
603 603
604 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 604 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
605 per_cpu(drv_data, cpu) = data; 605 per_cpu(drv_data, cpu) = data;
606 606
607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04cbc9..3178c3acd97 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = {
277 .name = "p4-clockmod", 277 .name = "p4-clockmod",
278 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
279 .attr = p4clockmod_attr, 279 .attr = p4clockmod_attr,
280 .hide_interface = 1,
281}; 280};
282 281
283 282
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071..593171e967e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
61 */ 61 */
62static unsigned char Cx86_dir0_msb __cpuinitdata = 0; 62static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
63 63
64static char Cx86_model[][9] __cpuinitdata = { 64static const char __cpuinitconst Cx86_model[][9] = {
65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", 65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
66 "M II ", "Unknown" 66 "M II ", "Unknown"
67}; 67};
68static char Cx486_name[][5] __cpuinitdata = { 68static const char __cpuinitconst Cx486_name[][5] = {
69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", 69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
70 "SRx2", "DRx2" 70 "SRx2", "DRx2"
71}; 71};
72static char Cx486S_name[][4] __cpuinitdata = { 72static const char __cpuinitconst Cx486S_name[][4] = {
73 "S", "S2", "Se", "S2e" 73 "S", "S2", "Se", "S2e"
74}; 74};
75static char Cx486D_name[][4] __cpuinitdata = { 75static const char __cpuinitconst Cx486D_name[][4] = {
76 "DX", "DX2", "?", "?", "?", "DX4" 76 "DX", "DX2", "?", "?", "?", "DX4"
77}; 77};
78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; 78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
79static char cyrix_model_mult1[] __cpuinitdata = "12??43"; 79static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
80static char cyrix_model_mult2[] __cpuinitdata = "12233445"; 80static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
81 81
82/* 82/*
83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old 83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
435 } 435 }
436} 436}
437 437
438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
439 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
440 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix, 441 .c_early_init = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
446 446
447cpu_dev_register(cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
448 448
449static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
450 .c_vendor = "NSC", 450 .c_vendor = "NSC",
451 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
452 .c_init = init_nsc, 452 .c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559ba8d5..b09d4eb52bb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -13,6 +13,7 @@
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ds.h> 14#include <asm/ds.h>
15#include <asm/bugs.h> 15#include <asm/bugs.h>
16#include <asm/cpu.h>
16 17
17#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
18#include <asm/topology.h> 19#include <asm/topology.h>
@@ -53,6 +54,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
53 c->x86_cache_alignment = 128; 54 c->x86_cache_alignment = 128;
54#endif 55#endif
55 56
57 /* CPUID workaround for 0F33/0F34 CPU */
58 if (c->x86 == 0xF && c->x86_model == 0x3
59 && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
60 c->x86_phys_bits = 36;
61
56 /* 62 /*
57 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate 63 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
58 * with P/T states and does not stop in deep C-states 64 * with P/T states and does not stop in deep C-states
@@ -110,6 +116,28 @@ static void __cpuinit trap_init_f00f_bug(void)
110} 116}
111#endif 117#endif
112 118
119static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
120{
121#ifdef CONFIG_SMP
122 /* calling is from identify_secondary_cpu() ? */
123 if (c->cpu_index == boot_cpu_id)
124 return;
125
126 /*
127 * Mask B, Pentium, but not Pentium MMX
128 */
129 if (c->x86 == 5 &&
130 c->x86_mask >= 1 && c->x86_mask <= 4 &&
131 c->x86_model <= 3) {
132 /*
133 * Remember we have B step Pentia with bugs
134 */
135 WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
136 "with B stepping processors.\n");
137 }
138#endif
139}
140
113static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 141static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
114{ 142{
115 unsigned long lo, hi; 143 unsigned long lo, hi;
@@ -186,6 +214,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
186#ifdef CONFIG_X86_NUMAQ 214#ifdef CONFIG_X86_NUMAQ
187 numaq_tsc_disable(); 215 numaq_tsc_disable();
188#endif 216#endif
217
218 intel_smp_check(c);
189} 219}
190#else 220#else
191static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 221static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -385,7 +415,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
385} 415}
386#endif 416#endif
387 417
388static struct cpu_dev intel_cpu_dev __cpuinitdata = { 418static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
389 .c_vendor = "Intel", 419 .c_vendor = "Intel",
390 .c_ident = { "GenuineIntel" }, 420 .c_ident = { "GenuineIntel" },
391#ifdef CONFIG_X86_32 421#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7293508d8f5..c471eb1a389 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
32}; 32};
33 33
34/* all the cache descriptor types we care about (no TLB or trace cache entries) */ 34/* all the cache descriptor types we care about (no TLB or trace cache entries) */
35static struct _cache_table cache_table[] __cpuinitdata = 35static const struct _cache_table __cpuinitconst cache_table[] =
36{ 36{
37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ 37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ 38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -206,15 +206,15 @@ union l3_cache {
206 unsigned val; 206 unsigned val;
207}; 207};
208 208
209static unsigned short assocs[] __cpuinitdata = { 209static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 210 [1] = 1, [2] = 2, [4] = 4, [6] = 8,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 211 [8] = 16, [0xa] = 32, [0xb] = 48,
212 [0xc] = 64, 212 [0xc] = 64,
213 [0xf] = 0xffff // ?? 213 [0xf] = 0xffff // ??
214}; 214};
215 215
216static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
217static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 217static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
218 218
219static void __cpuinit 219static void __cpuinit
220amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 220amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
60 } 60 }
61} 61}
62 62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str) 63static int __init mcheck_disable(char *str)
78{ 64{
79 mce_disabled = 1; 65 mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f..ca14604611e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -24,6 +26,9 @@
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/kmod.h> 27#include <linux/kmod.h>
26#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
27#include <asm/processor.h> 32#include <asm/processor.h>
28#include <asm/msr.h> 33#include <asm/msr.h>
29#include <asm/mce.h> 34#include <asm/mce.h>
@@ -32,7 +37,6 @@
32#include <asm/idle.h> 37#include <asm/idle.h>
33 38
34#define MISC_MCELOG_MINOR 227 39#define MISC_MCELOG_MINOR 227
35#define NR_SYSFS_BANKS 6
36 40
37atomic_t mce_entry; 41atomic_t mce_entry;
38 42
@@ -47,7 +51,7 @@ static int mce_dont_init;
47 */ 51 */
48static int tolerant = 1; 52static int tolerant = 1;
49static int banks; 53static int banks;
50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; 54static u64 *bank;
51static unsigned long notify_user; 55static unsigned long notify_user;
52static int rip_msr; 56static int rip_msr;
53static int mce_bootlog = -1; 57static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
58 62
59static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
60 64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
61/* 78/*
62 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
63 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
119 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
120 printk("\n"); 137 printk("\n");
121 } 138 }
122 printk(KERN_EMERG "TSC %Lx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
123 if (m->addr) 140 if (m->addr)
124 printk("ADDR %Lx ", m->addr); 141 printk("ADDR %llx ", m->addr);
125 if (m->misc) 142 if (m->misc)
126 printk("MISC %Lx ", m->misc); 143 printk("MISC %llx ", m->misc);
127 printk("\n"); 144 printk("\n");
128 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
129 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
149 panic(msg); 166 panic(msg);
150} 167}
151 168
152static int mce_available(struct cpuinfo_x86 *c) 169int mce_available(struct cpuinfo_x86 *c)
153{ 170{
171 if (mce_dont_init)
172 return 0;
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
155} 174}
156 175
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
172} 191}
173 192
174/* 193/*
175 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank.
248 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
176 */ 265 */
177void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
178{ 267{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
190 * error. 279 * error.
191 */ 280 */
192 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
193 283
194 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
195 285
196 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
197 && notify_die(DIE_NMI, "machine check", regs, error_code,
198 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
199 || !banks) 288 goto out2;
289 if (!banks)
200 goto out2; 290 goto out2;
201 291
202 memset(&m, 0, sizeof(struct mce)); 292 mce_setup(&m);
203 m.cpu = smp_processor_id(); 293
204 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 294 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
205 /* if the restart IP is not valid, we're done for */ 295 /* if the restart IP is not valid, we're done for */
206 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 296 if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
210 barrier(); 300 barrier();
211 301
212 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
213 if (i < NR_SYSFS_BANKS && !bank[i]) 303 __clear_bit(i, toclear);
304 if (!bank[i])
214 continue; 305 continue;
215 306
216 m.misc = 0; 307 m.misc = 0;
217 m.addr = 0; 308 m.addr = 0;
218 m.bank = i; 309 m.bank = i;
219 m.tsc = 0;
220 310
221 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
222 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
223 continue; 313 continue;
224 314
315 /*
316 * Non uncorrected errors are handled by machine_check_poll
317 * Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
225 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
226 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
227 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
235 no_way_out = 1; 339 no_way_out = 1;
236 kill_it = 1; 340 kill_it = 1;
237 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
238 } 348 }
239 349
240 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
243 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244 354
245 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
246 if (error_code >= 0) 356 mce_log(&m);
247 rdtscll(m.tsc);
248 if (error_code != -2)
249 mce_log(&m);
250 357
251 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
252 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
255 panicm = m; 362 panicm = m;
256 panicm_found = 1; 363 panicm_found = 1;
257 } 364 }
258
259 add_taint(TAINT_MACHINE_CHECK);
260 } 365 }
261 366
262 /* Never do anything final in the polling timer */
263 if (!regs)
264 goto out;
265
266 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
267 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
268 if (!panicm_found) 369 if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
309 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
310 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
311 412
312 out:
313 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
314 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
315 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
316 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
317 out2: 419 out2:
318 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
332 * and historically has been the register value of the 434 * and historically has been the register value of the
333 * MSR_IA32_THERMAL_STATUS (Intel) msr. 435 * MSR_IA32_THERMAL_STATUS (Intel) msr.
334 */ 436 */
335void mce_log_therm_throt_event(unsigned int cpu, __u64 status) 437void mce_log_therm_throt_event(__u64 status)
336{ 438{
337 struct mce m; 439 struct mce m;
338 440
339 memset(&m, 0, sizeof(m)); 441 mce_setup(&m);
340 m.cpu = cpu;
341 m.bank = MCE_THERMAL_BANK; 442 m.bank = MCE_THERMAL_BANK;
342 m.status = status; 443 m.status = status;
343 rdtscll(m.tsc);
344 mce_log(&m); 444 mce_log(&m);
345} 445}
346#endif /* CONFIG_X86_MCE_INTEL */ 446#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
353 453
354static int check_interval = 5 * 60; /* 5 minutes */ 454static int check_interval = 5 * 60; /* 5 minutes */
355static int next_interval; /* in jiffies */ 455static int next_interval; /* in jiffies */
356static void mcheck_timer(struct work_struct *work); 456static void mcheck_timer(unsigned long);
357static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 457static DEFINE_PER_CPU(struct timer_list, mce_timer);
358 458
359static void mcheck_check_cpu(void *info) 459static void mcheck_timer(unsigned long data)
360{ 460{
361 if (mce_available(&current_cpu_data)) 461 struct timer_list *t = &per_cpu(mce_timer, data);
362 do_machine_check(NULL, 0);
363}
364 462
365static void mcheck_timer(struct work_struct *work) 463 WARN_ON(smp_processor_id() != data);
366{ 464
367 on_each_cpu(mcheck_check_cpu, NULL, 1); 465 if (mce_available(&current_cpu_data))
466 machine_check_poll(MCP_TIMESTAMP,
467 &__get_cpu_var(mce_poll_banks));
368 468
369 /* 469 /*
370 * Alert userspace if needed. If we logged an MCE, reduce the 470 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
377 (int)round_jiffies_relative(check_interval*HZ)); 477 (int)round_jiffies_relative(check_interval*HZ));
378 } 478 }
379 479
380 schedule_delayed_work(&mcheck_work, next_interval); 480 t->expires = jiffies + next_interval;
481 add_timer(t);
482}
483
484static void mce_do_trigger(struct work_struct *work)
485{
486 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
381} 487}
382 488
489static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
490
383/* 491/*
384 * This is only called from process context. This is where we do 492 * Notify the user(s) about new machine check events.
385 * anything we need to alert userspace about new MCEs. This is called 493 * Can be called from interrupt context, but not from machine check/NMI
386 * directly from the poller and also from entry.S and idle, thanks to 494 * context.
387 * TIF_MCE_NOTIFY.
388 */ 495 */
389int mce_notify_user(void) 496int mce_notify_user(void)
390{ 497{
498 /* Not more than two messages every minute */
499 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
500
391 clear_thread_flag(TIF_MCE_NOTIFY); 501 clear_thread_flag(TIF_MCE_NOTIFY);
392 if (test_and_clear_bit(0, &notify_user)) { 502 if (test_and_clear_bit(0, &notify_user)) {
393 static unsigned long last_print;
394 unsigned long now = jiffies;
395
396 wake_up_interruptible(&mce_wait); 503 wake_up_interruptible(&mce_wait);
397 if (trigger[0])
398 call_usermodehelper(trigger, trigger_argv, NULL,
399 UMH_NO_WAIT);
400 504
401 if (time_after_eq(now, last_print + (check_interval*HZ))) { 505 /*
402 last_print = now; 506 * There is no risk of missing notifications because
507 * work_pending is always cleared before the function is
508 * executed.
509 */
510 if (trigger[0] && !work_pending(&mce_trigger_work))
511 schedule_work(&mce_trigger_work);
512
513 if (__ratelimit(&ratelimit))
403 printk(KERN_INFO "Machine check events logged\n"); 514 printk(KERN_INFO "Machine check events logged\n");
404 }
405 515
406 return 1; 516 return 1;
407 } 517 }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
425 535
426static __init int periodic_mcheck_init(void) 536static __init int periodic_mcheck_init(void)
427{ 537{
428 next_interval = check_interval * HZ; 538 idle_notifier_register(&mce_idle_notifier);
429 if (next_interval) 539 return 0;
430 schedule_delayed_work(&mcheck_work,
431 round_jiffies_relative(next_interval));
432 idle_notifier_register(&mce_idle_notifier);
433 return 0;
434} 540}
435__initcall(periodic_mcheck_init); 541__initcall(periodic_mcheck_init);
436 542
437
438/* 543/*
439 * Initialize Machine Checks for a CPU. 544 * Initialize Machine Checks for a CPU.
440 */ 545 */
441static void mce_init(void *dummy) 546static int mce_cap_init(void)
442{ 547{
443 u64 cap; 548 u64 cap;
444 int i; 549 unsigned b;
445 550
446 rdmsrl(MSR_IA32_MCG_CAP, cap); 551 rdmsrl(MSR_IA32_MCG_CAP, cap);
447 banks = cap & 0xff; 552 b = cap & 0xff;
448 if (banks > MCE_EXTENDED_BANK) { 553 if (b > MAX_NR_BANKS) {
449 banks = MCE_EXTENDED_BANK; 554 printk(KERN_WARNING
450 printk(KERN_INFO "MCE: warning: using only %d banks\n", 555 "MCE: Using only %u machine check banks out of %u\n",
451 MCE_EXTENDED_BANK); 556 MAX_NR_BANKS, b);
557 b = MAX_NR_BANKS;
452 } 558 }
559
560 /* Don't support asymmetric configurations today */
561 WARN_ON(banks != 0 && b != banks);
562 banks = b;
563 if (!bank) {
564 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
565 if (!bank)
566 return -ENOMEM;
567 memset(bank, 0xff, banks * sizeof(u64));
568 }
569
453 /* Use accurate RIP reporting if available. */ 570 /* Use accurate RIP reporting if available. */
454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 571 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
455 rip_msr = MSR_IA32_MCG_EIP; 572 rip_msr = MSR_IA32_MCG_EIP;
456 573
457 /* Log the machine checks left over from the previous reset. 574 return 0;
458 This also clears all registers */ 575}
459 do_machine_check(NULL, mce_bootlog ? -1 : -2); 576
577static void mce_init(void *dummy)
578{
579 u64 cap;
580 int i;
581 mce_banks_t all_banks;
582
583 /*
584 * Log the machine checks left over from the previous reset.
585 */
586 bitmap_fill(all_banks, MAX_NR_BANKS);
587 machine_check_poll(MCP_UC, &all_banks);
460 588
461 set_in_cr4(X86_CR4_MCE); 589 set_in_cr4(X86_CR4_MCE);
462 590
591 rdmsrl(MSR_IA32_MCG_CAP, cap);
463 if (cap & MCG_CTL_P) 592 if (cap & MCG_CTL_P)
464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 593 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
465 594
466 for (i = 0; i < banks; i++) { 595 for (i = 0; i < banks; i++) {
467 if (i < NR_SYSFS_BANKS) 596 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 597 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
473 } 598 }
474} 599}
475 600
476/* Add per CPU specific workarounds here */ 601/* Add per CPU specific workarounds here */
477static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 602static void mce_cpu_quirks(struct cpuinfo_x86 *c)
478{ 603{
479 /* This should be disabled by the BIOS, but isn't always */ 604 /* This should be disabled by the BIOS, but isn't always */
480 if (c->x86_vendor == X86_VENDOR_AMD) { 605 if (c->x86_vendor == X86_VENDOR_AMD) {
481 if(c->x86 == 15) 606 if (c->x86 == 15 && banks > 4)
482 /* disable GART TBL walk error reporting, which trips off 607 /* disable GART TBL walk error reporting, which trips off
483 incorrectly with the IOMMU & 3ware & Cerberus. */ 608 incorrectly with the IOMMU & 3ware & Cerberus. */
484 clear_bit(10, &bank[4]); 609 clear_bit(10, (unsigned long *)&bank[4]);
485 if(c->x86 <= 17 && mce_bootlog < 0) 610 if(c->x86 <= 17 && mce_bootlog < 0)
486 /* Lots of broken BIOS around that don't clear them 611 /* Lots of broken BIOS around that don't clear them
487 by default and leave crap in there. Don't log. */ 612 by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
504 } 629 }
505} 630}
506 631
632static void mce_init_timer(void)
633{
634 struct timer_list *t = &__get_cpu_var(mce_timer);
635
636 /* data race harmless because everyone sets to the same value */
637 if (!next_interval)
638 next_interval = check_interval * HZ;
639 if (!next_interval)
640 return;
641 setup_timer(t, mcheck_timer, smp_processor_id());
642 t->expires = round_jiffies(jiffies + next_interval);
643 add_timer(t);
644}
645
507/* 646/*
508 * Called for each booted CPU to set up machine checks. 647 * Called for each booted CPU to set up machine checks.
509 * Must be called with preempt off. 648 * Must be called with preempt off.
510 */ 649 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 650void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 651{
513 mce_cpu_quirks(c); 652 if (!mce_available(c))
653 return;
514 654
515 if (mce_dont_init || 655 if (mce_cap_init() < 0) {
516 !mce_available(c)) 656 mce_dont_init = 1;
517 return; 657 return;
658 }
659 mce_cpu_quirks(c);
518 660
519 mce_init(NULL); 661 mce_init(NULL);
520 mce_cpu_features(c); 662 mce_cpu_features(c);
663 mce_init_timer();
521} 664}
522 665
523/* 666/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573{ 716{
574 unsigned long *cpu_tsc; 717 unsigned long *cpu_tsc;
575 static DEFINE_MUTEX(mce_read_mutex); 718 static DEFINE_MUTEX(mce_read_mutex);
576 unsigned next; 719 unsigned prev, next;
577 char __user *buf = ubuf; 720 char __user *buf = ubuf;
578 int i, err; 721 int i, err;
579 722
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
592 } 735 }
593 736
594 err = 0; 737 err = 0;
595 for (i = 0; i < next; i++) { 738 prev = 0;
596 unsigned long start = jiffies; 739 do {
597 740 for (i = prev; i < next; i++) {
598 while (!mcelog.entry[i].finished) { 741 unsigned long start = jiffies;
599 if (time_after_eq(jiffies, start + 2)) { 742
600 memset(mcelog.entry + i,0, sizeof(struct mce)); 743 while (!mcelog.entry[i].finished) {
601 goto timeout; 744 if (time_after_eq(jiffies, start + 2)) {
745 memset(mcelog.entry + i, 0,
746 sizeof(struct mce));
747 goto timeout;
748 }
749 cpu_relax();
602 } 750 }
603 cpu_relax(); 751 smp_rmb();
752 err |= copy_to_user(buf, mcelog.entry + i,
753 sizeof(struct mce));
754 buf += sizeof(struct mce);
755timeout:
756 ;
604 } 757 }
605 smp_rmb();
606 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
607 buf += sizeof(struct mce);
608 timeout:
609 ;
610 }
611 758
612 memset(mcelog.entry, 0, next * sizeof(struct mce)); 759 memset(mcelog.entry + prev, 0,
613 mcelog.next = 0; 760 (next - prev) * sizeof(struct mce));
761 prev = next;
762 next = cmpxchg(&mcelog.next, prev, 0);
763 } while (next != prev);
614 764
615 synchronize_sched(); 765 synchronize_sched();
616 766
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
680 &mce_chrdev_ops, 830 &mce_chrdev_ops,
681}; 831};
682 832
683static unsigned long old_cr4 __initdata;
684
685void __init stop_mce(void)
686{
687 old_cr4 = read_cr4();
688 clear_in_cr4(X86_CR4_MCE);
689}
690
691void __init restart_mce(void)
692{
693 if (old_cr4 & X86_CR4_MCE)
694 set_in_cr4(X86_CR4_MCE);
695}
696
697/* 833/*
698 * Old style boot options parsing. Only for compatibility. 834 * Old style boot options parsing. Only for compatibility.
699 */ 835 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
703 return 1; 839 return 1;
704} 840}
705 841
706/* mce=off disables machine check. Note you can re-enable it later 842/* mce=off disables machine check.
707 using sysfs.
708 mce=TOLERANCELEVEL (number, see above) 843 mce=TOLERANCELEVEL (number, see above)
709 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 844 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
710 mce=nobootlog Don't log MCEs from before booting. */ 845 mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
728 * Sysfs support 863 * Sysfs support
729 */ 864 */
730 865
866/*
867 * Disable machine checks on suspend and shutdown. We can't really handle
868 * them later.
869 */
870static int mce_disable(void)
871{
872 int i;
873
874 for (i = 0; i < banks; i++)
875 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
876 return 0;
877}
878
879static int mce_suspend(struct sys_device *dev, pm_message_t state)
880{
881 return mce_disable();
882}
883
884static int mce_shutdown(struct sys_device *dev)
885{
886 return mce_disable();
887}
888
731/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 889/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
732 Only one CPU is active at this time, the others get readded later using 890 Only one CPU is active at this time, the others get readded later using
733 CPU hotplug. */ 891 CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
738 return 0; 896 return 0;
739} 897}
740 898
899static void mce_cpu_restart(void *data)
900{
901 del_timer_sync(&__get_cpu_var(mce_timer));
902 if (mce_available(&current_cpu_data))
903 mce_init(NULL);
904 mce_init_timer();
905}
906
741/* Reinit MCEs after user configuration changes */ 907/* Reinit MCEs after user configuration changes */
742static void mce_restart(void) 908static void mce_restart(void)
743{ 909{
744 if (next_interval)
745 cancel_delayed_work(&mcheck_work);
746 /* Timer race is harmless here */
747 on_each_cpu(mce_init, NULL, 1);
748 next_interval = check_interval * HZ; 910 next_interval = check_interval * HZ;
749 if (next_interval) 911 on_each_cpu(mce_cpu_restart, NULL, 1);
750 schedule_delayed_work(&mcheck_work,
751 round_jiffies_relative(next_interval));
752} 912}
753 913
754static struct sysdev_class mce_sysclass = { 914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
755 .resume = mce_resume, 917 .resume = mce_resume,
756 .name = "machinecheck", 918 .name = "machinecheck",
757}; 919};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
778 } \ 940 } \
779 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
780 942
781/* 943static struct sysdev_attribute *bank_attrs;
782 * TBD should generate these dynamically based on number of available banks. 944
783 * Have only 6 contol banks in /sysfs until then. 945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
784 */ 946 char *buf)
785ACCESSOR(bank0ctl,bank[0],mce_restart()) 947{
786ACCESSOR(bank1ctl,bank[1],mce_restart()) 948 u64 b = bank[attr - bank_attrs];
787ACCESSOR(bank2ctl,bank[2],mce_restart()) 949 return sprintf(buf, "%llx\n", b);
788ACCESSOR(bank3ctl,bank[3],mce_restart()) 950}
789ACCESSOR(bank4ctl,bank[4],mce_restart()) 951
790ACCESSOR(bank5ctl,bank[5],mce_restart()) 952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
791 963
792static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
793 char *buf) 965 char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
814static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
815ACCESSOR(check_interval,check_interval,mce_restart()) 987ACCESSOR(check_interval,check_interval,mce_restart())
816static struct sysdev_attribute *mce_attributes[] = { 988static struct sysdev_attribute *mce_attributes[] = {
817 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
818 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
819 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
820 NULL 990 NULL
821}; 991};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
845 if (err) 1015 if (err)
846 goto error; 1016 goto error;
847 } 1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
848 cpu_set(cpu, mce_device_initialized); 1024 cpu_set(cpu, mce_device_initialized);
849 1025
850 return 0; 1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
851error: 1032error:
852 while (i--) { 1033 while (--i >= 0) {
853 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
854 mce_attributes[i]); 1035 mce_attributes[i]);
855 } 1036 }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
868 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
869 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
870 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
871 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
872 cpu_clear(cpu, mce_device_initialized); 1056 cpu_clear(cpu, mce_device_initialized);
873} 1057}
874 1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
875/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
876static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, 1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
878{ 1089{
879 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
880 1092
881 switch (action) { 1093 switch (action) {
882 case CPU_ONLINE: 1094 case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
891 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
892 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
893 break; 1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies + next_interval);
1114 add_timer_on(t, cpu);
1115 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116 break;
1117 case CPU_POST_DEAD:
1118 /* intentionally ignoring frozen here */
1119 cmci_rediscover(cpu);
1120 break;
894 } 1121 }
895 return NOTIFY_OK; 1122 return NOTIFY_OK;
896} 1123}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
899 .notifier_call = mce_cpu_callback, 1126 .notifier_call = mce_cpu_callback,
900}; 1127};
901 1128
1129static __init int mce_init_banks(void)
1130{
1131 int i;
1132
1133 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134 GFP_KERNEL);
1135 if (!bank_attrs)
1136 return -ENOMEM;
1137
1138 for (i = 0; i < banks; i++) {
1139 struct sysdev_attribute *a = &bank_attrs[i];
1140 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141 if (!a->attr.name)
1142 goto nomem;
1143 a->attr.mode = 0644;
1144 a->show = show_bank;
1145 a->store = set_bank;
1146 }
1147 return 0;
1148
1149nomem:
1150 while (--i >= 0)
1151 kfree(bank_attrs[i].attr.name);
1152 kfree(bank_attrs);
1153 bank_attrs = NULL;
1154 return -ENOMEM;
1155}
1156
902static __init int mce_init_device(void) 1157static __init int mce_init_device(void)
903{ 1158{
904 int err; 1159 int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
906 1161
907 if (!mce_available(&boot_cpu_data)) 1162 if (!mce_available(&boot_cpu_data))
908 return -EIO; 1163 return -EIO;
1164
1165 err = mce_init_banks();
1166 if (err)
1167 return err;
1168
909 err = sysdev_class_register(&mce_sysclass); 1169 err = sysdev_class_register(&mce_sysclass);
910 if (err) 1170 if (err)
911 return err; 1171 return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd46..7d01be86887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
79 79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81 81
82static void amd_threshold_interrupt(void);
83
82/* 84/*
83 * CPU Initialization 85 * CPU Initialization
84 */ 86 */
@@ -90,7 +92,8 @@ struct thresh_restart {
90}; 92};
91 93
92/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
93static long threshold_restart_bank(void *_tr) 95/* Called via smp_call_function_single() */
96static void threshold_restart_bank(void *_tr)
94{ 97{
95 struct thresh_restart *tr = _tr; 98 struct thresh_restart *tr = _tr;
96 u32 mci_misc_hi, mci_misc_lo; 99 u32 mci_misc_hi, mci_misc_lo;
@@ -117,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
117 120
118 mci_misc_hi |= MASK_COUNT_EN_HI; 121 mci_misc_hi |= MASK_COUNT_EN_HI;
119 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 122 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
120 return 0;
121} 123}
122 124
123/* cpu init entry point, called from mce.c with preempt off */ 125/* cpu init entry point, called from mce.c with preempt off */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
174 tr.reset = 0; 176 tr.reset = 0;
175 tr.old_limit = 0; 177 tr.old_limit = 0;
176 threshold_restart_bank(&tr); 178 threshold_restart_bank(&tr);
179
180 mce_threshold_vector = amd_threshold_interrupt;
177 } 181 }
178 } 182 }
179} 183}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
187 * the interrupt goes off when error_count reaches threshold_limit. 191 * the interrupt goes off when error_count reaches threshold_limit.
188 * the handler will simply log mcelog w/ software defined bank number. 192 * the handler will simply log mcelog w/ software defined bank number.
189 */ 193 */
190asmlinkage void mce_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
191{ 195{
192 unsigned int bank, block; 196 unsigned int bank, block;
193 struct mce m; 197 struct mce m;
194 u32 low = 0, high = 0, address = 0; 198 u32 low = 0, high = 0, address = 0;
195 199
196 ack_APIC_irq(); 200 mce_setup(&m);
197 exit_idle();
198 irq_enter();
199
200 memset(&m, 0, sizeof(m));
201 rdtscll(m.tsc);
202 m.cpu = smp_processor_id();
203 201
204 /* assume first bank caused it */ 202 /* assume first bank caused it */
205 for (bank = 0; bank < NR_BANKS; ++bank) { 203 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
233 231
234 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
235 event. */ 233 event. */
236 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks));
237 236
238 if (high & MASK_OVERFLOW_HI) { 237 if (high & MASK_OVERFLOW_HI) {
239 rdmsrl(address, m.misc); 238 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
243 + bank * NR_BLOCKS 242 + bank * NR_BLOCKS
244 + block; 243 + block;
245 mce_log(&m); 244 mce_log(&m);
246 goto out; 245 return;
247 } 246 }
248 } 247 }
249 } 248 }
250out:
251 inc_irq_stat(irq_threshold_count);
252 irq_exit();
253} 249}
254 250
255/* 251/*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
283 tr.b = b; 279 tr.b = b;
284 tr.reset = 0; 280 tr.reset = 0;
285 tr.old_limit = 0; 281 tr.old_limit = 0;
286 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
287 283
288 return end - buf; 284 return end - buf;
289} 285}
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
305 tr.b = b; 301 tr.b = b;
306 tr.reset = 0; 302 tr.reset = 0;
307 303
308 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
309 305
310 return end - buf; 306 return end - buf;
311} 307}
312 308
313static long local_error_count(void *_b) 309struct threshold_block_cross_cpu {
310 struct threshold_block *tb;
311 long retval;
312};
313
314static void local_error_count_handler(void *_tbcc)
314{ 315{
315 struct threshold_block *b = _b; 316 struct threshold_block_cross_cpu *tbcc = _tbcc;
317 struct threshold_block *b = tbcc->tb;
316 u32 low, high; 318 u32 low, high;
317 319
318 rdmsr(b->address, low, high); 320 rdmsr(b->address, low, high);
319 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); 321 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
320} 322}
321 323
322static ssize_t show_error_count(struct threshold_block *b, char *buf) 324static ssize_t show_error_count(struct threshold_block *b, char *buf)
323{ 325{
324 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); 326 struct threshold_block_cross_cpu tbcc = { .tb = b, };
327
328 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
329 return sprintf(buf, "%lx\n", tbcc.retval);
325} 330}
326 331
327static ssize_t store_error_count(struct threshold_block *b, 332static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
329{ 334{
330 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; 335 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
331 336
332 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 337 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
333 return 1; 338 return 1;
334} 339}
335 340
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
398 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
399 return 0; 404 return 0;
400 405
401 if (rdmsr_safe(address, &low, &high)) 406 if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
402 return 0; 407 return 0;
403 408
404 if (!(high & MASK_VALID_HI)) { 409 if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
462 return err; 467 return err;
463} 468}
464 469
465static __cpuinit long local_allocate_threshold_blocks(void *_bank) 470static __cpuinit long
471local_allocate_threshold_blocks(int cpu, unsigned int bank)
466{ 472{
467 unsigned int *bank = _bank; 473 return allocate_threshold_blocks(cpu, bank, 0,
468 474 MSR_IA32_MC0_MISC + bank * 4);
469 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
470 MSR_IA32_MC0_MISC + *bank * 4);
471} 475}
472 476
473/* symlinks sibling shared banks to first core. first core owns dir/files. */ 477/* symlinks sibling shared banks to first core. first core owns dir/files. */
@@ -530,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
530 534
531 per_cpu(threshold_banks, cpu)[bank] = b; 535 per_cpu(threshold_banks, cpu)[bank] = b;
532 536
533 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); 537 err = local_allocate_threshold_blocks(cpu, bank);
534 if (err) 538 if (err)
535 goto out_free; 539 goto out_free;
536 540
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e..57df3d38347 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
1/* 1/*
2 * Intel specific MCE features. 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> 3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
4 */ 6 */
5 7
6#include <linux/init.h> 8#include <linux/init.h>
@@ -13,6 +15,7 @@
13#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
16 19
17asmlinkage void smp_thermal_interrupt(void) 20asmlinkage void smp_thermal_interrupt(void)
18{ 21{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
25 28
26 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
27 if (therm_throt_process(msr_val & 1)) 30 if (therm_throt_process(msr_val & 1))
28 mce_log_therm_throt_event(smp_processor_id(), msr_val); 31 mce_log_therm_throt_event(msr_val);
29 32
30 inc_irq_stat(irq_thermal_count); 33 inc_irq_stat(irq_thermal_count);
31 irq_exit(); 34 irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
85 return; 88 return;
86} 89}
87 90
91/*
92 * Support for Intel Correct Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened.
94 * Normally we pick those up using a regular polling timer.
95 * Also supports reliable discovery of shared banks.
96 */
97
98static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
99
100/*
101 * cmci_discover_lock protects against parallel discovery attempts
102 * which could race against each other.
103 */
104static DEFINE_SPINLOCK(cmci_discover_lock);
105
106#define CMCI_THRESHOLD 1
107
108static int cmci_supported(int *banks)
109{
110 u64 cap;
111
112 /*
113 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this
115 * makes sure none of the backdoors are entered otherwise.
116 */
117 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
118 return 0;
119 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
120 return 0;
121 rdmsrl(MSR_IA32_MCG_CAP, cap);
122 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
123 return !!(cap & MCG_CMCI_P);
124}
125
126/*
127 * The interrupt handler. This is called on every event.
128 * Just call the poller directly to log any events.
129 * This could in theory increase the threshold under high load,
130 * but doesn't for now.
131 */
132static void intel_threshold_interrupt(void)
133{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user();
136}
137
138static void print_update(char *type, int *hdr, int num)
139{
140 if (*hdr == 0)
141 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
142 *hdr = 1;
143 printk(KERN_CONT " %s:%d", type, num);
144}
145
146/*
147 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
148 * on this CPU. Use the algorithm recommended in the SDM to discover shared
149 * banks.
150 */
151static void cmci_discover(int banks, int boot)
152{
153 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
154 int hdr = 0;
155 int i;
156
157 spin_lock(&cmci_discover_lock);
158 for (i = 0; i < banks; i++) {
159 u64 val;
160
161 if (test_bit(i, owned))
162 continue;
163
164 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
165
166 /* Already owned by someone else? */
167 if (val & CMCI_EN) {
168 if (test_and_clear_bit(i, owned) || boot)
169 print_update("SHD", &hdr, i);
170 __clear_bit(i, __get_cpu_var(mce_poll_banks));
171 continue;
172 }
173
174 val |= CMCI_EN | CMCI_THRESHOLD;
175 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
176 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
177
178 /* Did the enable bit stick? -- the bank supports CMCI */
179 if (val & CMCI_EN) {
180 if (!test_and_set_bit(i, owned) || boot)
181 print_update("CMCI", &hdr, i);
182 __clear_bit(i, __get_cpu_var(mce_poll_banks));
183 } else {
184 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
185 }
186 }
187 spin_unlock(&cmci_discover_lock);
188 if (hdr)
189 printk(KERN_CONT "\n");
190}
191
192/*
193 * Just in case we missed an event during initialization check
194 * all the CMCI owned banks.
195 */
196void cmci_recheck(void)
197{
198 unsigned long flags;
199 int banks;
200
201 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
202 return;
203 local_irq_save(flags);
204 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
205 local_irq_restore(flags);
206}
207
208/*
209 * Disable CMCI on this CPU for all banks it owns when it goes down.
210 * This allows other CPUs to claim the banks on rediscovery.
211 */
212void cmci_clear(void)
213{
214 int i;
215 int banks;
216 u64 val;
217
218 if (!cmci_supported(&banks))
219 return;
220 spin_lock(&cmci_discover_lock);
221 for (i = 0; i < banks; i++) {
222 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
223 continue;
224 /* Disable CMCI */
225 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
226 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
227 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
228 __clear_bit(i, __get_cpu_var(mce_banks_owned));
229 }
230 spin_unlock(&cmci_discover_lock);
231}
232
233/*
234 * After a CPU went down cycle through all the others and rediscover
235 * Must run in process context.
236 */
237void cmci_rediscover(int dying)
238{
239 int banks;
240 int cpu;
241 cpumask_var_t old;
242
243 if (!cmci_supported(&banks))
244 return;
245 if (!alloc_cpumask_var(&old, GFP_KERNEL))
246 return;
247 cpumask_copy(old, &current->cpus_allowed);
248
249 for_each_online_cpu (cpu) {
250 if (cpu == dying)
251 continue;
252 if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
253 continue;
254 /* Recheck banks in case CPUs don't all have the same */
255 if (cmci_supported(&banks))
256 cmci_discover(banks, 0);
257 }
258
259 set_cpus_allowed_ptr(current, old);
260 free_cpumask_var(old);
261}
262
263/*
264 * Reenable CMCI on this CPU in case a CPU down failed.
265 */
266void cmci_reenable(void)
267{
268 int banks;
269 if (cmci_supported(&banks))
270 cmci_discover(banks, 0);
271}
272
273static void intel_init_cmci(void)
274{
275 int banks;
276
277 if (!cmci_supported(&banks))
278 return;
279
280 mce_threshold_vector = intel_threshold_interrupt;
281 cmci_discover(banks, 1);
282 /*
283 * For CPU #0 this runs with still disabled APIC, but that's
284 * ok because only the vector is set up. We still do another
285 * check for the banks later for CPU #0 just to make sure
286 * to not miss any events.
287 */
288 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
289 cmci_recheck();
290}
291
88void mce_intel_feature_init(struct cpuinfo_x86 *c) 292void mce_intel_feature_init(struct cpuinfo_x86 *c)
89{ 293{
90 intel_init_thermal(c); 294 intel_init_thermal(c);
295 intel_init_cmci();
91} 296}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..23ee9e730f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
1/*
2 * Common corrected MCE threshold handler code:
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6
7#include <asm/irq_vectors.h>
8#include <asm/apic.h>
9#include <asm/idle.h>
10#include <asm/mce.h>
11
12static void default_threshold_interrupt(void)
13{
14 printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
15 THRESHOLD_APIC_VECTOR);
16}
17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19
20asmlinkage void mce_threshold_interrupt(void)
21{
22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector();
26 irq_exit();
27 /* Ack only at the end to avoid potential reentry */
28 ack_APIC_irq();
29}
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc053364..f4361b56f8e 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 00000000000..ce0fe4b5c04
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
1/* MTRR (Memory Type Range Register) cleanup
2
3 Copyright (C) 2009 Yinghai Lu
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; either
8 version 2 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18*/
19
20#include <linux/module.h>
21#include <linux/init.h>
22#include <linux/pci.h>
23#include <linux/smp.h>
24#include <linux/cpu.h>
25#include <linux/mutex.h>
26#include <linux/sort.h>
27
28#include <asm/e820.h>
29#include <asm/mtrr.h>
30#include <asm/uaccess.h>
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/kvm_para.h>
34#include "mtrr.h"
35
36/* should be related to MTRR_VAR_RANGES nums */
37#define RANGE_NUM 256
38
39struct res_range {
40 unsigned long start;
41 unsigned long end;
42};
43
44static int __init
45add_range(struct res_range *range, int nr_range, unsigned long start,
46 unsigned long end)
47{
48 /* out of slots */
49 if (nr_range >= RANGE_NUM)
50 return nr_range;
51
52 range[nr_range].start = start;
53 range[nr_range].end = end;
54
55 nr_range++;
56
57 return nr_range;
58}
59
60static int __init
61add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
62 unsigned long end)
63{
64 int i;
65
66 /* try to merge it with old one */
67 for (i = 0; i < nr_range; i++) {
68 unsigned long final_start, final_end;
69 unsigned long common_start, common_end;
70
71 if (!range[i].end)
72 continue;
73
74 common_start = max(range[i].start, start);
75 common_end = min(range[i].end, end);
76 if (common_start > common_end + 1)
77 continue;
78
79 final_start = min(range[i].start, start);
80 final_end = max(range[i].end, end);
81
82 range[i].start = final_start;
83 range[i].end = final_end;
84 return nr_range;
85 }
86
87 /* need to add that */
88 return add_range(range, nr_range, start, end);
89}
90
91static void __init
92subtract_range(struct res_range *range, unsigned long start, unsigned long end)
93{
94 int i, j;
95
96 for (j = 0; j < RANGE_NUM; j++) {
97 if (!range[j].end)
98 continue;
99
100 if (start <= range[j].start && end >= range[j].end) {
101 range[j].start = 0;
102 range[j].end = 0;
103 continue;
104 }
105
106 if (start <= range[j].start && end < range[j].end &&
107 range[j].start < end + 1) {
108 range[j].start = end + 1;
109 continue;
110 }
111
112
113 if (start > range[j].start && end >= range[j].end &&
114 range[j].end > start - 1) {
115 range[j].end = start - 1;
116 continue;
117 }
118
119 if (start > range[j].start && end < range[j].end) {
120 /* find the new spare */
121 for (i = 0; i < RANGE_NUM; i++) {
122 if (range[i].end == 0)
123 break;
124 }
125 if (i < RANGE_NUM) {
126 range[i].end = range[j].end;
127 range[i].start = end + 1;
128 } else {
129 printk(KERN_ERR "run of slot in ranges\n");
130 }
131 range[j].end = start - 1;
132 continue;
133 }
134 }
135}
136
137static int __init cmp_range(const void *x1, const void *x2)
138{
139 const struct res_range *r1 = x1;
140 const struct res_range *r2 = x2;
141 long start1, start2;
142
143 start1 = r1->start;
144 start2 = r2->start;
145
146 return start1 - start2;
147}
148
149struct var_mtrr_range_state {
150 unsigned long base_pfn;
151 unsigned long size_pfn;
152 mtrr_type type;
153};
154
155static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
156static int __initdata debug_print;
157
158static int __init
159x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
160 unsigned long extra_remove_base,
161 unsigned long extra_remove_size)
162{
163 unsigned long base, size;
164 mtrr_type type;
165 int i;
166
167 for (i = 0; i < num_var_ranges; i++) {
168 type = range_state[i].type;
169 if (type != MTRR_TYPE_WRBACK)
170 continue;
171 base = range_state[i].base_pfn;
172 size = range_state[i].size_pfn;
173 nr_range = add_range_with_merge(range, nr_range, base,
174 base + size - 1);
175 }
176 if (debug_print) {
177 printk(KERN_DEBUG "After WB checking\n");
178 for (i = 0; i < nr_range; i++)
179 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
180 range[i].start, range[i].end + 1);
181 }
182
183 /* take out UC ranges */
184 for (i = 0; i < num_var_ranges; i++) {
185 type = range_state[i].type;
186 if (type != MTRR_TYPE_UNCACHABLE &&
187 type != MTRR_TYPE_WRPROT)
188 continue;
189 size = range_state[i].size_pfn;
190 if (!size)
191 continue;
192 base = range_state[i].base_pfn;
193 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
194 (mtrr_state.enabled & 1)) {
195 /* Var MTRR contains UC entry below 1M? Skip it: */
196 printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
197 "contains strange UC entry under 1M, check "
198 "with your system vendor!\n", i);
199 if (base + size <= (1<<(20-PAGE_SHIFT)))
200 continue;
201 size -= (1<<(20-PAGE_SHIFT)) - base;
202 base = 1<<(20-PAGE_SHIFT);
203 }
204 subtract_range(range, base, base + size - 1);
205 }
206 if (extra_remove_size)
207 subtract_range(range, extra_remove_base,
208 extra_remove_base + extra_remove_size - 1);
209
210 /* get new range num */
211 nr_range = 0;
212 for (i = 0; i < RANGE_NUM; i++) {
213 if (!range[i].end)
214 continue;
215 nr_range++;
216 }
217 if (debug_print) {
218 printk(KERN_DEBUG "After UC checking\n");
219 for (i = 0; i < nr_range; i++)
220 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
221 range[i].start, range[i].end + 1);
222 }
223
224 /* sort the ranges */
225 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
226 if (debug_print) {
227 printk(KERN_DEBUG "After sorting\n");
228 for (i = 0; i < nr_range; i++)
229 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
230 range[i].start, range[i].end + 1);
231 }
232
233 /* clear those is not used */
234 for (i = nr_range; i < RANGE_NUM; i++)
235 memset(&range[i], 0, sizeof(range[i]));
236
237 return nr_range;
238}
239
240static struct res_range __initdata range[RANGE_NUM];
241static int __initdata nr_range;
242
243#ifdef CONFIG_MTRR_SANITIZER
244
245static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
246{
247 unsigned long sum;
248 int i;
249
250 sum = 0;
251 for (i = 0; i < nr_range; i++)
252 sum += range[i].end + 1 - range[i].start;
253
254 return sum;
255}
256
257static int enable_mtrr_cleanup __initdata =
258 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
259
260static int __init disable_mtrr_cleanup_setup(char *str)
261{
262 enable_mtrr_cleanup = 0;
263 return 0;
264}
265early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
266
267static int __init enable_mtrr_cleanup_setup(char *str)
268{
269 enable_mtrr_cleanup = 1;
270 return 0;
271}
272early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
273
274static int __init mtrr_cleanup_debug_setup(char *str)
275{
276 debug_print = 1;
277 return 0;
278}
279early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
280
281struct var_mtrr_state {
282 unsigned long range_startk;
283 unsigned long range_sizek;
284 unsigned long chunk_sizek;
285 unsigned long gran_sizek;
286 unsigned int reg;
287};
288
289static void __init
290set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
291 unsigned char type, unsigned int address_bits)
292{
293 u32 base_lo, base_hi, mask_lo, mask_hi;
294 u64 base, mask;
295
296 if (!sizek) {
297 fill_mtrr_var_range(reg, 0, 0, 0, 0);
298 return;
299 }
300
301 mask = (1ULL << address_bits) - 1;
302 mask &= ~((((u64)sizek) << 10) - 1);
303
304 base = ((u64)basek) << 10;
305
306 base |= type;
307 mask |= 0x800;
308
309 base_lo = base & ((1ULL<<32) - 1);
310 base_hi = base >> 32;
311
312 mask_lo = mask & ((1ULL<<32) - 1);
313 mask_hi = mask >> 32;
314
315 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
316}
317
318static void __init
319save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
320 unsigned char type)
321{
322 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
323 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
324 range_state[reg].type = type;
325}
326
327static void __init
328set_var_mtrr_all(unsigned int address_bits)
329{
330 unsigned long basek, sizek;
331 unsigned char type;
332 unsigned int reg;
333
334 for (reg = 0; reg < num_var_ranges; reg++) {
335 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
336 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
337 type = range_state[reg].type;
338
339 set_var_mtrr(reg, basek, sizek, type, address_bits);
340 }
341}
342
343static unsigned long to_size_factor(unsigned long sizek, char *factorp)
344{
345 char factor;
346 unsigned long base = sizek;
347
348 if (base & ((1<<10) - 1)) {
349 /* not MB alignment */
350 factor = 'K';
351 } else if (base & ((1<<20) - 1)) {
352 factor = 'M';
353 base >>= 10;
354 } else {
355 factor = 'G';
356 base >>= 20;
357 }
358
359 *factorp = factor;
360
361 return base;
362}
363
364static unsigned int __init
365range_to_mtrr(unsigned int reg, unsigned long range_startk,
366 unsigned long range_sizek, unsigned char type)
367{
368 if (!range_sizek || (reg >= num_var_ranges))
369 return reg;
370
371 while (range_sizek) {
372 unsigned long max_align, align;
373 unsigned long sizek;
374
375 /* Compute the maximum size I can make a range */
376 if (range_startk)
377 max_align = ffs(range_startk) - 1;
378 else
379 max_align = 32;
380 align = fls(range_sizek) - 1;
381 if (align > max_align)
382 align = max_align;
383
384 sizek = 1 << align;
385 if (debug_print) {
386 char start_factor = 'K', size_factor = 'K';
387 unsigned long start_base, size_base;
388
389 start_base = to_size_factor(range_startk,
390 &start_factor),
391 size_base = to_size_factor(sizek, &size_factor),
392
393 printk(KERN_DEBUG "Setting variable MTRR %d, "
394 "base: %ld%cB, range: %ld%cB, type %s\n",
395 reg, start_base, start_factor,
396 size_base, size_factor,
397 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
398 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
399 );
400 }
401 save_var_mtrr(reg++, range_startk, sizek, type);
402 range_startk += sizek;
403 range_sizek -= sizek;
404 if (reg >= num_var_ranges)
405 break;
406 }
407 return reg;
408}
409
410static unsigned __init
411range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
412 unsigned long sizek)
413{
414 unsigned long hole_basek, hole_sizek;
415 unsigned long second_basek, second_sizek;
416 unsigned long range0_basek, range0_sizek;
417 unsigned long range_basek, range_sizek;
418 unsigned long chunk_sizek;
419 unsigned long gran_sizek;
420
421 hole_basek = 0;
422 hole_sizek = 0;
423 second_basek = 0;
424 second_sizek = 0;
425 chunk_sizek = state->chunk_sizek;
426 gran_sizek = state->gran_sizek;
427
428 /* align with gran size, prevent small block used up MTRRs */
429 range_basek = ALIGN(state->range_startk, gran_sizek);
430 if ((range_basek > basek) && basek)
431 return second_sizek;
432 state->range_sizek -= (range_basek - state->range_startk);
433 range_sizek = ALIGN(state->range_sizek, gran_sizek);
434
435 while (range_sizek > state->range_sizek) {
436 range_sizek -= gran_sizek;
437 if (!range_sizek)
438 return 0;
439 }
440 state->range_sizek = range_sizek;
441
442 /* try to append some small hole */
443 range0_basek = state->range_startk;
444 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
445
446 /* no increase */
447 if (range0_sizek == state->range_sizek) {
448 if (debug_print)
449 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
450 range0_basek<<10,
451 (range0_basek + state->range_sizek)<<10);
452 state->reg = range_to_mtrr(state->reg, range0_basek,
453 state->range_sizek, MTRR_TYPE_WRBACK);
454 return 0;
455 }
456
457 /* only cut back, when it is not the last */
458 if (sizek) {
459 while (range0_basek + range0_sizek > (basek + sizek)) {
460 if (range0_sizek >= chunk_sizek)
461 range0_sizek -= chunk_sizek;
462 else
463 range0_sizek = 0;
464
465 if (!range0_sizek)
466 break;
467 }
468 }
469
470second_try:
471 range_basek = range0_basek + range0_sizek;
472
473 /* one hole in the middle */
474 if (range_basek > basek && range_basek <= (basek + sizek))
475 second_sizek = range_basek - basek;
476
477 if (range0_sizek > state->range_sizek) {
478
479 /* one hole in middle or at end */
480 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
481
482 /* hole size should be less than half of range0 size */
483 if (hole_sizek >= (range0_sizek >> 1) &&
484 range0_sizek >= chunk_sizek) {
485 range0_sizek -= chunk_sizek;
486 second_sizek = 0;
487 hole_sizek = 0;
488
489 goto second_try;
490 }
491 }
492
493 if (range0_sizek) {
494 if (debug_print)
495 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
496 range0_basek<<10,
497 (range0_basek + range0_sizek)<<10);
498 state->reg = range_to_mtrr(state->reg, range0_basek,
499 range0_sizek, MTRR_TYPE_WRBACK);
500 }
501
502 if (range0_sizek < state->range_sizek) {
503 /* need to handle left over */
504 range_sizek = state->range_sizek - range0_sizek;
505
506 if (debug_print)
507 printk(KERN_DEBUG "range: %016lx - %016lx\n",
508 range_basek<<10,
509 (range_basek + range_sizek)<<10);
510 state->reg = range_to_mtrr(state->reg, range_basek,
511 range_sizek, MTRR_TYPE_WRBACK);
512 }
513
514 if (hole_sizek) {
515 hole_basek = range_basek - hole_sizek - second_sizek;
516 if (debug_print)
517 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
518 hole_basek<<10,
519 (hole_basek + hole_sizek)<<10);
520 state->reg = range_to_mtrr(state->reg, hole_basek,
521 hole_sizek, MTRR_TYPE_UNCACHABLE);
522 }
523
524 return second_sizek;
525}
526
527static void __init
528set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
529 unsigned long size_pfn)
530{
531 unsigned long basek, sizek;
532 unsigned long second_sizek = 0;
533
534 if (state->reg >= num_var_ranges)
535 return;
536
537 basek = base_pfn << (PAGE_SHIFT - 10);
538 sizek = size_pfn << (PAGE_SHIFT - 10);
539
540 /* See if I can merge with the last range */
541 if ((basek <= 1024) ||
542 (state->range_startk + state->range_sizek == basek)) {
543 unsigned long endk = basek + sizek;
544 state->range_sizek = endk - state->range_startk;
545 return;
546 }
547 /* Write the range mtrrs */
548 if (state->range_sizek != 0)
549 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
550
551 /* Allocate an msr */
552 state->range_startk = basek + second_sizek;
553 state->range_sizek = sizek - second_sizek;
554}
555
556/* mininum size of mtrr block that can take hole */
557static u64 mtrr_chunk_size __initdata = (256ULL<<20);
558
559static int __init parse_mtrr_chunk_size_opt(char *p)
560{
561 if (!p)
562 return -EINVAL;
563 mtrr_chunk_size = memparse(p, &p);
564 return 0;
565}
566early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
567
568/* granity of mtrr of block */
569static u64 mtrr_gran_size __initdata;
570
571static int __init parse_mtrr_gran_size_opt(char *p)
572{
573 if (!p)
574 return -EINVAL;
575 mtrr_gran_size = memparse(p, &p);
576 return 0;
577}
578early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
579
580static int nr_mtrr_spare_reg __initdata =
581 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
582
583static int __init parse_mtrr_spare_reg(char *arg)
584{
585 if (arg)
586 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
587 return 0;
588}
589
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591
592static int __init
593x86_setup_var_mtrrs(struct res_range *range, int nr_range,
594 u64 chunk_size, u64 gran_size)
595{
596 struct var_mtrr_state var_state;
597 int i;
598 int num_reg;
599
600 var_state.range_startk = 0;
601 var_state.range_sizek = 0;
602 var_state.reg = 0;
603 var_state.chunk_sizek = chunk_size >> 10;
604 var_state.gran_sizek = gran_size >> 10;
605
606 memset(range_state, 0, sizeof(range_state));
607
608 /* Write the range etc */
609 for (i = 0; i < nr_range; i++)
610 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1);
612
613 /* Write the last range */
614 if (var_state.range_sizek != 0)
615 range_to_mtrr_with_hole(&var_state, 0, 0);
616
617 num_reg = var_state.reg;
618 /* Clear out the extra MTRR's */
619 while (var_state.reg < num_var_ranges) {
620 save_var_mtrr(var_state.reg, 0, 0, 0);
621 var_state.reg++;
622 }
623
624 return num_reg;
625}
626
627struct mtrr_cleanup_result {
628 unsigned long gran_sizek;
629 unsigned long chunk_sizek;
630 unsigned long lose_cover_sizek;
631 unsigned int num_reg;
632 int bad;
633};
634
635/*
636 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
637 * chunk size: gran_size, ..., 2G
638 * so we need (1+16)*8
639 */
640#define NUM_RESULT 136
641#define PSHIFT (PAGE_SHIFT - 10)
642
643static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
644static unsigned long __initdata min_loss_pfn[RANGE_NUM];
645
646static void __init print_out_mtrr_range_state(void)
647{
648 int i;
649 char start_factor = 'K', size_factor = 'K';
650 unsigned long start_base, size_base;
651 mtrr_type type;
652
653 for (i = 0; i < num_var_ranges; i++) {
654
655 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
656 if (!size_base)
657 continue;
658
659 size_base = to_size_factor(size_base, &size_factor),
660 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
661 start_base = to_size_factor(start_base, &start_factor),
662 type = range_state[i].type;
663
664 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
665 i, start_base, start_factor,
666 size_base, size_factor,
667 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
668 ((type == MTRR_TYPE_WRPROT) ? "WP" :
669 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
670 );
671 }
672}
673
674static int __init mtrr_need_cleanup(void)
675{
676 int i;
677 mtrr_type type;
678 unsigned long size;
679 /* extra one for all 0 */
680 int num[MTRR_NUM_TYPES + 1];
681
682 /* check entries number */
683 memset(num, 0, sizeof(num));
684 for (i = 0; i < num_var_ranges; i++) {
685 type = range_state[i].type;
686 size = range_state[i].size_pfn;
687 if (type >= MTRR_NUM_TYPES)
688 continue;
689 if (!size)
690 type = MTRR_NUM_TYPES;
691 if (type == MTRR_TYPE_WRPROT)
692 type = MTRR_TYPE_UNCACHABLE;
693 num[type]++;
694 }
695
696 /* check if we got UC entries */
697 if (!num[MTRR_TYPE_UNCACHABLE])
698 return 0;
699
700 /* check if we only had WB and UC */
701 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
702 num_var_ranges - num[MTRR_NUM_TYPES])
703 return 0;
704
705 return 1;
706}
707
708static unsigned long __initdata range_sums;
709static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
710 unsigned long extra_remove_base,
711 unsigned long extra_remove_size,
712 int i)
713{
714 int num_reg;
715 static struct res_range range_new[RANGE_NUM];
716 static int nr_range_new;
717 unsigned long range_sums_new;
718
719 /* convert ranges to var ranges state */
720 num_reg = x86_setup_var_mtrrs(range, nr_range,
721 chunk_size, gran_size);
722
723 /* we got new setting in range_state, check it */
724 memset(range_new, 0, sizeof(range_new));
725 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
726 extra_remove_base, extra_remove_size);
727 range_sums_new = sum_ranges(range_new, nr_range_new);
728
729 result[i].chunk_sizek = chunk_size >> 10;
730 result[i].gran_sizek = gran_size >> 10;
731 result[i].num_reg = num_reg;
732 if (range_sums < range_sums_new) {
733 result[i].lose_cover_sizek =
734 (range_sums_new - range_sums) << PSHIFT;
735 result[i].bad = 1;
736 } else
737 result[i].lose_cover_sizek =
738 (range_sums - range_sums_new) << PSHIFT;
739
740 /* double check it */
741 if (!result[i].bad && !result[i].lose_cover_sizek) {
742 if (nr_range_new != nr_range ||
743 memcmp(range, range_new, sizeof(range)))
744 result[i].bad = 1;
745 }
746
747 if (!result[i].bad && (range_sums - range_sums_new <
748 min_loss_pfn[num_reg])) {
749 min_loss_pfn[num_reg] =
750 range_sums - range_sums_new;
751 }
752}
753
754static void __init mtrr_print_out_one_result(int i)
755{
756 char gran_factor, chunk_factor, lose_factor;
757 unsigned long gran_base, chunk_base, lose_base;
758
759 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
760 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
761 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
762 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
763 result[i].bad ? "*BAD*" : " ",
764 gran_base, gran_factor, chunk_base, chunk_factor);
765 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
766 result[i].num_reg, result[i].bad ? "-" : "",
767 lose_base, lose_factor);
768}
769
770static int __init mtrr_search_optimal_index(void)
771{
772 int i;
773 int num_reg_good;
774 int index_good;
775
776 if (nr_mtrr_spare_reg >= num_var_ranges)
777 nr_mtrr_spare_reg = num_var_ranges - 1;
778 num_reg_good = -1;
779 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
780 if (!min_loss_pfn[i])
781 num_reg_good = i;
782 }
783
784 index_good = -1;
785 if (num_reg_good != -1) {
786 for (i = 0; i < NUM_RESULT; i++) {
787 if (!result[i].bad &&
788 result[i].num_reg == num_reg_good &&
789 !result[i].lose_cover_sizek) {
790 index_good = i;
791 break;
792 }
793 }
794 }
795
796 return index_good;
797}
798
799
800int __init mtrr_cleanup(unsigned address_bits)
801{
802 unsigned long extra_remove_base, extra_remove_size;
803 unsigned long base, size, def, dummy;
804 mtrr_type type;
805 u64 chunk_size, gran_size;
806 int index_good;
807 int i;
808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy);
812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0;
815
816 /* get it and store it aside */
817 memset(range_state, 0, sizeof(range_state));
818 for (i = 0; i < num_var_ranges; i++) {
819 mtrr_if->get(i, &base, &size, &type);
820 range_state[i].base_pfn = base;
821 range_state[i].size_pfn = size;
822 range_state[i].type = type;
823 }
824
825 /* check if we need handle it and can handle it */
826 if (!mtrr_need_cleanup())
827 return 0;
828
829 /* print original var MTRRs at first, for debugging: */
830 printk(KERN_DEBUG "original variable MTRRs\n");
831 print_out_mtrr_range_state();
832
833 memset(range, 0, sizeof(range));
834 extra_remove_size = 0;
835 extra_remove_base = 1 << (32 - PAGE_SHIFT);
836 if (mtrr_tom2)
837 extra_remove_size =
838 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
839 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
840 extra_remove_size);
841 /*
842 * [0, 1M) should always be coverred by var mtrr with WB
843 * and fixed mtrrs should take effective before var mtrr for it
844 */
845 nr_range = add_range_with_merge(range, nr_range, 0,
846 (1ULL<<(20 - PAGE_SHIFT)) - 1);
847 /* sort the ranges */
848 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
849
850 range_sums = sum_ranges(range, nr_range);
851 printk(KERN_INFO "total RAM coverred: %ldM\n",
852 range_sums >> (20 - PAGE_SHIFT));
853
854 if (mtrr_chunk_size && mtrr_gran_size) {
855 i = 0;
856 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
857 extra_remove_base, extra_remove_size, i);
858
859 mtrr_print_out_one_result(i);
860
861 if (!result[i].bad) {
862 set_var_mtrr_all(address_bits);
863 printk(KERN_DEBUG "New variable MTRRs\n");
864 print_out_mtrr_range_state();
865 return 1;
866 }
867 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
868 "will find optimal one\n");
869 }
870
871 i = 0;
872 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
873 memset(result, 0, sizeof(result));
874 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
875
876 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
877 chunk_size <<= 1) {
878
879 if (i >= NUM_RESULT)
880 continue;
881
882 mtrr_calc_range_state(chunk_size, gran_size,
883 extra_remove_base, extra_remove_size, i);
884 if (debug_print) {
885 mtrr_print_out_one_result(i);
886 printk(KERN_INFO "\n");
887 }
888
889 i++;
890 }
891 }
892
893 /* try to find the optimal index */
894 index_good = mtrr_search_optimal_index();
895
896 if (index_good != -1) {
897 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
898 i = index_good;
899 mtrr_print_out_one_result(i);
900
901 /* convert ranges to var ranges state */
902 chunk_size = result[i].chunk_sizek;
903 chunk_size <<= 10;
904 gran_size = result[i].gran_sizek;
905 gran_size <<= 10;
906 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
907 set_var_mtrr_all(address_bits);
908 printk(KERN_DEBUG "New variable MTRRs\n");
909 print_out_mtrr_range_state();
910 return 1;
911 } else {
912 /* print out all */
913 for (i = 0; i < NUM_RESULT; i++)
914 mtrr_print_out_one_result(i);
915 }
916
917 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
918 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
919
920 return 0;
921}
922#else
923int __init mtrr_cleanup(unsigned address_bits)
924{
925 return 0;
926}
927#endif
928
929static int disable_mtrr_trim;
930
931static int __init disable_mtrr_trim_setup(char *str)
932{
933 disable_mtrr_trim = 1;
934 return 0;
935}
936early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
937
938/*
939 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
940 * for memory >4GB. Check for that here.
941 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
942 * apply to are wrong, but so far we don't know of any such case in the wild.
943 */
944#define Tom2Enabled (1U << 21)
945#define Tom2ForceMemTypeWB (1U << 22)
946
947int __init amd_special_default_mtrr(void)
948{
949 u32 l, h;
950
951 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
952 return 0;
953 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
954 return 0;
955 /* In case some hypervisor doesn't pass SYSCFG through */
956 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
957 return 0;
958 /*
959 * Memory between 4GB and top of mem is forced WB by this magic bit.
960 * Reserved before K8RevF, but should be zero there.
961 */
962 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
963 (Tom2Enabled | Tom2ForceMemTypeWB))
964 return 1;
965 return 0;
966}
967
968static u64 __init real_trim_memory(unsigned long start_pfn,
969 unsigned long limit_pfn)
970{
971 u64 trim_start, trim_size;
972 trim_start = start_pfn;
973 trim_start <<= PAGE_SHIFT;
974 trim_size = limit_pfn;
975 trim_size <<= PAGE_SHIFT;
976 trim_size -= trim_start;
977
978 return e820_update_range(trim_start, trim_size, E820_RAM,
979 E820_RESERVED);
980}
981/**
982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
983 * @end_pfn: ending page frame number
984 *
985 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
986 * memory configurations. This routine checks that the highest MTRR matches
987 * the end of memory, to make sure the MTRRs having a write back type cover
988 * all of the memory the kernel is intending to use. If not, it'll trim any
989 * memory off the end by adjusting end_pfn, removing it from the kernel's
990 * allocation pools, warning the user with an obnoxious message.
991 */
992int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
993{
994 unsigned long i, base, size, highest_pfn = 0, def, dummy;
995 mtrr_type type;
996 u64 total_trim_size;
997
998 /* extra one for all 0 */
999 int num[MTRR_NUM_TYPES + 1];
1000 /*
1001 * Make sure we only trim uncachable memory on machines that
1002 * support the Intel MTRR architecture:
1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy);
1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0;
1010
1011 /* get it and store it aside */
1012 memset(range_state, 0, sizeof(range_state));
1013 for (i = 0; i < num_var_ranges; i++) {
1014 mtrr_if->get(i, &base, &size, &type);
1015 range_state[i].base_pfn = base;
1016 range_state[i].size_pfn = size;
1017 range_state[i].type = type;
1018 }
1019
1020 /* Find highest cached pfn */
1021 for (i = 0; i < num_var_ranges; i++) {
1022 type = range_state[i].type;
1023 if (type != MTRR_TYPE_WRBACK)
1024 continue;
1025 base = range_state[i].base_pfn;
1026 size = range_state[i].size_pfn;
1027 if (highest_pfn < base + size)
1028 highest_pfn = base + size;
1029 }
1030
1031 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1032 if (!highest_pfn) {
1033 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1034 return 0;
1035 }
1036
1037 /* check entries number */
1038 memset(num, 0, sizeof(num));
1039 for (i = 0; i < num_var_ranges; i++) {
1040 type = range_state[i].type;
1041 if (type >= MTRR_NUM_TYPES)
1042 continue;
1043 size = range_state[i].size_pfn;
1044 if (!size)
1045 type = MTRR_NUM_TYPES;
1046 num[type]++;
1047 }
1048
1049 /* no entry for WB? */
1050 if (!num[MTRR_TYPE_WRBACK])
1051 return 0;
1052
1053 /* check if we only had WB and UC */
1054 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1055 num_var_ranges - num[MTRR_NUM_TYPES])
1056 return 0;
1057
1058 memset(range, 0, sizeof(range));
1059 nr_range = 0;
1060 if (mtrr_tom2) {
1061 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1062 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1063 if (highest_pfn < range[nr_range].end + 1)
1064 highest_pfn = range[nr_range].end + 1;
1065 nr_range++;
1066 }
1067 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1068
1069 total_trim_size = 0;
1070 /* check the head */
1071 if (range[0].start)
1072 total_trim_size += real_trim_memory(0, range[0].start);
1073 /* check the holes */
1074 for (i = 0; i < nr_range - 1; i++) {
1075 if (range[i].end + 1 < range[i+1].start)
1076 total_trim_size += real_trim_memory(range[i].end + 1,
1077 range[i+1].start);
1078 }
1079 /* check the top */
1080 i = nr_range - 1;
1081 if (range[i].end + 1 < end_pfn)
1082 total_trim_size += real_trim_memory(range[i].end + 1,
1083 end_pfn);
1084
1085 if (total_trim_size) {
1086 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1087 " all of memory, losing %lluMB of RAM.\n",
1088 total_trim_size >> 20);
1089
1090 if (!changed_by_mtrr_cleanup)
1091 WARN_ON(1);
1092
1093 printk(KERN_INFO "update e820 for mtrr\n");
1094 update_e820();
1095
1096 return 1;
1097 }
1098
1099 return 0;
1100}
1101
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95..37f28fc7cf9 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
33struct mtrr_state_type mtrr_state = {}; 33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state); 34EXPORT_SYMBOL_GPL(mtrr_state);
35 35
36static int __initdata mtrr_show; 36/**
37static int __init mtrr_debug(char *opt) 37 * BIOS is expected to clear MtrrFixDramModEn bit, see for example
38 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
39 * Opteron Processors" (26094 Rev. 3.30 February 2006), section
40 * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
41 * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
42 * 0 for operation."
43 */
44static inline void k8_check_syscfg_dram_mod_en(void)
38{ 45{
39 mtrr_show = 1; 46 u32 lo, hi;
40 return 0; 47
48 if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
49 (boot_cpu_data.x86 >= 0x0f)))
50 return;
51
52 rdmsr(MSR_K8_SYSCFG, lo, hi);
53 if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
54 printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
55 " not cleared by BIOS, clearing this bit\n",
56 smp_processor_id());
57 lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
58 mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
59 }
41} 60}
42early_param("mtrr.show", mtrr_debug);
43 61
44/* 62/*
45 * Returns the effective MTRR type for the region 63 * Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
174 unsigned int *p = (unsigned int *) frs; 192 unsigned int *p = (unsigned int *) frs;
175 int i; 193 int i;
176 194
195 k8_check_syscfg_dram_mod_en();
196
177 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
178 198
179 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
188 get_fixed_ranges(mtrr_state.fixed_ranges); 208 get_fixed_ranges(mtrr_state.fixed_ranges);
189} 209}
190 210
191static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) 211static unsigned __initdata last_fixed_start;
212static unsigned __initdata last_fixed_end;
213static mtrr_type __initdata last_fixed_type;
214
215static void __init print_fixed_last(void)
216{
217 if (!last_fixed_end)
218 return;
219
220 printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
221 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
222
223 last_fixed_end = 0;
224}
225
226static void __init update_fixed_last(unsigned base, unsigned end,
227 mtrr_type type)
228{
229 last_fixed_start = base;
230 last_fixed_end = end;
231 last_fixed_type = type;
232}
233
234static void __init print_fixed(unsigned base, unsigned step,
235 const mtrr_type *types)
192{ 236{
193 unsigned i; 237 unsigned i;
194 238
195 for (i = 0; i < 8; ++i, ++types, base += step) 239 for (i = 0; i < 8; ++i, ++types, base += step) {
196 printk(KERN_INFO "MTRR %05X-%05X %s\n", 240 if (last_fixed_end == 0) {
197 base, base + step - 1, mtrr_attrib_to_str(*types)); 241 update_fixed_last(base, base + step, *types);
242 continue;
243 }
244 if (last_fixed_end == base && last_fixed_type == *types) {
245 last_fixed_end = base + step;
246 continue;
247 }
248 /* new segments: gap or different type */
249 print_fixed_last();
250 update_fixed_last(base, base + step, *types);
251 }
198} 252}
199 253
200static void prepare_set(void); 254static void prepare_set(void);
201static void post_set(void); 255static void post_set(void);
202 256
257static void __init print_mtrr_state(void)
258{
259 unsigned int i;
260 int high_width;
261
262 printk(KERN_DEBUG "MTRR default type: %s\n",
263 mtrr_attrib_to_str(mtrr_state.def_type));
264 if (mtrr_state.have_fixed) {
265 printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
266 mtrr_state.enabled & 1 ? "en" : "dis");
267 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
268 for (i = 0; i < 2; ++i)
269 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
270 for (i = 0; i < 8; ++i)
271 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
272
273 /* tail */
274 print_fixed_last();
275 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis");
278 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
279 for (i = 0; i < num_var_ranges; ++i) {
280 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
281 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
282 i,
283 high_width,
284 mtrr_state.var_ranges[i].base_hi,
285 mtrr_state.var_ranges[i].base_lo >> 12,
286 high_width,
287 mtrr_state.var_ranges[i].mask_hi,
288 mtrr_state.var_ranges[i].mask_lo >> 12,
289 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
290 else
291 printk(KERN_DEBUG " %u disabled\n", i);
292 }
293 if (mtrr_tom2) {
294 printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
295 mtrr_tom2, mtrr_tom2>>20);
296 }
297}
298
203/* Grab all of the MTRR state for this CPU into *state */ 299/* Grab all of the MTRR state for this CPU into *state */
204void __init get_mtrr_state(void) 300void __init get_mtrr_state(void)
205{ 301{
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
231 mtrr_tom2 |= low; 327 mtrr_tom2 |= low;
232 mtrr_tom2 &= 0xffffff800000ULL; 328 mtrr_tom2 &= 0xffffff800000ULL;
233 } 329 }
234 if (mtrr_show) { 330
235 int high_width; 331 print_mtrr_state();
236 332
237 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
238 if (mtrr_state.have_fixed) {
239 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
240 mtrr_state.enabled & 1 ? "en" : "dis");
241 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
242 for (i = 0; i < 2; ++i)
243 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
244 for (i = 0; i < 8; ++i)
245 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
246 }
247 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
248 mtrr_state.enabled & 2 ? "en" : "dis");
249 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
250 for (i = 0; i < num_var_ranges; ++i) {
251 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
252 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
253 i,
254 high_width,
255 mtrr_state.var_ranges[i].base_hi,
256 mtrr_state.var_ranges[i].base_lo >> 12,
257 high_width,
258 mtrr_state.var_ranges[i].mask_hi,
259 mtrr_state.var_ranges[i].mask_lo >> 12,
260 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
261 else
262 printk(KERN_INFO "MTRR %u disabled\n", i);
263 }
264 if (mtrr_tom2) {
265 printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
266 mtrr_tom2, mtrr_tom2>>20);
267 }
268 }
269 mtrr_state_set = 1; 333 mtrr_state_set = 1;
270 334
271 /* PAT setup for BP. We need to go through sync steps here */ 335 /* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
308} 372}
309 373
310/** 374/**
311 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
312 * see AMD publication no. 24593, chapter 3.2.1 for more information
313 */
314static inline void k8_enable_fixed_iorrs(void)
315{
316 unsigned lo, hi;
317
318 rdmsr(MSR_K8_SYSCFG, lo, hi);
319 mtrr_wrmsr(MSR_K8_SYSCFG, lo
320 | K8_MTRRFIXRANGE_DRAM_ENABLE
321 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
322}
323
324/**
325 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have 375 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
326 * @msr: MSR address of the MTTR which should be checked and updated 376 * @msr: MSR address of the MTTR which should be checked and updated
327 * @changed: pointer which indicates whether the MTRR needed to be changed 377 * @changed: pointer which indicates whether the MTRR needed to be changed
328 * @msrwords: pointer to the MSR values which the MSR should have 378 * @msrwords: pointer to the MSR values which the MSR should have
329 *
330 * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
331 * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
332 */ 379 */
333static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) 380static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
334{ 381{
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
337 rdmsr(msr, lo, hi); 384 rdmsr(msr, lo, hi);
338 385
339 if (lo != msrwords[0] || hi != msrwords[1]) { 386 if (lo != msrwords[0] || hi != msrwords[1]) {
340 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
341 (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
342 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
343 k8_enable_fixed_iorrs();
344 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 387 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
345 *changed = true; 388 *changed = true;
346 } 389 }
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
376{ 419{
377 unsigned int mask_lo, mask_hi, base_lo, base_hi; 420 unsigned int mask_lo, mask_hi, base_lo, base_hi;
378 unsigned int tmp, hi; 421 unsigned int tmp, hi;
422 int cpu;
423
424 /*
425 * get_mtrr doesn't need to update mtrr_state, also it could be called
426 * from any cpu, so try to print it out directly.
427 */
428 cpu = get_cpu();
379 429
380 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 430 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
431
381 if ((mask_lo & 0x800) == 0) { 432 if ((mask_lo & 0x800) == 0) {
382 /* Invalid (i.e. free) range */ 433 /* Invalid (i.e. free) range */
383 *base = 0; 434 *base = 0;
384 *size = 0; 435 *size = 0;
385 *type = 0; 436 *type = 0;
386 return; 437 goto out_put_cpu;
387 } 438 }
388 439
389 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 440 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
390 441
391 /* Work out the shifted address mask. */ 442 /* Work out the shifted address mask: */
392 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; 443 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
393 mask_lo = size_or_mask | tmp; 444 mask_lo = size_or_mask | tmp;
394 /* Expand tmp with high bits to all 1s*/ 445
446 /* Expand tmp with high bits to all 1s: */
395 hi = fls(tmp); 447 hi = fls(tmp);
396 if (hi > 0) { 448 if (hi > 0) {
397 tmp |= ~((1<<(hi - 1)) - 1); 449 tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
402 } 454 }
403 } 455 }
404 456
405 /* This works correctly if size is a power of two, i.e. a 457 /*
406 contiguous range. */ 458 * This works correctly if size is a power of two, i.e. a
459 * contiguous range:
460 */
407 *size = -mask_lo; 461 *size = -mask_lo;
408 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; 462 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
409 *type = base_lo & 0xff; 463 *type = base_lo & 0xff;
464
465 printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
466 cpu, reg, *base, *size,
467 mtrr_attrib_to_str(*type & 0xff));
468out_put_cpu:
469 put_cpu();
410} 470}
411 471
412/** 472/**
@@ -419,6 +479,8 @@ static int set_fixed_ranges(mtrr_type * frs)
419 bool changed = false; 479 bool changed = false;
420 int block=-1, range; 480 int block=-1, range;
421 481
482 k8_check_syscfg_dram_mod_en();
483
422 while (fixed_range_blocks[++block].ranges) 484 while (fixed_range_blocks[++block].ranges)
423 for (range=0; range < fixed_range_blocks[block].ranges; range++) 485 for (range=0; range < fixed_range_blocks[block].ranges; range++)
424 set_fixed_range(fixed_range_blocks[block].base_msr + range, 486 set_fixed_range(fixed_range_blocks[block].base_msr + range,
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b825..03cda01f57c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
582 582
583 for (i = 0; i < num_var_ranges; i++) { 583 for (i = 0; i < num_var_ranges; i++) {
584 mtrr_if->get(i, 584 mtrr_if->get(i,
585 &mtrr_state[i].lbase, 585 &mtrr_value[i].lbase,
586 &mtrr_state[i].lsize, 586 &mtrr_value[i].lsize,
587 &mtrr_state[i].ltype); 587 &mtrr_value[i].ltype);
588 } 588 }
589 return 0; 589 return 0;
590} 590}
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
594 int i; 594 int i;
595 595
596 for (i = 0; i < num_var_ranges; i++) { 596 for (i = 0; i < num_var_ranges; i++) {
597 if (mtrr_state[i].lsize) 597 if (mtrr_value[i].lsize)
598 set_mtrr(i, 598 set_mtrr(i,
599 mtrr_state[i].lbase, 599 mtrr_value[i].lbase,
600 mtrr_state[i].lsize, 600 mtrr_value[i].lsize,
601 mtrr_state[i].ltype); 601 mtrr_value[i].ltype);
602 } 602 }
603 return 0; 603 return 0;
604} 604}
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
610 .resume = mtrr_restore, 610 .resume = mtrr_restore,
611}; 611};
612 612
613/* should be related to MTRR_VAR_RANGES nums */ 613int __initdata changed_by_mtrr_cleanup;
614#define RANGE_NUM 256
615
616struct res_range {
617 unsigned long start;
618 unsigned long end;
619};
620
621static int __init
622add_range(struct res_range *range, int nr_range, unsigned long start,
623 unsigned long end)
624{
625 /* out of slots */
626 if (nr_range >= RANGE_NUM)
627 return nr_range;
628
629 range[nr_range].start = start;
630 range[nr_range].end = end;
631
632 nr_range++;
633
634 return nr_range;
635}
636
637static int __init
638add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
639 unsigned long end)
640{
641 int i;
642
643 /* try to merge it with old one */
644 for (i = 0; i < nr_range; i++) {
645 unsigned long final_start, final_end;
646 unsigned long common_start, common_end;
647
648 if (!range[i].end)
649 continue;
650
651 common_start = max(range[i].start, start);
652 common_end = min(range[i].end, end);
653 if (common_start > common_end + 1)
654 continue;
655
656 final_start = min(range[i].start, start);
657 final_end = max(range[i].end, end);
658
659 range[i].start = final_start;
660 range[i].end = final_end;
661 return nr_range;
662 }
663
664 /* need to add that */
665 return add_range(range, nr_range, start, end);
666}
667
668static void __init
669subtract_range(struct res_range *range, unsigned long start, unsigned long end)
670{
671 int i, j;
672
673 for (j = 0; j < RANGE_NUM; j++) {
674 if (!range[j].end)
675 continue;
676
677 if (start <= range[j].start && end >= range[j].end) {
678 range[j].start = 0;
679 range[j].end = 0;
680 continue;
681 }
682
683 if (start <= range[j].start && end < range[j].end &&
684 range[j].start < end + 1) {
685 range[j].start = end + 1;
686 continue;
687 }
688
689
690 if (start > range[j].start && end >= range[j].end &&
691 range[j].end > start - 1) {
692 range[j].end = start - 1;
693 continue;
694 }
695
696 if (start > range[j].start && end < range[j].end) {
697 /* find the new spare */
698 for (i = 0; i < RANGE_NUM; i++) {
699 if (range[i].end == 0)
700 break;
701 }
702 if (i < RANGE_NUM) {
703 range[i].end = range[j].end;
704 range[i].start = end + 1;
705 } else {
706 printk(KERN_ERR "run of slot in ranges\n");
707 }
708 range[j].end = start - 1;
709 continue;
710 }
711 }
712}
713
714static int __init cmp_range(const void *x1, const void *x2)
715{
716 const struct res_range *r1 = x1;
717 const struct res_range *r2 = x2;
718 long start1, start2;
719
720 start1 = r1->start;
721 start2 = r2->start;
722
723 return start1 - start2;
724}
725
726struct var_mtrr_range_state {
727 unsigned long base_pfn;
728 unsigned long size_pfn;
729 mtrr_type type;
730};
731
732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print;
734
735static int __init
736x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
737 unsigned long extra_remove_base,
738 unsigned long extra_remove_size)
739{
740 unsigned long i, base, size;
741 mtrr_type type;
742
743 for (i = 0; i < num_var_ranges; i++) {
744 type = range_state[i].type;
745 if (type != MTRR_TYPE_WRBACK)
746 continue;
747 base = range_state[i].base_pfn;
748 size = range_state[i].size_pfn;
749 nr_range = add_range_with_merge(range, nr_range, base,
750 base + size - 1);
751 }
752 if (debug_print) {
753 printk(KERN_DEBUG "After WB checking\n");
754 for (i = 0; i < nr_range; i++)
755 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
756 range[i].start, range[i].end + 1);
757 }
758
759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
764 continue;
765 size = range_state[i].size_pfn;
766 if (!size)
767 continue;
768 base = range_state[i].base_pfn;
769 subtract_range(range, base, base + size - 1);
770 }
771 if (extra_remove_size)
772 subtract_range(range, extra_remove_base,
773 extra_remove_base + extra_remove_size - 1);
774
775 /* get new range num */
776 nr_range = 0;
777 for (i = 0; i < RANGE_NUM; i++) {
778 if (!range[i].end)
779 continue;
780 nr_range++;
781 }
782 if (debug_print) {
783 printk(KERN_DEBUG "After UC checking\n");
784 for (i = 0; i < nr_range; i++)
785 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
786 range[i].start, range[i].end + 1);
787 }
788
789 /* sort the ranges */
790 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
791 if (debug_print) {
792 printk(KERN_DEBUG "After sorting\n");
793 for (i = 0; i < nr_range; i++)
794 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
795 range[i].start, range[i].end + 1);
796 }
797
798 /* clear those is not used */
799 for (i = nr_range; i < RANGE_NUM; i++)
800 memset(&range[i], 0, sizeof(range[i]));
801
802 return nr_range;
803}
804
805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
807
808#ifdef CONFIG_MTRR_SANITIZER
809
810static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
811{
812 unsigned long sum;
813 int i;
814
815 sum = 0;
816 for (i = 0; i < nr_range; i++)
817 sum += range[i].end + 1 - range[i].start;
818
819 return sum;
820}
821
822static int enable_mtrr_cleanup __initdata =
823 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
824
825static int __init disable_mtrr_cleanup_setup(char *str)
826{
827 enable_mtrr_cleanup = 0;
828 return 0;
829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831
832static int __init enable_mtrr_cleanup_setup(char *str)
833{
834 enable_mtrr_cleanup = 1;
835 return 0;
836}
837early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
838
839static int __init mtrr_cleanup_debug_setup(char *str)
840{
841 debug_print = 1;
842 return 0;
843}
844early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
845
846struct var_mtrr_state {
847 unsigned long range_startk;
848 unsigned long range_sizek;
849 unsigned long chunk_sizek;
850 unsigned long gran_sizek;
851 unsigned int reg;
852};
853
854static void __init
855set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
856 unsigned char type, unsigned int address_bits)
857{
858 u32 base_lo, base_hi, mask_lo, mask_hi;
859 u64 base, mask;
860
861 if (!sizek) {
862 fill_mtrr_var_range(reg, 0, 0, 0, 0);
863 return;
864 }
865
866 mask = (1ULL << address_bits) - 1;
867 mask &= ~((((u64)sizek) << 10) - 1);
868
869 base = ((u64)basek) << 10;
870
871 base |= type;
872 mask |= 0x800;
873
874 base_lo = base & ((1ULL<<32) - 1);
875 base_hi = base >> 32;
876
877 mask_lo = mask & ((1ULL<<32) - 1);
878 mask_hi = mask >> 32;
879
880 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
881}
882
883static void __init
884save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
885 unsigned char type)
886{
887 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
888 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
889 range_state[reg].type = type;
890}
891
892static void __init
893set_var_mtrr_all(unsigned int address_bits)
894{
895 unsigned long basek, sizek;
896 unsigned char type;
897 unsigned int reg;
898
899 for (reg = 0; reg < num_var_ranges; reg++) {
900 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
901 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
902 type = range_state[reg].type;
903
904 set_var_mtrr(reg, basek, sizek, type, address_bits);
905 }
906}
907
908static unsigned long to_size_factor(unsigned long sizek, char *factorp)
909{
910 char factor;
911 unsigned long base = sizek;
912
913 if (base & ((1<<10) - 1)) {
914 /* not MB alignment */
915 factor = 'K';
916 } else if (base & ((1<<20) - 1)){
917 factor = 'M';
918 base >>= 10;
919 } else {
920 factor = 'G';
921 base >>= 20;
922 }
923
924 *factorp = factor;
925
926 return base;
927}
928
929static unsigned int __init
930range_to_mtrr(unsigned int reg, unsigned long range_startk,
931 unsigned long range_sizek, unsigned char type)
932{
933 if (!range_sizek || (reg >= num_var_ranges))
934 return reg;
935
936 while (range_sizek) {
937 unsigned long max_align, align;
938 unsigned long sizek;
939
940 /* Compute the maximum size I can make a range */
941 if (range_startk)
942 max_align = ffs(range_startk) - 1;
943 else
944 max_align = 32;
945 align = fls(range_sizek) - 1;
946 if (align > max_align)
947 align = max_align;
948
949 sizek = 1 << align;
950 if (debug_print) {
951 char start_factor = 'K', size_factor = 'K';
952 unsigned long start_base, size_base;
953
954 start_base = to_size_factor(range_startk, &start_factor),
955 size_base = to_size_factor(sizek, &size_factor),
956
957 printk(KERN_DEBUG "Setting variable MTRR %d, "
958 "base: %ld%cB, range: %ld%cB, type %s\n",
959 reg, start_base, start_factor,
960 size_base, size_factor,
961 (type == MTRR_TYPE_UNCACHABLE)?"UC":
962 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
963 );
964 }
965 save_var_mtrr(reg++, range_startk, sizek, type);
966 range_startk += sizek;
967 range_sizek -= sizek;
968 if (reg >= num_var_ranges)
969 break;
970 }
971 return reg;
972}
973
974static unsigned __init
975range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
976 unsigned long sizek)
977{
978 unsigned long hole_basek, hole_sizek;
979 unsigned long second_basek, second_sizek;
980 unsigned long range0_basek, range0_sizek;
981 unsigned long range_basek, range_sizek;
982 unsigned long chunk_sizek;
983 unsigned long gran_sizek;
984
985 hole_basek = 0;
986 hole_sizek = 0;
987 second_basek = 0;
988 second_sizek = 0;
989 chunk_sizek = state->chunk_sizek;
990 gran_sizek = state->gran_sizek;
991
992 /* align with gran size, prevent small block used up MTRRs */
993 range_basek = ALIGN(state->range_startk, gran_sizek);
994 if ((range_basek > basek) && basek)
995 return second_sizek;
996 state->range_sizek -= (range_basek - state->range_startk);
997 range_sizek = ALIGN(state->range_sizek, gran_sizek);
998
999 while (range_sizek > state->range_sizek) {
1000 range_sizek -= gran_sizek;
1001 if (!range_sizek)
1002 return 0;
1003 }
1004 state->range_sizek = range_sizek;
1005
1006 /* try to append some small hole */
1007 range0_basek = state->range_startk;
1008 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1009
1010 /* no increase */
1011 if (range0_sizek == state->range_sizek) {
1012 if (debug_print)
1013 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
1014 range0_basek<<10,
1015 (range0_basek + state->range_sizek)<<10);
1016 state->reg = range_to_mtrr(state->reg, range0_basek,
1017 state->range_sizek, MTRR_TYPE_WRBACK);
1018 return 0;
1019 }
1020
1021 /* only cut back, when it is not the last */
1022 if (sizek) {
1023 while (range0_basek + range0_sizek > (basek + sizek)) {
1024 if (range0_sizek >= chunk_sizek)
1025 range0_sizek -= chunk_sizek;
1026 else
1027 range0_sizek = 0;
1028
1029 if (!range0_sizek)
1030 break;
1031 }
1032 }
1033
1034second_try:
1035 range_basek = range0_basek + range0_sizek;
1036
1037 /* one hole in the middle */
1038 if (range_basek > basek && range_basek <= (basek + sizek))
1039 second_sizek = range_basek - basek;
1040
1041 if (range0_sizek > state->range_sizek) {
1042
1043 /* one hole in middle or at end */
1044 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1045
1046 /* hole size should be less than half of range0 size */
1047 if (hole_sizek >= (range0_sizek >> 1) &&
1048 range0_sizek >= chunk_sizek) {
1049 range0_sizek -= chunk_sizek;
1050 second_sizek = 0;
1051 hole_sizek = 0;
1052
1053 goto second_try;
1054 }
1055 }
1056
1057 if (range0_sizek) {
1058 if (debug_print)
1059 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
1060 range0_basek<<10,
1061 (range0_basek + range0_sizek)<<10);
1062 state->reg = range_to_mtrr(state->reg, range0_basek,
1063 range0_sizek, MTRR_TYPE_WRBACK);
1064 }
1065
1066 if (range0_sizek < state->range_sizek) {
1067 /* need to handle left over */
1068 range_sizek = state->range_sizek - range0_sizek;
1069
1070 if (debug_print)
1071 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1072 range_basek<<10,
1073 (range_basek + range_sizek)<<10);
1074 state->reg = range_to_mtrr(state->reg, range_basek,
1075 range_sizek, MTRR_TYPE_WRBACK);
1076 }
1077
1078 if (hole_sizek) {
1079 hole_basek = range_basek - hole_sizek - second_sizek;
1080 if (debug_print)
1081 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1082 hole_basek<<10,
1083 (hole_basek + hole_sizek)<<10);
1084 state->reg = range_to_mtrr(state->reg, hole_basek,
1085 hole_sizek, MTRR_TYPE_UNCACHABLE);
1086 }
1087
1088 return second_sizek;
1089}
1090
1091static void __init
1092set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
1093 unsigned long size_pfn)
1094{
1095 unsigned long basek, sizek;
1096 unsigned long second_sizek = 0;
1097
1098 if (state->reg >= num_var_ranges)
1099 return;
1100
1101 basek = base_pfn << (PAGE_SHIFT - 10);
1102 sizek = size_pfn << (PAGE_SHIFT - 10);
1103
1104 /* See if I can merge with the last range */
1105 if ((basek <= 1024) ||
1106 (state->range_startk + state->range_sizek == basek)) {
1107 unsigned long endk = basek + sizek;
1108 state->range_sizek = endk - state->range_startk;
1109 return;
1110 }
1111 /* Write the range mtrrs */
1112 if (state->range_sizek != 0)
1113 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
1114
1115 /* Allocate an msr */
1116 state->range_startk = basek + second_sizek;
1117 state->range_sizek = sizek - second_sizek;
1118}
1119
1120/* mininum size of mtrr block that can take hole */
1121static u64 mtrr_chunk_size __initdata = (256ULL<<20);
1122
1123static int __init parse_mtrr_chunk_size_opt(char *p)
1124{
1125 if (!p)
1126 return -EINVAL;
1127 mtrr_chunk_size = memparse(p, &p);
1128 return 0;
1129}
1130early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
1131
1132/* granity of mtrr of block */
1133static u64 mtrr_gran_size __initdata;
1134
1135static int __init parse_mtrr_gran_size_opt(char *p)
1136{
1137 if (!p)
1138 return -EINVAL;
1139 mtrr_gran_size = memparse(p, &p);
1140 return 0;
1141}
1142early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
1143
1144static int nr_mtrr_spare_reg __initdata =
1145 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
1146
1147static int __init parse_mtrr_spare_reg(char *arg)
1148{
1149 if (arg)
1150 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
1151 return 0;
1152}
1153
1154early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
1155
1156static int __init
1157x86_setup_var_mtrrs(struct res_range *range, int nr_range,
1158 u64 chunk_size, u64 gran_size)
1159{
1160 struct var_mtrr_state var_state;
1161 int i;
1162 int num_reg;
1163
1164 var_state.range_startk = 0;
1165 var_state.range_sizek = 0;
1166 var_state.reg = 0;
1167 var_state.chunk_sizek = chunk_size >> 10;
1168 var_state.gran_sizek = gran_size >> 10;
1169
1170 memset(range_state, 0, sizeof(range_state));
1171
1172 /* Write the range etc */
1173 for (i = 0; i < nr_range; i++)
1174 set_var_mtrr_range(&var_state, range[i].start,
1175 range[i].end - range[i].start + 1);
1176
1177 /* Write the last range */
1178 if (var_state.range_sizek != 0)
1179 range_to_mtrr_with_hole(&var_state, 0, 0);
1180
1181 num_reg = var_state.reg;
1182 /* Clear out the extra MTRR's */
1183 while (var_state.reg < num_var_ranges) {
1184 save_var_mtrr(var_state.reg, 0, 0, 0);
1185 var_state.reg++;
1186 }
1187
1188 return num_reg;
1189}
1190
1191struct mtrr_cleanup_result {
1192 unsigned long gran_sizek;
1193 unsigned long chunk_sizek;
1194 unsigned long lose_cover_sizek;
1195 unsigned int num_reg;
1196 int bad;
1197};
1198
1199/*
1200 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1201 * chunk size: gran_size, ..., 2G
1202 * so we need (1+16)*8
1203 */
1204#define NUM_RESULT 136
1205#define PSHIFT (PAGE_SHIFT - 10)
1206
1207static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1208static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1209
1210static void __init print_out_mtrr_range_state(void)
1211{
1212 int i;
1213 char start_factor = 'K', size_factor = 'K';
1214 unsigned long start_base, size_base;
1215 mtrr_type type;
1216
1217 for (i = 0; i < num_var_ranges; i++) {
1218
1219 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1220 if (!size_base)
1221 continue;
1222
1223 size_base = to_size_factor(size_base, &size_factor),
1224 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1225 start_base = to_size_factor(start_base, &start_factor),
1226 type = range_state[i].type;
1227
1228 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1229 i, start_base, start_factor,
1230 size_base, size_factor,
1231 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1232 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1233 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1234 );
1235 }
1236}
1237
1238static int __init mtrr_need_cleanup(void)
1239{
1240 int i;
1241 mtrr_type type;
1242 unsigned long size;
1243 /* extra one for all 0 */
1244 int num[MTRR_NUM_TYPES + 1];
1245
1246 /* check entries number */
1247 memset(num, 0, sizeof(num));
1248 for (i = 0; i < num_var_ranges; i++) {
1249 type = range_state[i].type;
1250 size = range_state[i].size_pfn;
1251 if (type >= MTRR_NUM_TYPES)
1252 continue;
1253 if (!size)
1254 type = MTRR_NUM_TYPES;
1255 if (type == MTRR_TYPE_WRPROT)
1256 type = MTRR_TYPE_UNCACHABLE;
1257 num[type]++;
1258 }
1259
1260 /* check if we got UC entries */
1261 if (!num[MTRR_TYPE_UNCACHABLE])
1262 return 0;
1263
1264 /* check if we only had WB and UC */
1265 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1266 num_var_ranges - num[MTRR_NUM_TYPES])
1267 return 0;
1268
1269 return 1;
1270}
1271
1272static unsigned long __initdata range_sums;
1273static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 unsigned long extra_remove_base,
1275 unsigned long extra_remove_size,
1276 int i)
1277{
1278 int num_reg;
1279 static struct res_range range_new[RANGE_NUM];
1280 static int nr_range_new;
1281 unsigned long range_sums_new;
1282
1283 /* convert ranges to var ranges state */
1284 num_reg = x86_setup_var_mtrrs(range, nr_range,
1285 chunk_size, gran_size);
1286
1287 /* we got new setting in range_state, check it */
1288 memset(range_new, 0, sizeof(range_new));
1289 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1290 extra_remove_base, extra_remove_size);
1291 range_sums_new = sum_ranges(range_new, nr_range_new);
1292
1293 result[i].chunk_sizek = chunk_size >> 10;
1294 result[i].gran_sizek = gran_size >> 10;
1295 result[i].num_reg = num_reg;
1296 if (range_sums < range_sums_new) {
1297 result[i].lose_cover_sizek =
1298 (range_sums_new - range_sums) << PSHIFT;
1299 result[i].bad = 1;
1300 } else
1301 result[i].lose_cover_sizek =
1302 (range_sums - range_sums_new) << PSHIFT;
1303
1304 /* double check it */
1305 if (!result[i].bad && !result[i].lose_cover_sizek) {
1306 if (nr_range_new != nr_range ||
1307 memcmp(range, range_new, sizeof(range)))
1308 result[i].bad = 1;
1309 }
1310
1311 if (!result[i].bad && (range_sums - range_sums_new <
1312 min_loss_pfn[num_reg])) {
1313 min_loss_pfn[num_reg] =
1314 range_sums - range_sums_new;
1315 }
1316}
1317
1318static void __init mtrr_print_out_one_result(int i)
1319{
1320 char gran_factor, chunk_factor, lose_factor;
1321 unsigned long gran_base, chunk_base, lose_base;
1322
1323 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1324 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1325 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1326 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1327 result[i].bad ? "*BAD*" : " ",
1328 gran_base, gran_factor, chunk_base, chunk_factor);
1329 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1330 result[i].num_reg, result[i].bad ? "-" : "",
1331 lose_base, lose_factor);
1332}
1333
1334static int __init mtrr_search_optimal_index(void)
1335{
1336 int i;
1337 int num_reg_good;
1338 int index_good;
1339
1340 if (nr_mtrr_spare_reg >= num_var_ranges)
1341 nr_mtrr_spare_reg = num_var_ranges - 1;
1342 num_reg_good = -1;
1343 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1344 if (!min_loss_pfn[i])
1345 num_reg_good = i;
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 return index_good;
1361}
1362
1363
1364static int __init mtrr_cleanup(unsigned address_bits)
1365{
1366 unsigned long extra_remove_base, extra_remove_size;
1367 unsigned long base, size, def, dummy;
1368 mtrr_type type;
1369 u64 chunk_size, gran_size;
1370 int index_good;
1371 int i;
1372
1373 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1374 return 0;
1375 rdmsr(MTRRdefType_MSR, def, dummy);
1376 def &= 0xff;
1377 if (def != MTRR_TYPE_UNCACHABLE)
1378 return 0;
1379
1380 /* get it and store it aside */
1381 memset(range_state, 0, sizeof(range_state));
1382 for (i = 0; i < num_var_ranges; i++) {
1383 mtrr_if->get(i, &base, &size, &type);
1384 range_state[i].base_pfn = base;
1385 range_state[i].size_pfn = size;
1386 range_state[i].type = type;
1387 }
1388
1389 /* check if we need handle it and can handle it */
1390 if (!mtrr_need_cleanup())
1391 return 0;
1392
1393 /* print original var MTRRs at first, for debugging: */
1394 printk(KERN_DEBUG "original variable MTRRs\n");
1395 print_out_mtrr_range_state();
1396
1397 memset(range, 0, sizeof(range));
1398 extra_remove_size = 0;
1399 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1400 if (mtrr_tom2)
1401 extra_remove_size =
1402 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1403 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1404 extra_remove_size);
1405 /*
1406 * [0, 1M) should always be coverred by var mtrr with WB
1407 * and fixed mtrrs should take effective before var mtrr for it
1408 */
1409 nr_range = add_range_with_merge(range, nr_range, 0,
1410 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1411 /* sort the ranges */
1412 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1413
1414 range_sums = sum_ranges(range, nr_range);
1415 printk(KERN_INFO "total RAM coverred: %ldM\n",
1416 range_sums >> (20 - PAGE_SHIFT));
1417
1418 if (mtrr_chunk_size && mtrr_gran_size) {
1419 i = 0;
1420 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1421 extra_remove_base, extra_remove_size, i);
1422
1423 mtrr_print_out_one_result(i);
1424
1425 if (!result[i].bad) {
1426 set_var_mtrr_all(address_bits);
1427 return 1;
1428 }
1429 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1430 "will find optimal one\n");
1431 }
1432
1433 i = 0;
1434 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1435 memset(result, 0, sizeof(result));
1436 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1437
1438 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1439 chunk_size <<= 1) {
1440
1441 if (i >= NUM_RESULT)
1442 continue;
1443
1444 mtrr_calc_range_state(chunk_size, gran_size,
1445 extra_remove_base, extra_remove_size, i);
1446 if (debug_print) {
1447 mtrr_print_out_one_result(i);
1448 printk(KERN_INFO "\n");
1449 }
1450
1451 i++;
1452 }
1453 }
1454
1455 /* try to find the optimal index */
1456 index_good = mtrr_search_optimal_index();
1457
1458 if (index_good != -1) {
1459 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1460 i = index_good;
1461 mtrr_print_out_one_result(i);
1462
1463 /* convert ranges to var ranges state */
1464 chunk_size = result[i].chunk_sizek;
1465 chunk_size <<= 10;
1466 gran_size = result[i].gran_sizek;
1467 gran_size <<= 10;
1468 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1469 set_var_mtrr_all(address_bits);
1470 printk(KERN_DEBUG "New variable MTRRs\n");
1471 print_out_mtrr_range_state();
1472 return 1;
1473 } else {
1474 /* print out all */
1475 for (i = 0; i < NUM_RESULT; i++)
1476 mtrr_print_out_one_result(i);
1477 }
1478
1479 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
1480 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
1481
1482 return 0;
1483}
1484#else
1485static int __init mtrr_cleanup(unsigned address_bits)
1486{
1487 return 0;
1488}
1489#endif
1490
1491static int __initdata changed_by_mtrr_cleanup;
1492
1493static int disable_mtrr_trim;
1494
1495static int __init disable_mtrr_trim_setup(char *str)
1496{
1497 disable_mtrr_trim = 1;
1498 return 0;
1499}
1500early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
1501
1502/*
1503 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
1504 * for memory >4GB. Check for that here.
1505 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
1506 * apply to are wrong, but so far we don't know of any such case in the wild.
1507 */
1508#define Tom2Enabled (1U << 21)
1509#define Tom2ForceMemTypeWB (1U << 22)
1510
1511int __init amd_special_default_mtrr(void)
1512{
1513 u32 l, h;
1514
1515 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
1516 return 0;
1517 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
1518 return 0;
1519 /* In case some hypervisor doesn't pass SYSCFG through */
1520 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
1521 return 0;
1522 /*
1523 * Memory between 4GB and top of mem is forced WB by this magic bit.
1524 * Reserved before K8RevF, but should be zero there.
1525 */
1526 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
1527 (Tom2Enabled | Tom2ForceMemTypeWB))
1528 return 1;
1529 return 0;
1530}
1531
1532static u64 __init real_trim_memory(unsigned long start_pfn,
1533 unsigned long limit_pfn)
1534{
1535 u64 trim_start, trim_size;
1536 trim_start = start_pfn;
1537 trim_start <<= PAGE_SHIFT;
1538 trim_size = limit_pfn;
1539 trim_size <<= PAGE_SHIFT;
1540 trim_size -= trim_start;
1541
1542 return e820_update_range(trim_start, trim_size, E820_RAM,
1543 E820_RESERVED);
1544}
1545/**
1546 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
1547 * @end_pfn: ending page frame number
1548 *
1549 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
1550 * memory configurations. This routine checks that the highest MTRR matches
1551 * the end of memory, to make sure the MTRRs having a write back type cover
1552 * all of the memory the kernel is intending to use. If not, it'll trim any
1553 * memory off the end by adjusting end_pfn, removing it from the kernel's
1554 * allocation pools, warning the user with an obnoxious message.
1555 */
1556int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1557{
1558 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1559 mtrr_type type;
1560 u64 total_trim_size;
1561
1562 /* extra one for all 0 */
1563 int num[MTRR_NUM_TYPES + 1];
1564 /*
1565 * Make sure we only trim uncachable memory on machines that
1566 * support the Intel MTRR architecture:
1567 */
1568 if (!is_cpu(INTEL) || disable_mtrr_trim)
1569 return 0;
1570 rdmsr(MTRRdefType_MSR, def, dummy);
1571 def &= 0xff;
1572 if (def != MTRR_TYPE_UNCACHABLE)
1573 return 0;
1574
1575 /* get it and store it aside */
1576 memset(range_state, 0, sizeof(range_state));
1577 for (i = 0; i < num_var_ranges; i++) {
1578 mtrr_if->get(i, &base, &size, &type);
1579 range_state[i].base_pfn = base;
1580 range_state[i].size_pfn = size;
1581 range_state[i].type = type;
1582 }
1583
1584 /* Find highest cached pfn */
1585 for (i = 0; i < num_var_ranges; i++) {
1586 type = range_state[i].type;
1587 if (type != MTRR_TYPE_WRBACK)
1588 continue;
1589 base = range_state[i].base_pfn;
1590 size = range_state[i].size_pfn;
1591 if (highest_pfn < base + size)
1592 highest_pfn = base + size;
1593 }
1594
1595 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1596 if (!highest_pfn) {
1597 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1598 return 0;
1599 }
1600
1601 /* check entries number */
1602 memset(num, 0, sizeof(num));
1603 for (i = 0; i < num_var_ranges; i++) {
1604 type = range_state[i].type;
1605 if (type >= MTRR_NUM_TYPES)
1606 continue;
1607 size = range_state[i].size_pfn;
1608 if (!size)
1609 type = MTRR_NUM_TYPES;
1610 num[type]++;
1611 }
1612
1613 /* no entry for WB? */
1614 if (!num[MTRR_TYPE_WRBACK])
1615 return 0;
1616
1617 /* check if we only had WB and UC */
1618 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1619 num_var_ranges - num[MTRR_NUM_TYPES])
1620 return 0;
1621
1622 memset(range, 0, sizeof(range));
1623 nr_range = 0;
1624 if (mtrr_tom2) {
1625 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1626 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1627 if (highest_pfn < range[nr_range].end + 1)
1628 highest_pfn = range[nr_range].end + 1;
1629 nr_range++;
1630 }
1631 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1632
1633 total_trim_size = 0;
1634 /* check the head */
1635 if (range[0].start)
1636 total_trim_size += real_trim_memory(0, range[0].start);
1637 /* check the holes */
1638 for (i = 0; i < nr_range - 1; i++) {
1639 if (range[i].end + 1 < range[i+1].start)
1640 total_trim_size += real_trim_memory(range[i].end + 1,
1641 range[i+1].start);
1642 }
1643 /* check the top */
1644 i = nr_range - 1;
1645 if (range[i].end + 1 < end_pfn)
1646 total_trim_size += real_trim_memory(range[i].end + 1,
1647 end_pfn);
1648
1649 if (total_trim_size) {
1650 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1651 " all of memory, losing %lluMB of RAM.\n",
1652 total_trim_size >> 20);
1653
1654 if (!changed_by_mtrr_cleanup)
1655 WARN_ON(1);
1656
1657 printk(KERN_INFO "update e820 for mtrr\n");
1658 update_e820();
1659
1660 return 1;
1661 }
1662
1663 return 0;
1664}
1665 614
1666/** 615/**
1667 * mtrr_bp_init - initialize mtrrs on the boot CPU 616 * mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6..77f67f7b347 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
79 79
80extern unsigned int num_var_ranges; 80extern unsigned int num_var_ranges;
81extern u64 mtrr_tom2; 81extern u64 mtrr_tom2;
82extern struct mtrr_state_type mtrr_state;
82 83
83void mtrr_state_warn(void); 84void mtrr_state_warn(void);
84const char *mtrr_attrib_to_str(int x); 85const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
88int amd_init_mtrr(void); 89int amd_init_mtrr(void);
89int cyrix_init_mtrr(void); 90int cyrix_init_mtrr(void);
90int centaur_init_mtrr(void); 91int centaur_init_mtrr(void);
92
93extern int changed_by_mtrr_cleanup;
94extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5a..bb62b3e5caa 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
98#endif 98#endif
99} 99}
100 100
101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
102 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
103 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta, 104 .c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e096..fd2c37bf7ac 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
8 * so no special init takes place. 8 * so no special init takes place.
9 */ 9 */
10 10
11static struct cpu_dev umc_cpu_dev __cpuinitdata = { 11static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
12 .c_vendor = "UMC", 12 .c_vendor = "UMC",
13 .c_ident = { "UMC UMC UMC" }, 13 .c_ident = { "UMC UMC UMC" },
14 .c_models = { 14 .c_models = {
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 169a120587b..87b67e3a765 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
729 729
730 spin_unlock_irqrestore(&ds_lock, irq); 730 spin_unlock_irqrestore(&ds_lock, irq);
731 731
732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); 732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
733 ds_resume_pebs(tracer); 733 ds_resume_pebs(tracer);
734 734
735 return tracer; 735 return tracer;
@@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1029 1029
1030void ds_exit_thread(struct task_struct *tsk) 1030void ds_exit_thread(struct task_struct *tsk)
1031{ 1031{
1032 WARN_ON(tsk->thread.ds_ctx);
1033} 1032}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 508bec1cee2..fb638d9ce6d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
110/* 110/*
111 * Add a memory region to the kernel e820 map. 111 * Add a memory region to the kernel e820 map.
112 */ 112 */
113void __init e820_add_region(u64 start, u64 size, int type) 113static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
114 int type)
114{ 115{
115 int x = e820.nr_map; 116 int x = e820x->nr_map;
116 117
117 if (x == ARRAY_SIZE(e820.map)) { 118 if (x == ARRAY_SIZE(e820x->map)) {
118 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); 119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
119 return; 120 return;
120 } 121 }
121 122
122 e820.map[x].addr = start; 123 e820x->map[x].addr = start;
123 e820.map[x].size = size; 124 e820x->map[x].size = size;
124 e820.map[x].type = type; 125 e820x->map[x].type = type;
125 e820.nr_map++; 126 e820x->nr_map++;
127}
128
129void __init e820_add_region(u64 start, u64 size, int type)
130{
131 __e820_add_region(&e820, start, size, type);
132}
133
134static void __init e820_print_type(u32 type)
135{
136 switch (type) {
137 case E820_RAM:
138 case E820_RESERVED_KERN:
139 printk(KERN_CONT "(usable)");
140 break;
141 case E820_RESERVED:
142 printk(KERN_CONT "(reserved)");
143 break;
144 case E820_ACPI:
145 printk(KERN_CONT "(ACPI data)");
146 break;
147 case E820_NVS:
148 printk(KERN_CONT "(ACPI NVS)");
149 break;
150 case E820_UNUSABLE:
151 printk(KERN_CONT "(unusable)");
152 break;
153 default:
154 printk(KERN_CONT "type %u", type);
155 break;
156 }
126} 157}
127 158
128void __init e820_print_map(char *who) 159void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
134 (unsigned long long) e820.map[i].addr, 165 (unsigned long long) e820.map[i].addr,
135 (unsigned long long) 166 (unsigned long long)
136 (e820.map[i].addr + e820.map[i].size)); 167 (e820.map[i].addr + e820.map[i].size));
137 switch (e820.map[i].type) { 168 e820_print_type(e820.map[i].type);
138 case E820_RAM: 169 printk(KERN_CONT "\n");
139 case E820_RESERVED_KERN:
140 printk(KERN_CONT "(usable)\n");
141 break;
142 case E820_RESERVED:
143 printk(KERN_CONT "(reserved)\n");
144 break;
145 case E820_ACPI:
146 printk(KERN_CONT "(ACPI data)\n");
147 break;
148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n");
150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
154 default:
155 printk(KERN_CONT "type %u\n", e820.map[i].type);
156 break;
157 }
158 } 170 }
159} 171}
160 172
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
417 return __append_e820_map(biosmap, nr_map); 429 return __append_e820_map(biosmap, nr_map);
418} 430}
419 431
420static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, 432static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
421 u64 size, unsigned old_type, 433 u64 size, unsigned old_type,
422 unsigned new_type) 434 unsigned new_type)
423{ 435{
424 int i; 436 u64 end;
437 unsigned int i;
425 u64 real_updated_size = 0; 438 u64 real_updated_size = 0;
426 439
427 BUG_ON(old_type == new_type); 440 BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
429 if (size > (ULLONG_MAX - start)) 442 if (size > (ULLONG_MAX - start))
430 size = ULLONG_MAX - start; 443 size = ULLONG_MAX - start;
431 444
432 for (i = 0; i < e820.nr_map; i++) { 445 end = start + size;
446 printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
447 (unsigned long long) start,
448 (unsigned long long) end);
449 e820_print_type(old_type);
450 printk(KERN_CONT " ==> ");
451 e820_print_type(new_type);
452 printk(KERN_CONT "\n");
453
454 for (i = 0; i < e820x->nr_map; i++) {
433 struct e820entry *ei = &e820x->map[i]; 455 struct e820entry *ei = &e820x->map[i];
434 u64 final_start, final_end; 456 u64 final_start, final_end;
457 u64 ei_end;
458
435 if (ei->type != old_type) 459 if (ei->type != old_type)
436 continue; 460 continue;
437 /* totally covered? */ 461
438 if (ei->addr >= start && 462 ei_end = ei->addr + ei->size;
439 (ei->addr + ei->size) <= (start + size)) { 463 /* totally covered by new range? */
464 if (ei->addr >= start && ei_end <= end) {
440 ei->type = new_type; 465 ei->type = new_type;
441 real_updated_size += ei->size; 466 real_updated_size += ei->size;
442 continue; 467 continue;
443 } 468 }
469
470 /* new range is totally covered? */
471 if (ei->addr < start && ei_end > end) {
472 __e820_add_region(e820x, start, size, new_type);
473 __e820_add_region(e820x, end, ei_end - end, ei->type);
474 ei->size = start - ei->addr;
475 real_updated_size += size;
476 continue;
477 }
478
444 /* partially covered */ 479 /* partially covered */
445 final_start = max(start, ei->addr); 480 final_start = max(start, ei->addr);
446 final_end = min(start + size, ei->addr + ei->size); 481 final_end = min(end, ei_end);
447 if (final_start >= final_end) 482 if (final_start >= final_end)
448 continue; 483 continue;
449 e820_add_region(final_start, final_end - final_start, 484
450 new_type); 485 __e820_add_region(e820x, final_start, final_end - final_start,
486 new_type);
487
451 real_updated_size += final_end - final_start; 488 real_updated_size += final_end - final_start;
452 489
490 /*
491 * left range could be head or tail, so need to update
492 * size at first.
493 */
453 ei->size -= final_end - final_start; 494 ei->size -= final_end - final_start;
454 if (ei->addr < final_start) 495 if (ei->addr < final_start)
455 continue; 496 continue;
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
461u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, 502u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
462 unsigned new_type) 503 unsigned new_type)
463{ 504{
464 return e820_update_range_map(&e820, start, size, old_type, new_type); 505 return __e820_update_range(&e820, start, size, old_type, new_type);
465} 506}
466 507
467static u64 __init e820_update_range_saved(u64 start, u64 size, 508static u64 __init e820_update_range_saved(u64 start, u64 size,
468 unsigned old_type, unsigned new_type) 509 unsigned old_type, unsigned new_type)
469{ 510{
470 return e820_update_range_map(&e820_saved, start, size, old_type, 511 return __e820_update_range(&e820_saved, start, size, old_type,
471 new_type); 512 new_type);
472} 513}
473 514
@@ -1020,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1020 continue; 1061 continue;
1021 return addr; 1062 return addr;
1022 } 1063 }
1023 return -1UL;
1024 1064
1065 return -1ULL;
1025} 1066}
1026 1067
1027/* 1068/*
@@ -1034,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
1034 u64 start; 1075 u64 start;
1035 1076
1036 start = startt; 1077 start = startt;
1037 while (size < sizet) 1078 while (size < sizet && (start + 1))
1038 start = find_e820_area_size(start, &size, align); 1079 start = find_e820_area_size(start, &size, align);
1039 1080
1040 if (size < sizet) 1081 if (size < sizet)
1041 return 0; 1082 return 0;
1042 1083
1084#ifdef CONFIG_X86_32
1085 if (start >= MAXMEM)
1086 return 0;
1087 if (start + size > MAXMEM)
1088 size = MAXMEM - start;
1089#endif
1090
1043 addr = round_down(start + size - sizet, align); 1091 addr = round_down(start + size - sizet, align);
1092 if (addr < start)
1093 return 0;
1044 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 1094 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
1045 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 1095 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
1046 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 1096 printk(KERN_INFO "update e820 for early_reserve_e820\n");
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 639ad98238a..335f049d110 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); 250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251} 251}
252 252
253static void dbgp_mdelay(int ms) 253static void __init dbgp_mdelay(int ms)
254{ 254{
255 int i; 255 int i;
256 256
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
311 writel(hi, &ehci_debug->data47); 311 writel(hi, &ehci_debug->data47);
312} 312}
313 313
314static void dbgp_get_data(void *buf, int size) 314static void __init dbgp_get_data(void *buf, int size)
315{ 315{
316 unsigned char *bytes = buf; 316 unsigned char *bytes = buf;
317 u32 lo, hi; 317 u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
355 return ret; 355 return ret;
356} 356}
357 357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, 358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size) 359 int size)
360{ 360{
361 u32 pids, addr, ctrl; 361 u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
386 return ret; 386 return ret;
387} 387}
388 388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request, 389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int value, int index, void *data, int size) 390 int request, int value, int index, void *data, int size)
391{ 391{
392 u32 pids, addr, ctrl; 392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req; 393 struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
489 return 0; 489 return 0;
490} 490}
491 491
492static int ehci_reset_port(int port) 492static int __init ehci_reset_port(int port)
493{ 493{
494 u32 portsc; 494 u32 portsc;
495 u32 delay_time, delay; 495 u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
532 return -EBUSY; 532 return -EBUSY;
533} 533}
534 534
535static int ehci_wait_for_port(int port) 535static int __init ehci_wait_for_port(int port)
536{ 536{
537 u32 status; 537 u32 status;
538 int ret, reps; 538 int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
557 557
558typedef void (*set_debug_port_t)(int port); 558typedef void (*set_debug_port_t)(int port);
559 559
560static void default_set_debug_port(int port) 560static void __init default_set_debug_port(int port)
561{ 561{
562} 562}
563 563
564static set_debug_port_t set_debug_port = default_set_debug_port; 564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565 565
566static void nvidia_set_debug_port(int port) 566static void __init nvidia_set_debug_port(int port)
567{ 567{
568 u32 dword; 568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b205272ad39..1736acc4d7a 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
469 efi_memory_desc_t *md; 469 efi_memory_desc_t *md;
470 efi_status_t status; 470 efi_status_t status;
471 unsigned long size; 471 unsigned long size;
472 u64 end, systab, addr, npages; 472 u64 end, systab, addr, npages, end_pfn;
473 void *p, *va; 473 void *p, *va;
474 474
475 efi.systab = NULL; 475 efi.systab = NULL;
@@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
481 size = md->num_pages << EFI_PAGE_SHIFT; 481 size = md->num_pages << EFI_PAGE_SHIFT;
482 end = md->phys_addr + size; 482 end = md->phys_addr + size;
483 483
484 if (PFN_UP(end) <= max_low_pfn_mapped) 484 end_pfn = PFN_UP(end);
485 if (end_pfn <= max_low_pfn_mapped
486 || (end_pfn > (1UL << (32 - PAGE_SHIFT))
487 && end_pfn <= max_pfn_mapped))
485 va = __va(md->phys_addr); 488 va = __va(md->phys_addr);
486 else 489 else
487 va = efi_ioremap(md->phys_addr, size); 490 va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index a4ee29127fd..22c3b7828c5 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void)
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
102{ 102{
103 static unsigned pages_mapped __initdata; 103 unsigned long last_map_pfn;
104 unsigned i, pages;
105 unsigned long offset;
106 104
107 pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); 105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
108 offset = phys_addr & ~PAGE_MASK; 106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
109 phys_addr &= PAGE_MASK;
110
111 if (pages_mapped + pages > MAX_EFI_IO_PAGES)
112 return NULL; 107 return NULL;
113 108
114 for (i = 0; i < pages; i++) { 109 return (void __iomem *)__va(phys_addr);
115 __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
116 phys_addr, PAGE_KERNEL);
117 phys_addr += PAGE_SIZE;
118 pages_mapped++;
119 }
120
121 return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
122 (pages_mapped - pages)) + offset;
123} 110}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 899e8938e79..c929add475c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -442,8 +442,7 @@ sysenter_past_esp:
442 442
443 GET_THREAD_INFO(%ebp) 443 GET_THREAD_INFO(%ebp)
444 444
445 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 445 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
446 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
447 jnz sysenter_audit 446 jnz sysenter_audit
448sysenter_do_call: 447sysenter_do_call:
449 cmpl $(nr_syscalls), %eax 448 cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
454 DISABLE_INTERRUPTS(CLBR_ANY) 453 DISABLE_INTERRUPTS(CLBR_ANY)
455 TRACE_IRQS_OFF 454 TRACE_IRQS_OFF
456 movl TI_flags(%ebp), %ecx 455 movl TI_flags(%ebp), %ecx
457 testw $_TIF_ALLWORK_MASK, %cx 456 testl $_TIF_ALLWORK_MASK, %ecx
458 jne sysexit_audit 457 jne sysexit_audit
459sysenter_exit: 458sysenter_exit:
460/* if something modifies registers it must also disable sysexit */ 459/* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
468 467
469#ifdef CONFIG_AUDITSYSCALL 468#ifdef CONFIG_AUDITSYSCALL
470sysenter_audit: 469sysenter_audit:
471 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 470 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
472 jnz syscall_trace_entry 471 jnz syscall_trace_entry
473 addl $4,%esp 472 addl $4,%esp
474 CFI_ADJUST_CFA_OFFSET -4 473 CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
485 jmp sysenter_do_call 484 jmp sysenter_do_call
486 485
487sysexit_audit: 486sysexit_audit:
488 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 487 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
489 jne syscall_exit_work 488 jne syscall_exit_work
490 TRACE_IRQS_ON 489 TRACE_IRQS_ON
491 ENABLE_INTERRUPTS(CLBR_ANY) 490 ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
498 DISABLE_INTERRUPTS(CLBR_ANY) 497 DISABLE_INTERRUPTS(CLBR_ANY)
499 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
500 movl TI_flags(%ebp), %ecx 499 movl TI_flags(%ebp), %ecx
501 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 500 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
502 jne syscall_exit_work 501 jne syscall_exit_work
503 movl PT_EAX(%esp),%eax /* reload syscall return value */ 502 movl PT_EAX(%esp),%eax /* reload syscall return value */
504 jmp sysenter_exit 503 jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
523 SAVE_ALL 522 SAVE_ALL
524 GET_THREAD_INFO(%ebp) 523 GET_THREAD_INFO(%ebp)
525 # system call tracing in operation / emulation 524 # system call tracing in operation / emulation
526 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 525 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
527 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
528 jnz syscall_trace_entry 526 jnz syscall_trace_entry
529 cmpl $(nr_syscalls), %eax 527 cmpl $(nr_syscalls), %eax
530 jae syscall_badsys 528 jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
538 # between sampling and the iret 536 # between sampling and the iret
539 TRACE_IRQS_OFF 537 TRACE_IRQS_OFF
540 movl TI_flags(%ebp), %ecx 538 movl TI_flags(%ebp), %ecx
541 testw $_TIF_ALLWORK_MASK, %cx # current->work 539 testl $_TIF_ALLWORK_MASK, %ecx # current->work
542 jne syscall_exit_work 540 jne syscall_exit_work
543 541
544restore_all: 542restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
673 # perform syscall exit tracing 671 # perform syscall exit tracing
674 ALIGN 672 ALIGN
675syscall_exit_work: 673syscall_exit_work:
676 testb $_TIF_WORK_SYSCALL_EXIT, %cl 674 testl $_TIF_WORK_SYSCALL_EXIT, %ecx
677 jz work_pending 675 jz work_pending
678 TRACE_IRQS_ON 676 TRACE_IRQS_ON
679 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 677 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 83d1836b946..a331ec38af9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -368,6 +368,7 @@ ENTRY(save_rest)
368END(save_rest) 368END(save_rest)
369 369
370/* save complete stack frame */ 370/* save complete stack frame */
371 .pushsection .kprobes.text, "ax"
371ENTRY(save_paranoid) 372ENTRY(save_paranoid)
372 XCPT_FRAME 1 RDI+8 373 XCPT_FRAME 1 RDI+8
373 cld 374 cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
3961: ret 3971: ret
397 CFI_ENDPROC 398 CFI_ENDPROC
398END(save_paranoid) 399END(save_paranoid)
400 .popsection
399 401
400/* 402/*
401 * A newly forked process directly context switches into this address. 403 * A newly forked process directly context switches into this address.
@@ -416,7 +418,6 @@ ENTRY(ret_from_fork)
416 418
417 GET_THREAD_INFO(%rcx) 419 GET_THREAD_INFO(%rcx)
418 420
419 CFI_REMEMBER_STATE
420 RESTORE_REST 421 RESTORE_REST
421 422
422 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -428,7 +429,6 @@ ENTRY(ret_from_fork)
428 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
429 jmp ret_from_sys_call # go to the SYSRET fastpath 430 jmp ret_from_sys_call # go to the SYSRET fastpath
430 431
431 CFI_RESTORE_STATE
432 CFI_ENDPROC 432 CFI_ENDPROC
433END(ret_from_fork) 433END(ret_from_fork)
434 434
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
984#endif 984#endif
985apicinterrupt LOCAL_TIMER_VECTOR \ 985apicinterrupt LOCAL_TIMER_VECTOR \
986 apic_timer_interrupt smp_apic_timer_interrupt 986 apic_timer_interrupt smp_apic_timer_interrupt
987apicinterrupt GENERIC_INTERRUPT_VECTOR \
988 generic_interrupt smp_generic_interrupt
987 989
988#ifdef CONFIG_SMP 990#ifdef CONFIG_SMP
989apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe18..3f8579f8d42 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
18{ 18{
19 reserve_trampoline_memory(); 19 reserve_trampoline_memory();
20 20
21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
22 22
23#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
24 /* Reserve INITRD */ 24 /* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 30 }
31#endif 31#endif
32 reserve_early(init_pg_tables_start, init_pg_tables_end,
33 "INIT_PG_TABLE");
34
35 reserve_ebda_region(); 32 reserve_ebda_region();
36 33
37 /* 34 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b27224769..70eaa852c73 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
100 100
101 reserve_trampoline_memory(); 101 reserve_trampoline_memory();
102 102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 104
105#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 106 /* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c32ca19d591..30683883e0c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -38,42 +38,40 @@
38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id 38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
39 39
40/* 40/*
41 * This is how much memory *in addition to the memory covered up to 41 * This is how much memory in addition to the memory covered up to
42 * and including _end* we need mapped initially. 42 * and including _end we need mapped initially.
43 * We need: 43 * We need:
44 * - one bit for each possible page, but only in low memory, which means 44 * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
45 * 2^32/4096/8 = 128K worst case (4G/4G split.) 45 * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
46 * - enough space to map all low memory, which means
47 * (2^32/4096) / 1024 pages (worst case, non PAE)
48 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
49 * - a few pages for allocator use before the kernel pagetable has
50 * been set up
51 * 46 *
52 * Modulo rounding, each megabyte assigned here requires a kilobyte of 47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
53 * memory, which is currently unreclaimed. 48 * memory, which is currently unreclaimed.
54 * 49 *
55 * This should be a multiple of a page. 50 * This should be a multiple of a page.
51 *
52 * KERNEL_IMAGE_SIZE should be greater than pa(_end)
53 * and small than max_low_pfn, otherwise will waste some page table entries
56 */ 54 */
57LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
58
59/*
60 * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
61 * pagetables from above the 16MB DMA limit, so we'll have to set
62 * up pagetables 16MB more (worst-case):
63 */
64#ifdef CONFIG_DEBUG_PAGEALLOC
65LOW_PAGES = LOW_PAGES + 0x1000000
66#endif
67 55
68#if PTRS_PER_PMD > 1 56#if PTRS_PER_PMD > 1
69PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD 57#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
70#else 58#else
71PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) 59#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
72#endif 60#endif
73BOOTBITMAP_SIZE = LOW_PAGES / 8
74ALLOCATOR_SLOP = 4
75 61
76INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm 62/* Enough space to fit pagetables for the low memory linear map */
63MAPPING_BEYOND_END = \
64 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
65
66/*
67 * Worst-case size of the kernel mapping we need to make:
68 * the worst-case size of the kernel itself, plus the extra we need
69 * to map for the linear map.
70 */
71KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
72
73INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
74RESERVE_BRK(pagetables, INIT_MAP_SIZE)
77 75
78/* 76/*
79 * 32-bit kernel entrypoint; only used by the boot CPU. On entry, 77 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -166,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
166 164
167/* 165/*
168 * Initialize page tables. This creates a PDE and a set of page 166 * Initialize page tables. This creates a PDE and a set of page
169 * tables, which are located immediately beyond _end. The variable 167 * tables, which are located immediately beyond __brk_base. The variable
170 * init_pg_tables_end is set up to point to the first "safe" location. 168 * _brk_end is set up to point to the first "safe" location.
171 * Mappings are created both at virtual address 0 (identity mapping) 169 * Mappings are created both at virtual address 0 (identity mapping)
172 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. 170 * and PAGE_OFFSET for up to _end.
173 * 171 *
174 * Note that the stack is not yet set up! 172 * Note that the stack is not yet set up!
175 */ 173 */
@@ -190,8 +188,7 @@ default_entry:
190 188
191 xorl %ebx,%ebx /* %ebx is kept at zero */ 189 xorl %ebx,%ebx /* %ebx is kept at zero */
192 190
193 movl $pa(pg0), %edi 191 movl $pa(__brk_base), %edi
194 movl %edi, pa(init_pg_tables_start)
195 movl $pa(swapper_pg_pmd), %edx 192 movl $pa(swapper_pg_pmd), %edx
196 movl $PTE_IDENT_ATTR, %eax 193 movl $PTE_IDENT_ATTR, %eax
19710: 19410:
@@ -209,14 +206,14 @@ default_entry:
209 loop 11b 206 loop 11b
210 207
211 /* 208 /*
212 * End condition: we must map up to and including INIT_MAP_BEYOND_END 209 * End condition: we must map up to the end + MAPPING_BEYOND_END.
213 * bytes beyond the end of our own page tables.
214 */ 210 */
215 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 211 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
216 cmpl %ebp,%eax 212 cmpl %ebp,%eax
217 jb 10b 213 jb 10b
2181: 2141:
219 movl %edi,pa(init_pg_tables_end) 215 addl $__PAGE_OFFSET, %edi
216 movl %edi, pa(_brk_end)
220 shrl $12, %eax 217 shrl $12, %eax
221 movl %eax, pa(max_pfn_mapped) 218 movl %eax, pa(max_pfn_mapped)
222 219
@@ -227,8 +224,7 @@ default_entry:
227 224
228page_pde_offset = (__PAGE_OFFSET >> 20); 225page_pde_offset = (__PAGE_OFFSET >> 20);
229 226
230 movl $pa(pg0), %edi 227 movl $pa(__brk_base), %edi
231 movl %edi, pa(init_pg_tables_start)
232 movl $pa(swapper_pg_dir), %edx 228 movl $pa(swapper_pg_dir), %edx
233 movl $PTE_IDENT_ATTR, %eax 229 movl $PTE_IDENT_ATTR, %eax
23410: 23010:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
242 addl $0x1000,%eax 238 addl $0x1000,%eax
243 loop 11b 239 loop 11b
244 /* 240 /*
245 * End condition: we must map up to and including INIT_MAP_BEYOND_END 241 * End condition: we must map up to the end + MAPPING_BEYOND_END.
246 * bytes beyond the end of our own page tables; the +0x007 is
247 * the attribute bits
248 */ 242 */
249 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 243 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
250 cmpl %ebp,%eax 244 cmpl %ebp,%eax
251 jb 10b 245 jb 10b
252 movl %edi,pa(init_pg_tables_end) 246 addl $__PAGE_OFFSET, %edi
247 movl %edi, pa(_brk_end)
253 shrl $12, %eax 248 shrl $12, %eax
254 movl %eax, pa(max_pfn_mapped) 249 movl %eax, pa(max_pfn_mapped)
255 250
@@ -636,6 +631,7 @@ swapper_pg_fixmap:
636 .fill 1024,4,0 631 .fill 1024,4,0
637ENTRY(empty_zero_page) 632ENTRY(empty_zero_page)
638 .fill 4096,1,0 633 .fill 4096,1,0
634
639/* 635/*
640 * This starts the data section. 636 * This starts the data section.
641 */ 637 */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0..f2f8540a7f3 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
136#ifdef CONFIG_X86_32 136#ifdef CONFIG_X86_32
137 if (!HAVE_HWFP) { 137 if (!HAVE_HWFP) {
138 memset(tsk->thread.xstate, 0, xstate_size); 138 memset(tsk->thread.xstate, 0, xstate_size);
139 finit(); 139 finit_task(tsk);
140 set_stopped_child_used_math(tsk); 140 set_stopped_child_used_math(tsk);
141 return 0; 141 return 0;
142 } 142 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca1650aa..b8ac3b6cf77 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
15 15
16atomic_t irq_err_count; 16atomic_t irq_err_count;
17 17
18/* Function pointer for generic interrupt vector handling */
19void (*generic_interrupt_extension)(void) = NULL;
20
18/* 21/*
19 * 'what should we do if we get a hw irq event on an illegal vector'. 22 * 'what should we do if we get a hw irq event on an illegal vector'.
20 * each architecture has to answer this themselves. 23 * each architecture has to answer this themselves.
@@ -42,55 +45,61 @@ void ack_bad_irq(unsigned int irq)
42/* 45/*
43 * /proc/interrupts printing: 46 * /proc/interrupts printing:
44 */ 47 */
45static int show_other_interrupts(struct seq_file *p) 48static int show_other_interrupts(struct seq_file *p, int prec)
46{ 49{
47 int j; 50 int j;
48 51
49 seq_printf(p, "NMI: "); 52 seq_printf(p, "%*s: ", prec, "NMI");
50 for_each_online_cpu(j) 53 for_each_online_cpu(j)
51 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); 54 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
52 seq_printf(p, " Non-maskable interrupts\n"); 55 seq_printf(p, " Non-maskable interrupts\n");
53#ifdef CONFIG_X86_LOCAL_APIC 56#ifdef CONFIG_X86_LOCAL_APIC
54 seq_printf(p, "LOC: "); 57 seq_printf(p, "%*s: ", prec, "LOC");
55 for_each_online_cpu(j) 58 for_each_online_cpu(j)
56 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 59 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
57 seq_printf(p, " Local timer interrupts\n"); 60 seq_printf(p, " Local timer interrupts\n");
58#endif 61#endif
62 if (generic_interrupt_extension) {
63 seq_printf(p, "PLT: ");
64 for_each_online_cpu(j)
65 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
66 seq_printf(p, " Platform interrupts\n");
67 }
59#ifdef CONFIG_SMP 68#ifdef CONFIG_SMP
60 seq_printf(p, "RES: "); 69 seq_printf(p, "%*s: ", prec, "RES");
61 for_each_online_cpu(j) 70 for_each_online_cpu(j)
62 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); 71 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
63 seq_printf(p, " Rescheduling interrupts\n"); 72 seq_printf(p, " Rescheduling interrupts\n");
64 seq_printf(p, "CAL: "); 73 seq_printf(p, "%*s: ", prec, "CAL");
65 for_each_online_cpu(j) 74 for_each_online_cpu(j)
66 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); 75 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
67 seq_printf(p, " Function call interrupts\n"); 76 seq_printf(p, " Function call interrupts\n");
68 seq_printf(p, "TLB: "); 77 seq_printf(p, "%*s: ", prec, "TLB");
69 for_each_online_cpu(j) 78 for_each_online_cpu(j)
70 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 79 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
71 seq_printf(p, " TLB shootdowns\n"); 80 seq_printf(p, " TLB shootdowns\n");
72#endif 81#endif
73#ifdef CONFIG_X86_MCE 82#ifdef CONFIG_X86_MCE
74 seq_printf(p, "TRM: "); 83 seq_printf(p, "%*s: ", prec, "TRM");
75 for_each_online_cpu(j) 84 for_each_online_cpu(j)
76 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 85 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
77 seq_printf(p, " Thermal event interrupts\n"); 86 seq_printf(p, " Thermal event interrupts\n");
78# ifdef CONFIG_X86_64 87# ifdef CONFIG_X86_64
79 seq_printf(p, "THR: "); 88 seq_printf(p, "%*s: ", prec, "THR");
80 for_each_online_cpu(j) 89 for_each_online_cpu(j)
81 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 90 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
82 seq_printf(p, " Threshold APIC interrupts\n"); 91 seq_printf(p, " Threshold APIC interrupts\n");
83# endif 92# endif
84#endif 93#endif
85#ifdef CONFIG_X86_LOCAL_APIC 94#ifdef CONFIG_X86_LOCAL_APIC
86 seq_printf(p, "SPU: "); 95 seq_printf(p, "%*s: ", prec, "SPU");
87 for_each_online_cpu(j) 96 for_each_online_cpu(j)
88 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 97 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
89 seq_printf(p, " Spurious interrupts\n"); 98 seq_printf(p, " Spurious interrupts\n");
90#endif 99#endif
91 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); 100 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
92#if defined(CONFIG_X86_IO_APIC) 101#if defined(CONFIG_X86_IO_APIC)
93 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); 102 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
94#endif 103#endif
95 return 0; 104 return 0;
96} 105}
@@ -98,19 +107,22 @@ static int show_other_interrupts(struct seq_file *p)
98int show_interrupts(struct seq_file *p, void *v) 107int show_interrupts(struct seq_file *p, void *v)
99{ 108{
100 unsigned long flags, any_count = 0; 109 unsigned long flags, any_count = 0;
101 int i = *(loff_t *) v, j; 110 int i = *(loff_t *) v, j, prec;
102 struct irqaction *action; 111 struct irqaction *action;
103 struct irq_desc *desc; 112 struct irq_desc *desc;
104 113
105 if (i > nr_irqs) 114 if (i > nr_irqs)
106 return 0; 115 return 0;
107 116
117 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
118 j *= 10;
119
108 if (i == nr_irqs) 120 if (i == nr_irqs)
109 return show_other_interrupts(p); 121 return show_other_interrupts(p, prec);
110 122
111 /* print header */ 123 /* print header */
112 if (i == 0) { 124 if (i == 0) {
113 seq_printf(p, " "); 125 seq_printf(p, "%*s", prec + 8, "");
114 for_each_online_cpu(j) 126 for_each_online_cpu(j)
115 seq_printf(p, "CPU%-8d", j); 127 seq_printf(p, "CPU%-8d", j);
116 seq_putc(p, '\n'); 128 seq_putc(p, '\n');
@@ -131,7 +143,7 @@ int show_interrupts(struct seq_file *p, void *v)
131 if (!action && !any_count) 143 if (!action && !any_count)
132 goto out; 144 goto out;
133 145
134 seq_printf(p, "%3d: ", i); 146 seq_printf(p, "%*d: ", prec, i);
135#ifndef CONFIG_SMP 147#ifndef CONFIG_SMP
136 seq_printf(p, "%10u ", kstat_irqs(i)); 148 seq_printf(p, "%10u ", kstat_irqs(i));
137#else 149#else
@@ -163,6 +175,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
163#ifdef CONFIG_X86_LOCAL_APIC 175#ifdef CONFIG_X86_LOCAL_APIC
164 sum += irq_stats(cpu)->apic_timer_irqs; 176 sum += irq_stats(cpu)->apic_timer_irqs;
165#endif 177#endif
178 if (generic_interrupt_extension)
179 sum += irq_stats(cpu)->generic_irqs;
166#ifdef CONFIG_SMP 180#ifdef CONFIG_SMP
167 sum += irq_stats(cpu)->irq_resched_count; 181 sum += irq_stats(cpu)->irq_resched_count;
168 sum += irq_stats(cpu)->irq_call_count; 182 sum += irq_stats(cpu)->irq_call_count;
@@ -226,4 +240,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
226 return 1; 240 return 1;
227} 241}
228 242
243/*
244 * Handler for GENERIC_INTERRUPT_VECTOR.
245 */
246void smp_generic_interrupt(struct pt_regs *regs)
247{
248 struct pt_regs *old_regs = set_irq_regs(regs);
249
250 ack_APIC_irq();
251
252 exit_idle();
253
254 irq_enter();
255
256 inc_irq_stat(generic_irqs);
257
258 if (generic_interrupt_extension)
259 generic_interrupt_extension();
260
261 irq_exit();
262
263 set_irq_regs(old_regs);
264}
265
229EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 266EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b2427..3b09634a515 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21 22
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
55union irq_ctx { 56union irq_ctx {
56 struct thread_info tinfo; 57 struct thread_info tinfo;
57 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
58}; 59} __attribute__((aligned(PAGE_SIZE)));
59 60
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
62 63
63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
65 66
66static void call_on_stack(void *func, void *stack) 67static void call_on_stack(void *func, void *stack)
67{ 68{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
81 u32 *isp, arg1, arg2; 82 u32 *isp, arg1, arg2;
82 83
83 curctx = (union irq_ctx *) current_thread_info(); 84 curctx = (union irq_ctx *) current_thread_info();
84 irqctx = hardirq_ctx[smp_processor_id()]; 85 irqctx = __get_cpu_var(hardirq_ctx);
85 86
86 /* 87 /*
87 * this is where we switch to the IRQ stack. However, if we are 88 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
125{ 126{
126 union irq_ctx *irqctx; 127 union irq_ctx *irqctx;
127 128
128 if (hardirq_ctx[cpu]) 129 if (per_cpu(hardirq_ctx, cpu))
129 return; 130 return;
130 131
131 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 132 irqctx = &per_cpu(hardirq_stack, cpu);
132 irqctx->tinfo.task = NULL; 133 irqctx->tinfo.task = NULL;
133 irqctx->tinfo.exec_domain = NULL; 134 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 135 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
137 138
138 hardirq_ctx[cpu] = irqctx; 139 per_cpu(hardirq_ctx, cpu) = irqctx;
139 140
140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; 141 irqctx = &per_cpu(softirq_stack, cpu);
141 irqctx->tinfo.task = NULL; 142 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 143 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 144 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.preempt_count = 0; 145 irqctx->tinfo.preempt_count = 0;
145 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
146 147
147 softirq_ctx[cpu] = irqctx; 148 per_cpu(softirq_ctx, cpu) = irqctx;
148 149
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 150 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); 151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
151} 152}
152 153
153void irq_ctx_exit(int cpu) 154void irq_ctx_exit(int cpu)
154{ 155{
155 hardirq_ctx[cpu] = NULL; 156 per_cpu(hardirq_ctx, cpu) = NULL;
156} 157}
157 158
158asmlinkage void do_softirq(void) 159asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
169 170
170 if (local_softirq_pending()) { 171 if (local_softirq_pending()) {
171 curctx = current_thread_info(); 172 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()]; 173 irqctx = __get_cpu_var(softirq_ctx);
173 irqctx->tinfo.task = curctx->task; 174 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer; 175 irqctx->tinfo.previous_esp = current_stack_pointer;
175 176
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a3006..bc132610544 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
175 /* self generated IPI for local APIC timer */ 175 /* self generated IPI for local APIC timer */
176 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 176 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
177 177
178 /* generic IPI for platform specific use */
179 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
180
178 /* IPI vectors for APIC spurious and error interrupts */ 181 /* IPI vectors for APIC spurious and error interrupts */
179 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 182 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
180 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 183 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f3..c7a49e0ffbf 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
147 /* self generated IPI for local APIC timer */ 147 /* self generated IPI for local APIC timer */
148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
149 149
150 /* generic IPI for platform specific use */
151 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
152
150 /* IPI vectors for APIC spurious and error interrupts */ 153 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 154 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 155 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e948b28a5a9..4558dd3918c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,6 +193,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
193 kprobe_opcode_t opcode; 193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes; 194 kprobe_opcode_t *orig_opcodes = opcodes;
195 195
196 if (search_exception_tables(opcodes))
197 return 0; /* Page fault may occur on this address. */
198
196retry: 199retry:
197 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) 200 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
198 return 0; 201 return 0;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a6..e7368c1da01 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h> 15#include <linux/suspend.h>
16#include <linux/gfp.h> 16#include <linux/gfp.h>
17#include <linux/io.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19#include <asm/pgalloc.h> 20#include <asm/pgalloc.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/io.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/cpufeature.h> 24#include <asm/cpufeature.h>
25#include <asm/desc.h> 25#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
63 "\tmovl %%eax,%%fs\n" 63 "\tmovl %%eax,%%fs\n"
64 "\tmovl %%eax,%%gs\n" 64 "\tmovl %%eax,%%gs\n"
65 "\tmovl %%eax,%%ss\n" 65 "\tmovl %%eax,%%ss\n"
66 ::: "eax", "memory"); 66 : : : "eax", "memory");
67#undef STR 67#undef STR
68#undef __STR 68#undef __STR
69} 69}
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
208 /* We need to put APICs in legacy mode so that we can 208 /*
209 * We need to put APICs in legacy mode so that we can
209 * get timer interrupts in second kernel. kexec/kdump 210 * get timer interrupts in second kernel. kexec/kdump
210 * paths already have calls to disable_IO_APIC() in 211 * paths already have calls to disable_IO_APIC() in
211 * one form or other. kexec jump path also need 212 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
227 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) 228 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
228 << PAGE_SHIFT); 229 << PAGE_SHIFT);
229 230
230 /* The segment registers are funny things, they have both a 231 /*
232 * The segment registers are funny things, they have both a
231 * visible and an invisible part. Whenever the visible part is 233 * visible and an invisible part. Whenever the visible part is
232 * set to a specific selector, the invisible part is loaded 234 * set to a specific selector, the invisible part is loaded
233 * with from a table in memory. At no other time is the 235 * with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
237 * segments, before I zap the gdt with an invalid value. 239 * segments, before I zap the gdt with an invalid value.
238 */ 240 */
239 load_segments(); 241 load_segments();
240 /* The gdt & idt are now invalid. 242 /*
243 * The gdt & idt are now invalid.
241 * If you want to load them you must set up your own idt & gdt. 244 * If you want to load them you must set up your own idt & gdt.
242 */ 245 */
243 set_gdt(phys_to_virt(0),0); 246 set_gdt(phys_to_virt(0), 0);
244 set_idt(phys_to_virt(0),0); 247 set_idt(phys_to_virt(0), 0);
245 248
246 /* now call it */ 249 /* now call it */
247 image->start = relocate_kernel_ptr((unsigned long)image->head, 250 image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd..89cea4d4467 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,47 @@
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/io.h>
16#include <linux/suspend.h>
15 17
16#include <asm/pgtable.h> 18#include <asm/pgtable.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19#include <asm/io.h> 21
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr)
24{
25 pud_t *pud;
26 pmd_t *pmd;
27 struct page *page;
28 int result = -ENOMEM;
29
30 addr &= PMD_MASK;
31 pgd += pgd_index(addr);
32 if (!pgd_present(*pgd)) {
33 page = kimage_alloc_control_pages(image, 0);
34 if (!page)
35 goto out;
36 pud = (pud_t *)page_address(page);
37 memset(pud, 0, PAGE_SIZE);
38 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
39 }
40 pud = pud_offset(pgd, addr);
41 if (!pud_present(*pud)) {
42 page = kimage_alloc_control_pages(image, 0);
43 if (!page)
44 goto out;
45 pmd = (pmd_t *)page_address(page);
46 memset(pmd, 0, PAGE_SIZE);
47 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
48 }
49 pmd = pmd_offset(pud, addr);
50 if (!pmd_present(*pmd))
51 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
52 result = 0;
53out:
54 return result;
55}
20 56
21static void init_level2_page(pmd_t *level2p, unsigned long addr) 57static void init_level2_page(pmd_t *level2p, unsigned long addr)
22{ 58{
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
83 } 119 }
84 level3p = (pud_t *)page_address(page); 120 level3p = (pud_t *)page_address(page);
85 result = init_level3_page(image, level3p, addr, last_addr); 121 result = init_level3_page(image, level3p, addr, last_addr);
86 if (result) { 122 if (result)
87 goto out; 123 goto out;
88 }
89 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); 124 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
90 addr += PGDIR_SIZE; 125 addr += PGDIR_SIZE;
91 } 126 }
@@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
156 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 191 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
157 if (result) 192 if (result)
158 return result; 193 return result;
194 /*
195 * image->start may be outside 0 ~ max_pfn, for example when
196 * jump back to original kernel from kexeced kernel
197 */
198 result = init_one_level2_page(image, level4p, image->start);
199 if (result)
200 return result;
159 return init_transition_pgtable(image, level4p); 201 return init_transition_pgtable(image, level4p);
160} 202}
161 203
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
229{ 271{
230 unsigned long page_list[PAGES_NR]; 272 unsigned long page_list[PAGES_NR];
231 void *control_page; 273 void *control_page;
274 int save_ftrace_enabled;
232 275
233 tracer_disable(); 276#ifdef CONFIG_KEXEC_JUMP
277 if (kexec_image->preserve_context)
278 save_processor_state();
279#endif
280
281 save_ftrace_enabled = __ftrace_enabled_save();
234 282
235 /* Interrupts aren't acceptable while we reboot */ 283 /* Interrupts aren't acceptable while we reboot */
236 local_irq_disable(); 284 local_irq_disable();
237 285
286 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC
288 /*
289 * We need to put APICs in legacy mode so that we can
290 * get timer interrupts in second kernel. kexec/kdump
291 * paths already have calls to disable_IO_APIC() in
292 * one form or other. kexec jump path also need
293 * one.
294 */
295 disable_IO_APIC();
296#endif
297 }
298
238 control_page = page_address(image->control_code_page) + PAGE_SIZE; 299 control_page = page_address(image->control_code_page) + PAGE_SIZE;
239 memcpy(control_page, relocate_kernel, PAGE_SIZE); 300 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
240 301
241 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); 302 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
303 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
242 page_list[PA_TABLE_PAGE] = 304 page_list[PA_TABLE_PAGE] =
243 (unsigned long)__pa(page_address(image->control_code_page)); 305 (unsigned long)__pa(page_address(image->control_code_page));
244 306
245 /* The segment registers are funny things, they have both a 307 if (image->type == KEXEC_TYPE_DEFAULT)
308 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
309 << PAGE_SHIFT);
310
311 /*
312 * The segment registers are funny things, they have both a
246 * visible and an invisible part. Whenever the visible part is 313 * visible and an invisible part. Whenever the visible part is
247 * set to a specific selector, the invisible part is loaded 314 * set to a specific selector, the invisible part is loaded
248 * with from a table in memory. At no other time is the 315 * with from a table in memory. At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
252 * segments, before I zap the gdt with an invalid value. 319 * segments, before I zap the gdt with an invalid value.
253 */ 320 */
254 load_segments(); 321 load_segments();
255 /* The gdt & idt are now invalid. 322 /*
323 * The gdt & idt are now invalid.
256 * If you want to load them you must set up your own idt & gdt. 324 * If you want to load them you must set up your own idt & gdt.
257 */ 325 */
258 set_gdt(phys_to_virt(0),0); 326 set_gdt(phys_to_virt(0), 0);
259 set_idt(phys_to_virt(0),0); 327 set_idt(phys_to_virt(0), 0);
260 328
261 /* now call it */ 329 /* now call it */
262 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 330 image->start = relocate_kernel((unsigned long)image->head,
263 image->start); 331 (unsigned long)page_list,
332 image->start,
333 image->preserve_context);
334
335#ifdef CONFIG_KEXEC_JUMP
336 if (kexec_image->preserve_context)
337 restore_processor_state();
338#endif
339
340 __ftrace_enabled_restore(save_ftrace_enabled);
264} 341}
265 342
266void arch_crash_save_vmcoreinfo(void) 343void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f..712d15fdc41 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
226 return 0; 226 return 0;
227} 227}
228 228
229static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { 229static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
230 { 230 {
231 .callback = set_check_enable_amd_mmconf, 231 .callback = set_check_enable_amd_mmconf,
232 .ident = "Sun Microsystems Machine", 232 .ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1ba..47673e02ae5 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
558 558
559static struct mpf_intel *mpf_found; 559static struct mpf_intel *mpf_found;
560 560
561static unsigned long __init get_mpc_size(unsigned long physptr)
562{
563 struct mpc_table *mpc;
564 unsigned long size;
565
566 mpc = early_ioremap(physptr, PAGE_SIZE);
567 size = mpc->length;
568 early_iounmap(mpc, PAGE_SIZE);
569 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
570
571 return size;
572}
573
561/* 574/*
562 * Scan the memory blocks for an SMP configuration block. 575 * Scan the memory blocks for an SMP configuration block.
563 */ 576 */
@@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early)
611 construct_default_ISA_mptable(mpf->feature1); 624 construct_default_ISA_mptable(mpf->feature1);
612 625
613 } else if (mpf->physptr) { 626 } else if (mpf->physptr) {
627 struct mpc_table *mpc;
628 unsigned long size;
614 629
630 size = get_mpc_size(mpf->physptr);
631 mpc = early_ioremap(mpf->physptr, size);
615 /* 632 /*
616 * Read the physical hardware table. Anything here will 633 * Read the physical hardware table. Anything here will
617 * override the defaults. 634 * override the defaults.
618 */ 635 */
619 if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { 636 if (!smp_read_mpc(mpc, early)) {
620#ifdef CONFIG_X86_LOCAL_APIC 637#ifdef CONFIG_X86_LOCAL_APIC
621 smp_found_config = 0; 638 smp_found_config = 0;
622#endif 639#endif
@@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
624 "BIOS bug, MP table errors detected!...\n"); 641 "BIOS bug, MP table errors detected!...\n");
625 printk(KERN_ERR "... disabling SMP support. " 642 printk(KERN_ERR "... disabling SMP support. "
626 "(tell your hw vendor)\n"); 643 "(tell your hw vendor)\n");
644 early_iounmap(mpc, size);
627 return; 645 return;
628 } 646 }
647 early_iounmap(mpc, size);
629 648
630 if (early) 649 if (early)
631 return; 650 return;
@@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
697 716
698 if (!reserve) 717 if (!reserve)
699 return 1; 718 return 1;
700 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 719 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
701 BOOTMEM_DEFAULT); 720 BOOTMEM_DEFAULT);
702 if (mpf->physptr) { 721 if (mpf->physptr) {
703 unsigned long size = PAGE_SIZE; 722 unsigned long size = get_mpc_size(mpf->physptr);
704#ifdef CONFIG_X86_32 723#ifdef CONFIG_X86_32
705 /* 724 /*
706 * We cannot access to MPC table to compute 725 * We cannot access to MPC table to compute
@@ -871,12 +890,12 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
871#ifdef CONFIG_X86_IO_APIC 890#ifdef CONFIG_X86_IO_APIC
872 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 891 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
873 892
874 printk(KERN_INFO "OLD "); 893 apic_printk(APIC_VERBOSE, "OLD ");
875 print_MP_intsrc_info(m); 894 print_MP_intsrc_info(m);
876 i = get_MP_intsrc_index(m); 895 i = get_MP_intsrc_index(m);
877 if (i > 0) { 896 if (i > 0) {
878 assign_to_mpc_intsrc(&mp_irqs[i], m); 897 assign_to_mpc_intsrc(&mp_irqs[i], m);
879 printk(KERN_INFO "NEW "); 898 apic_printk(APIC_VERBOSE, "NEW ");
880 print_mp_irq_info(&mp_irqs[i]); 899 print_mp_irq_info(&mp_irqs[i]);
881 } else if (!i) { 900 } else if (!i) {
882 /* legacy, do nothing */ 901 /* legacy, do nothing */
@@ -924,7 +943,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
924 continue; 943 continue;
925 944
926 if (nr_m_spare > 0) { 945 if (nr_m_spare > 0) {
927 printk(KERN_INFO "*NEW* found "); 946 apic_printk(APIC_VERBOSE, "*NEW* found\n");
928 nr_m_spare--; 947 nr_m_spare--;
929 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 948 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
930 m_spare[nr_m_spare] = NULL; 949 m_spare[nr_m_spare] = NULL;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3d9672e59c1..19378715f41 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child,
685 if (!cfg.signal) 685 if (!cfg.signal)
686 return -EINVAL; 686 return -EINVAL;
687 687
688 return -EOPNOTSUPP;
689
690 child->thread.bts_ovfl_signal = cfg.signal; 688 child->thread.bts_ovfl_signal = cfg.signal;
689 return -EOPNOTSUPP;
691 } 690 }
692 691
693 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && 692 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c..6a5a2970f4c 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
74 if (!force_hpet_address) 74 if (!force_hpet_address)
75 return; 75 return;
76 76
77 if (rcba_base == NULL) 77 BUG_ON(rcba_base == NULL);
78 BUG();
79 78
80 /* read the Function Disable register, dword mode only */ 79 /* read the Function Disable register, dword mode only */
81 val = readl(rcba_base + 0x3404); 80 val = readl(rcba_base + 0x3404);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bb..2aef36d8aca 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), 216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
217 }, 217 },
218 }, 218 },
219 { /* Handle problems with rebooting on Dell XPS710 */
220 .callback = set_bios_reboot,
221 .ident = "Dell XPS710",
222 .matches = {
223 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
224 DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
225 },
226 },
219 { } 227 { }
220}; 228};
221 229
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d2..41235531b11 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
17 17
18#define PTR(x) (x << 2) 18#define PTR(x) (x << 2)
19 19
20/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE 20/*
21 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
21 * ~ control_page + PAGE_SIZE are used as data storage and stack for 22 * ~ control_page + PAGE_SIZE are used as data storage and stack for
22 * jumping back 23 * jumping back
23 */ 24 */
@@ -76,8 +77,10 @@ relocate_kernel:
76 movl %eax, CP_PA_SWAP_PAGE(%edi) 77 movl %eax, CP_PA_SWAP_PAGE(%edi)
77 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) 78 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
78 79
79 /* get physical address of control page now */ 80 /*
80 /* this is impossible after page table switch */ 81 * get physical address of control page now
82 * this is impossible after page table switch
83 */
81 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 84 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
82 85
83 /* switch to new set of page tables */ 86 /* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
97 /* store the start address on the stack */ 100 /* store the start address on the stack */
98 pushl %edx 101 pushl %edx
99 102
100 /* Set cr0 to a known state: 103 /*
104 * Set cr0 to a known state:
101 * - Paging disabled 105 * - Paging disabled
102 * - Alignment check disabled 106 * - Alignment check disabled
103 * - Write protect disabled 107 * - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
113 /* clear cr4 if applicable */ 117 /* clear cr4 if applicable */
114 testl %ecx, %ecx 118 testl %ecx, %ecx
115 jz 1f 119 jz 1f
116 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
117 * Setting everything to zero seems safe. 122 * Setting everything to zero seems safe.
118 */ 123 */
119 xorl %eax, %eax 124 xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
132 call swap_pages 137 call swap_pages
133 addl $8, %esp 138 addl $8, %esp
134 139
135 /* To be certain of avoiding problems with self-modifying code 140 /*
141 * To be certain of avoiding problems with self-modifying code
136 * I need to execute a serializing instruction here. 142 * I need to execute a serializing instruction here.
137 * So I flush the TLB, it's handy, and not processor dependent. 143 * So I flush the TLB, it's handy, and not processor dependent.
138 */ 144 */
139 xorl %eax, %eax 145 xorl %eax, %eax
140 movl %eax, %cr3 146 movl %eax, %cr3
141 147
142 /* set all of the registers to known values */ 148 /*
143 /* leave %esp alone */ 149 * set all of the registers to known values
150 * leave %esp alone
151 */
144 152
145 testl %esi, %esi 153 testl %esi, %esi
146 jnz 1f 154 jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a47..4de8f5b3d47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,29 +19,77 @@
19#define PTR(x) (x << 3) 19#define PTR(x) (x << 3)
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21 21
22/*
23 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define RSP DATA(0x0)
31#define CR0 DATA(0x8)
32#define CR3 DATA(0x10)
33#define CR4 DATA(0x18)
34
35/* other data */
36#define CP_PA_TABLE_PAGE DATA(0x20)
37#define CP_PA_SWAP_PAGE DATA(0x28)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
39
22 .text 40 .text
23 .align PAGE_SIZE 41 .align PAGE_SIZE
24 .code64 42 .code64
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 /* %rdi indirection_page 45 /*
46 * %rdi indirection_page
28 * %rsi page_list 47 * %rsi page_list
29 * %rdx start address 48 * %rdx start address
49 * %rcx preserve_context
30 */ 50 */
31 51
52 /* Save the CPU context, used for jumping back */
53 pushq %rbx
54 pushq %rbp
55 pushq %r12
56 pushq %r13
57 pushq %r14
58 pushq %r15
59 pushf
60
61 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
62 movq %rsp, RSP(%r11)
63 movq %cr0, %rax
64 movq %rax, CR0(%r11)
65 movq %cr3, %rax
66 movq %rax, CR3(%r11)
67 movq %cr4, %rax
68 movq %rax, CR4(%r11)
69
32 /* zero out flags, and disable interrupts */ 70 /* zero out flags, and disable interrupts */
33 pushq $0 71 pushq $0
34 popfq 72 popfq
35 73
36 /* get physical address of control page now */ 74 /*
37 /* this is impossible after page table switch */ 75 * get physical address of control page now
76 * this is impossible after page table switch
77 */
38 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 78 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
39 79
40 /* get physical address of page table now too */ 80 /* get physical address of page table now too */
41 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx 81 movq PTR(PA_TABLE_PAGE)(%rsi), %r9
82
83 /* get physical address of swap page now */
84 movq PTR(PA_SWAP_PAGE)(%rsi), %r10
85
86 /* save some information for jumping back */
87 movq %r9, CP_PA_TABLE_PAGE(%r11)
88 movq %r10, CP_PA_SWAP_PAGE(%r11)
89 movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
42 90
43 /* Switch to the identity mapped page tables */ 91 /* Switch to the identity mapped page tables */
44 movq %rcx, %cr3 92 movq %r9, %cr3
45 93
46 /* setup a new stack at the end of the physical control page */ 94 /* setup a new stack at the end of the physical control page */
47 lea PAGE_SIZE(%r8), %rsp 95 lea PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
55 /* store the start address on the stack */ 103 /* store the start address on the stack */
56 pushq %rdx 104 pushq %rdx
57 105
58 /* Set cr0 to a known state: 106 /*
107 * Set cr0 to a known state:
59 * - Paging enabled 108 * - Paging enabled
60 * - Alignment check disabled 109 * - Alignment check disabled
61 * - Write protect disabled 110 * - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
68 orl $(X86_CR0_PG | X86_CR0_PE), %eax 117 orl $(X86_CR0_PG | X86_CR0_PE), %eax
69 movq %rax, %cr0 118 movq %rax, %cr0
70 119
71 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
72 * - physical address extension enabled 122 * - physical address extension enabled
73 */ 123 */
74 movq $X86_CR4_PAE, %rax 124 movq $X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
781: 1281:
79 129
80 /* Flush the TLB (needed?) */ 130 /* Flush the TLB (needed?) */
81 movq %rcx, %cr3 131 movq %r9, %cr3
132
133 movq %rcx, %r11
134 call swap_pages
135
136 /*
137 * To be certain of avoiding problems with self-modifying code
138 * I need to execute a serializing instruction here.
139 * So I flush the TLB by reloading %cr3 here, it's handy,
140 * and not processor dependent.
141 */
142 movq %cr3, %rax
143 movq %rax, %cr3
144
145 /*
146 * set all of the registers to known values
147 * leave %rsp alone
148 */
149
150 testq %r11, %r11
151 jnz 1f
152 xorq %rax, %rax
153 xorq %rbx, %rbx
154 xorq %rcx, %rcx
155 xorq %rdx, %rdx
156 xorq %rsi, %rsi
157 xorq %rdi, %rdi
158 xorq %rbp, %rbp
159 xorq %r8, %r8
160 xorq %r9, %r9
161 xorq %r10, %r9
162 xorq %r11, %r11
163 xorq %r12, %r12
164 xorq %r13, %r13
165 xorq %r14, %r14
166 xorq %r15, %r15
167
168 ret
169
1701:
171 popq %rdx
172 leaq PAGE_SIZE(%r10), %rsp
173 call *%rdx
174
175 /* get the re-entry point of the peer system */
176 movq 0(%rsp), %rbp
177 call 1f
1781:
179 popq %r8
180 subq $(1b - relocate_kernel), %r8
181 movq CP_PA_SWAP_PAGE(%r8), %r10
182 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
183 movq CP_PA_TABLE_PAGE(%r8), %rax
184 movq %rax, %cr3
185 lea PAGE_SIZE(%r8), %rsp
186 call swap_pages
187 movq $virtual_mapped, %rax
188 pushq %rax
189 ret
190
191virtual_mapped:
192 movq RSP(%r8), %rsp
193 movq CR4(%r8), %rax
194 movq %rax, %cr4
195 movq CR3(%r8), %rax
196 movq CR0(%r8), %r8
197 movq %rax, %cr3
198 movq %r8, %cr0
199 movq %rbp, %rax
200
201 popf
202 popq %r15
203 popq %r14
204 popq %r13
205 popq %r12
206 popq %rbp
207 popq %rbx
208 ret
82 209
83 /* Do the copies */ 210 /* Do the copies */
211swap_pages:
84 movq %rdi, %rcx /* Put the page_list in %rcx */ 212 movq %rdi, %rcx /* Put the page_list in %rcx */
85 xorq %rdi, %rdi 213 xorq %rdi, %rdi
86 xorq %rsi, %rsi 214 xorq %rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
112 movq %rcx, %rsi /* For ever source page do a copy */ 240 movq %rcx, %rsi /* For ever source page do a copy */
113 andq $0xfffffffffffff000, %rsi 241 andq $0xfffffffffffff000, %rsi
114 242
243 movq %rdi, %rdx
244 movq %rsi, %rax
245
246 movq %r10, %rdi
115 movq $512, %rcx 247 movq $512, %rcx
116 rep ; movsq 248 rep ; movsq
117 jmp 0b
1183:
119
120 /* To be certain of avoiding problems with self-modifying code
121 * I need to execute a serializing instruction here.
122 * So I flush the TLB by reloading %cr3 here, it's handy,
123 * and not processor dependent.
124 */
125 movq %cr3, %rax
126 movq %rax, %cr3
127 249
128 /* set all of the registers to known values */ 250 movq %rax, %rdi
129 /* leave %rsp alone */ 251 movq %rdx, %rsi
252 movq $512, %rcx
253 rep ; movsq
130 254
131 xorq %rax, %rax 255 movq %rdx, %rdi
132 xorq %rbx, %rbx 256 movq %r10, %rsi
133 xorq %rcx, %rcx 257 movq $512, %rcx
134 xorq %rdx, %rdx 258 rep ; movsq
135 xorq %rsi, %rsi
136 xorq %rdi, %rdi
137 xorq %rbp, %rbp
138 xorq %r8, %r8
139 xorq %r9, %r9
140 xorq %r10, %r9
141 xorq %r11, %r11
142 xorq %r12, %r12
143 xorq %r13, %r13
144 xorq %r14, %r14
145 xorq %r15, %r15
146 259
260 lea PAGE_SIZE(%rax), %rsi
261 jmp 0b
2623:
147 ret 263 ret
264
265 .globl kexec_control_code_size
266.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4c54bc0d8ff..a0d26237d7c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,8 +112,13 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115RESERVE_BRK(dmi_alloc, 65536);
116
115unsigned int boot_cpu_id __read_mostly; 117unsigned int boot_cpu_id __read_mostly;
116 118
119static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
120unsigned long _brk_end = (unsigned long)__brk_base;
121
117#ifdef CONFIG_X86_64 122#ifdef CONFIG_X86_64
118int default_cpu_present_to_apicid(int mps_cpu) 123int default_cpu_present_to_apicid(int mps_cpu)
119{ 124{
@@ -158,12 +163,6 @@ static struct resource bss_resource = {
158 163
159 164
160#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
161/* This value is set up by the early boot code to point to the value
162 immediately after the boot time page tables. It contains a *physical*
163 address, and must not be in the .bss segment! */
164unsigned long init_pg_tables_start __initdata = ~0UL;
165unsigned long init_pg_tables_end __initdata = ~0UL;
166
167static struct resource video_ram_resource = { 166static struct resource video_ram_resource = {
168 .name = "Video RAM area", 167 .name = "Video RAM area",
169 .start = 0xa0000, 168 .start = 0xa0000,
@@ -202,7 +201,9 @@ struct ist_info ist_info;
202#endif 201#endif
203 202
204#else 203#else
205struct cpuinfo_x86 boot_cpu_data __read_mostly; 204struct cpuinfo_x86 boot_cpu_data __read_mostly = {
205 .x86_phys_bits = MAX_PHYSMEM_BITS,
206};
206EXPORT_SYMBOL(boot_cpu_data); 207EXPORT_SYMBOL(boot_cpu_data);
207#endif 208#endif
208 209
@@ -217,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
217int bootloader_type; 218int bootloader_type;
218 219
219/* 220/*
220 * Early DMI memory
221 */
222int dmi_alloc_index;
223char dmi_alloc_data[DMI_MAX_DATA];
224
225/*
226 * Setup options 221 * Setup options
227 */ 222 */
228struct screen_info screen_info; 223struct screen_info screen_info;
@@ -267,6 +262,35 @@ static inline void copy_edd(void)
267} 262}
268#endif 263#endif
269 264
265void * __init extend_brk(size_t size, size_t align)
266{
267 size_t mask = align - 1;
268 void *ret;
269
270 BUG_ON(_brk_start == 0);
271 BUG_ON(align & mask);
272
273 _brk_end = (_brk_end + mask) & ~mask;
274 BUG_ON((char *)(_brk_end + size) > __brk_limit);
275
276 ret = (void *)_brk_end;
277 _brk_end += size;
278
279 memset(ret, 0, size);
280
281 return ret;
282}
283
284static void __init reserve_brk(void)
285{
286 if (_brk_end > _brk_start)
287 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
288
289 /* Mark brk area as locked down and no longer taking any
290 new allocations */
291 _brk_start = 0;
292}
293
270#ifdef CONFIG_BLK_DEV_INITRD 294#ifdef CONFIG_BLK_DEV_INITRD
271 295
272#ifdef CONFIG_X86_32 296#ifdef CONFIG_X86_32
@@ -715,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
715 init_mm.start_code = (unsigned long) _text; 739 init_mm.start_code = (unsigned long) _text;
716 init_mm.end_code = (unsigned long) _etext; 740 init_mm.end_code = (unsigned long) _etext;
717 init_mm.end_data = (unsigned long) _edata; 741 init_mm.end_data = (unsigned long) _edata;
718#ifdef CONFIG_X86_32 742 init_mm.brk = _brk_end;
719 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
720#else
721 init_mm.brk = (unsigned long) &_end;
722#endif
723 743
724 code_resource.start = virt_to_phys(_text); 744 code_resource.start = virt_to_phys(_text);
725 code_resource.end = virt_to_phys(_etext)-1; 745 code_resource.end = virt_to_phys(_etext)-1;
@@ -770,6 +790,9 @@ void __init setup_arch(char **cmdline_p)
770 790
771 finish_e820_parsing(); 791 finish_e820_parsing();
772 792
793 if (efi_enabled)
794 efi_init();
795
773 dmi_scan_machine(); 796 dmi_scan_machine();
774 797
775 dmi_check_system(bad_bios_dmi_table); 798 dmi_check_system(bad_bios_dmi_table);
@@ -789,8 +812,6 @@ void __init setup_arch(char **cmdline_p)
789 insert_resource(&iomem_resource, &data_resource); 812 insert_resource(&iomem_resource, &data_resource);
790 insert_resource(&iomem_resource, &bss_resource); 813 insert_resource(&iomem_resource, &bss_resource);
791 814
792 if (efi_enabled)
793 efi_init();
794 815
795#ifdef CONFIG_X86_32 816#ifdef CONFIG_X86_32
796 if (ppro_with_ram_bug()) { 817 if (ppro_with_ram_bug()) {
@@ -839,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
839 setup_bios_corruption_check(); 860 setup_bios_corruption_check();
840#endif 861#endif
841 862
863 reserve_brk();
864
842 /* max_pfn_mapped is updated here */ 865 /* max_pfn_mapped is updated here */
843 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 866 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
844 max_pfn_mapped = max_low_pfn_mapped; 867 max_pfn_mapped = max_low_pfn_mapped;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff73..400331b50a5 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
@@ -41,6 +42,309 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
41}; 42};
42EXPORT_SYMBOL(__per_cpu_offset); 43EXPORT_SYMBOL(__per_cpu_offset);
43 44
45/*
46 * On x86_64 symbols referenced from code should be reachable using
47 * 32bit relocations. Reserve space for static percpu variables in
48 * modules so that they are always served from the first chunk which
49 * is located at the percpu segment base. On x86_32, anything can
50 * address anywhere. No need to reserve space in the first chunk.
51 */
52#ifdef CONFIG_X86_64
53#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
54#else
55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif
57
58/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 *
61 * If NUMA is not configured or there is only one NUMA node available,
62 * there is no reason to consider NUMA. This function determines
63 * whether percpu allocation should consider NUMA or not.
64 *
65 * RETURNS:
66 * true if NUMA should be considered; otherwise, false.
67 */
68static bool __init pcpu_need_numa(void)
69{
70#ifdef CONFIG_NEED_MULTIPLE_NODES
71 pg_data_t *last = NULL;
72 unsigned int cpu;
73
74 for_each_possible_cpu(cpu) {
75 int node = early_cpu_to_node(cpu);
76
77 if (node_online(node) && NODE_DATA(node) &&
78 last && last != NODE_DATA(node))
79 return true;
80
81 last = NODE_DATA(node);
82 }
83#endif
84 return false;
85}
86
87/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
89 * @cpu: cpu to allocate for
90 * @size: size allocation in bytes
91 * @align: alignment
92 *
93 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
94 * does the right thing for NUMA regardless of the current
95 * configuration.
96 *
97 * RETURNS:
98 * Pointer to the allocated area on success, NULL on failure.
99 */
100static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
101 unsigned long align)
102{
103 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
104#ifdef CONFIG_NEED_MULTIPLE_NODES
105 int node = early_cpu_to_node(cpu);
106 void *ptr;
107
108 if (!node_online(node) || !NODE_DATA(node)) {
109 ptr = __alloc_bootmem_nopanic(size, align, goal);
110 pr_info("cpu %d has no node %d or node-local memory\n",
111 cpu, node);
112 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
113 cpu, size, __pa(ptr));
114 } else {
115 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
116 size, align, goal);
117 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
118 "%016lx\n", cpu, size, node, __pa(ptr));
119 }
120 return ptr;
121#else
122 return __alloc_bootmem_nopanic(size, align, goal);
123#endif
124}
125
126/*
127 * Remap allocator
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES
140static size_t pcpur_size __initdata;
141static void **pcpur_ptrs __initdata;
142
143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
144{
145 size_t off = (size_t)pageno << PAGE_SHIFT;
146
147 if (off >= pcpur_size)
148 return NULL;
149
150 return virt_to_page(pcpur_ptrs[cpu] + off);
151}
152
153static ssize_t __init setup_pcpu_remap(size_t static_size)
154{
155 static struct vm_struct vm;
156 pg_data_t *last;
157 size_t ptrs_size, dyn_size;
158 unsigned int cpu;
159 ssize_t ret;
160
161 /*
162 * If large page isn't supported, there's no benefit in doing
163 * this. Also, on non-NUMA, embedding is better.
164 */
165 if (!cpu_has_pse || pcpu_need_numa())
166 return -EINVAL;
167
168 last = NULL;
169 for_each_possible_cpu(cpu) {
170 int node = early_cpu_to_node(cpu);
171
172 if (node_online(node) && NODE_DATA(node) &&
173 last && last != NODE_DATA(node))
174 goto proceed;
175
176 last = NODE_DATA(node);
177 }
178 return -EINVAL;
179
180proceed:
181 /*
182 * Currently supports only single page. Supporting multiple
183 * pages won't be too difficult if it ever becomes necessary.
184 */
185 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
186 PERCPU_DYNAMIC_RESERVE);
187 if (pcpur_size > PMD_SIZE) {
188 pr_warning("PERCPU: static data is larger than large page, "
189 "can't use large page\n");
190 return -EINVAL;
191 }
192 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
193
194 /* allocate pointer array and alloc large pages */
195 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
196 pcpur_ptrs = alloc_bootmem(ptrs_size);
197
198 for_each_possible_cpu(cpu) {
199 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
200 if (!pcpur_ptrs[cpu])
201 goto enomem;
202
203 /*
204 * Only use pcpur_size bytes and give back the rest.
205 *
206 * Ingo: The 2MB up-rounding bootmem is needed to make
207 * sure the partial 2MB page is still fully RAM - it's
208 * not well-specified to have a PAT-incompatible area
209 * (unmapped RAM, device memory, etc.) in that hole.
210 */
211 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
212 PMD_SIZE - pcpur_size);
213
214 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
215 }
216
217 /* allocate address and map */
218 vm.flags = VM_ALLOC;
219 vm.size = num_possible_cpus() * PMD_SIZE;
220 vm_area_register_early(&vm, PMD_SIZE);
221
222 for_each_possible_cpu(cpu) {
223 pmd_t *pmd;
224
225 pmd = populate_extra_pmd((unsigned long)vm.addr
226 + cpu * PMD_SIZE);
227 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
228 PAGE_KERNEL_LARGE));
229 }
230
231 /* we're ready, commit */
232 pr_info("PERCPU: Remapped at %p with large pages, static data "
233 "%zu bytes\n", vm.addr, static_size);
234
235 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
236 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
237 PMD_SIZE, vm.addr, NULL);
238 goto out_free_ar;
239
240enomem:
241 for_each_possible_cpu(cpu)
242 if (pcpur_ptrs[cpu])
243 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
244 ret = -ENOMEM;
245out_free_ar:
246 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
247 return ret;
248}
249#else
250static ssize_t __init setup_pcpu_remap(size_t static_size)
251{
252 return -EINVAL;
253}
254#endif
255
256/*
257 * Embedding allocator
258 *
259 * The first chunk is sized to just contain the static area plus
260 * module and dynamic reserves and embedded into linear physical
261 * mapping so that it can use PMD mapping without additional TLB
262 * pressure.
263 */
264static ssize_t __init setup_pcpu_embed(size_t static_size)
265{
266 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
267
268 /*
269 * If large page isn't supported, there's no benefit in doing
270 * this. Also, embedding allocation doesn't play well with
271 * NUMA.
272 */
273 if (!cpu_has_pse || pcpu_need_numa())
274 return -EINVAL;
275
276 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
277 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
278}
279
280/*
281 * 4k page allocator
282 *
283 * This is the basic allocator. Static percpu area is allocated
284 * page-by-page and most of initialization is done by the generic
285 * setup function.
286 */
287static struct page **pcpu4k_pages __initdata;
288static int pcpu4k_nr_static_pages __initdata;
289
290static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
291{
292 if (pageno < pcpu4k_nr_static_pages)
293 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
294 return NULL;
295}
296
297static void __init pcpu4k_populate_pte(unsigned long addr)
298{
299 populate_extra_pte(addr);
300}
301
302static ssize_t __init setup_pcpu_4k(size_t static_size)
303{
304 size_t pages_size;
305 unsigned int cpu;
306 int i, j;
307 ssize_t ret;
308
309 pcpu4k_nr_static_pages = PFN_UP(static_size);
310
311 /* unaligned allocations can't be freed, round up to page size */
312 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
313 * sizeof(pcpu4k_pages[0]));
314 pcpu4k_pages = alloc_bootmem(pages_size);
315
316 /* allocate and copy */
317 j = 0;
318 for_each_possible_cpu(cpu)
319 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
320 void *ptr;
321
322 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
323 if (!ptr)
324 goto enomem;
325
326 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
327 pcpu4k_pages[j++] = virt_to_page(ptr);
328 }
329
330 /* we're ready, commit */
331 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
332 pcpu4k_nr_static_pages, static_size);
333
334 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
335 PERCPU_FIRST_CHUNK_RESERVE, -1,
336 -1, NULL, pcpu4k_populate_pte);
337 goto out_free_ar;
338
339enomem:
340 while (--j >= 0)
341 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
342 ret = -ENOMEM;
343out_free_ar:
344 free_bootmem(__pa(pcpu4k_pages), pages_size);
345 return ret;
346}
347
44static inline void setup_percpu_segment(int cpu) 348static inline void setup_percpu_segment(int cpu)
45{ 349{
46#ifdef CONFIG_X86_32 350#ifdef CONFIG_X86_32
@@ -61,38 +365,35 @@ static inline void setup_percpu_segment(int cpu)
61 */ 365 */
62void __init setup_per_cpu_areas(void) 366void __init setup_per_cpu_areas(void)
63{ 367{
64 ssize_t size; 368 size_t static_size = __per_cpu_end - __per_cpu_start;
65 char *ptr; 369 unsigned int cpu;
66 int cpu; 370 unsigned long delta;
67 371 size_t pcpu_unit_size;
68 /* Copy section for each CPU (we discard the original) */ 372 ssize_t ret;
69 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
70 373
71 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 374 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
72 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 375 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
73 376
74 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 377 /*
378 * Allocate percpu area. If PSE is supported, try to make use
379 * of large page mappings. Please read comments on top of
380 * each allocator for details.
381 */
382 ret = setup_pcpu_remap(static_size);
383 if (ret < 0)
384 ret = setup_pcpu_embed(static_size);
385 if (ret < 0)
386 ret = setup_pcpu_4k(static_size);
387 if (ret < 0)
388 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
389 static_size, ret);
75 390
76 for_each_possible_cpu(cpu) { 391 pcpu_unit_size = ret;
77#ifndef CONFIG_NEED_MULTIPLE_NODES
78 ptr = alloc_bootmem_pages(size);
79#else
80 int node = early_cpu_to_node(cpu);
81 if (!node_online(node) || !NODE_DATA(node)) {
82 ptr = alloc_bootmem_pages(size);
83 pr_info("cpu %d has no node %d or node-local memory\n",
84 cpu, node);
85 pr_debug("per cpu data for cpu%d at %016lx\n",
86 cpu, __pa(ptr));
87 } else {
88 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
89 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
90 cpu, node, __pa(ptr));
91 }
92#endif
93 392
94 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); 393 /* alrighty, percpu areas up and running */
95 per_cpu_offset(cpu) = ptr - __per_cpu_start; 394 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
395 for_each_possible_cpu(cpu) {
396 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
96 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 397 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
97 per_cpu(cpu_number, cpu) = cpu; 398 per_cpu(cpu_number, cpu) = cpu;
98 setup_percpu_segment(cpu); 399 setup_percpu_segment(cpu);
@@ -125,8 +426,6 @@ void __init setup_per_cpu_areas(void)
125 */ 426 */
126 if (cpu == boot_cpu_id) 427 if (cpu == boot_cpu_id)
127 switch_to_new_gdt(cpu); 428 switch_to_new_gdt(cpu);
128
129 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
130 } 429 }
131 430
132 /* indicate the early static arrays will soon be gone */ 431 /* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f5080..ef7d10170c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
114 114
115atomic_t init_deasserted; 115atomic_t init_deasserted;
116 116
117
118/* Set if we find a B stepping CPU */
119static int __cpuinitdata smp_b_stepping;
120
121#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 117#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
122 118
123/* which logical CPUs are on which nodes */ 119/* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
271 cpumask_set_cpu(cpuid, cpu_callin_mask); 267 cpumask_set_cpu(cpuid, cpu_callin_mask);
272} 268}
273 269
274static int __cpuinitdata unsafe_smp;
275
276/* 270/*
277 * Activate a secondary processor. 271 * Activate a secondary processor.
278 */ 272 */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
340 cpu_idle(); 334 cpu_idle();
341} 335}
342 336
343static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
344{
345 /*
346 * Mask B, Pentium, but not Pentium MMX
347 */
348 if (c->x86_vendor == X86_VENDOR_INTEL &&
349 c->x86 == 5 &&
350 c->x86_mask >= 1 && c->x86_mask <= 4 &&
351 c->x86_model <= 3)
352 /*
353 * Remember we have B step Pentia with bugs
354 */
355 smp_b_stepping = 1;
356
357 /*
358 * Certain Athlons might work (for various values of 'work') in SMP
359 * but they are not certified as MP capable.
360 */
361 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
362
363 if (num_possible_cpus() == 1)
364 goto valid_k7;
365
366 /* Athlon 660/661 is valid. */
367 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
368 (c->x86_mask == 1)))
369 goto valid_k7;
370
371 /* Duron 670 is valid */
372 if ((c->x86_model == 7) && (c->x86_mask == 0))
373 goto valid_k7;
374
375 /*
376 * Athlon 662, Duron 671, and Athlon >model 7 have capability
377 * bit. It's worth noting that the A5 stepping (662) of some
378 * Athlon XP's have the MP bit set.
379 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
380 * more.
381 */
382 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
383 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
384 (c->x86_model > 7))
385 if (cpu_has_mp)
386 goto valid_k7;
387
388 /* If we get here, not a certified SMP capable AMD system. */
389 unsafe_smp = 1;
390 }
391
392valid_k7:
393 ;
394}
395
396static void __cpuinit smp_checks(void)
397{
398 if (smp_b_stepping)
399 printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
400 "with B stepping processors.\n");
401
402 /*
403 * Don't taint if we are running SMP kernel on a single non-MP
404 * approved Athlon
405 */
406 if (unsafe_smp && num_online_cpus() > 1) {
407 printk(KERN_INFO "WARNING: This combination of AMD"
408 "processors is not suitable for SMP.\n");
409 add_taint(TAINT_UNSAFE_SMP);
410 }
411}
412
413/* 337/*
414 * The bootstrap kernel entry code has set these up. Save them for 338 * The bootstrap kernel entry code has set these up. Save them for
415 * a given CPU 339 * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
423 c->cpu_index = id; 347 c->cpu_index = id;
424 if (id != 0) 348 if (id != 0)
425 identify_secondary_cpu(c); 349 identify_secondary_cpu(c);
426 smp_apply_quirks(c);
427} 350}
428 351
429 352
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1193 pr_debug("Boot done.\n"); 1116 pr_debug("Boot done.\n");
1194 1117
1195 impress_friends(); 1118 impress_friends();
1196 smp_checks();
1197#ifdef CONFIG_X86_IO_APIC 1119#ifdef CONFIG_X86_IO_APIC
1198 setup_ioapic_dest(); 1120 setup_ioapic_dest();
1199#endif 1121#endif
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f04549afcfe..79c07324728 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
314 int locals = 0; 314 int locals = 0;
315 struct bau_desc *bau_desc; 315 struct bau_desc *bau_desc;
316 316
317 WARN_ON(!in_atomic());
318
319 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 317 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
320 318
321 uv_cpu = uv_blade_processor_id(); 319 uv_cpu = uv_blade_processor_id();
@@ -752,7 +750,7 @@ static int __init uv_bau_init(void)
752 int node; 750 int node;
753 int nblades; 751 int nblades;
754 int last_blade; 752 int last_blade;
755 int cur_cpu = 0; 753 int cur_cpu;
756 754
757 if (!is_uv_system()) 755 if (!is_uv_system())
758 return 0; 756 return 0;
@@ -762,6 +760,7 @@ static int __init uv_bau_init(void)
762 uv_mmask = (1UL << uv_hub_info->n_val) - 1; 760 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
763 nblades = 0; 761 nblades = 0;
764 last_blade = -1; 762 last_blade = -1;
763 cur_cpu = 0;
765 for_each_online_node(node) { 764 for_each_online_node(node) {
766 blade = uv_node_to_blade_id(node); 765 blade = uv_node_to_blade_id(node);
767 if (blade == last_blade) 766 if (blade == last_blade)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 83d53ce5d4c..462b9ba67e9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -273,30 +273,43 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
273 * use the TSC value at the transitions to calculate a pretty 273 * use the TSC value at the transitions to calculate a pretty
274 * good value for the TSC frequencty. 274 * good value for the TSC frequencty.
275 */ 275 */
276static inline int pit_expect_msb(unsigned char val) 276static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
277{ 277{
278 int count = 0; 278 int count;
279 u64 tsc = 0;
279 280
280 for (count = 0; count < 50000; count++) { 281 for (count = 0; count < 50000; count++) {
281 /* Ignore LSB */ 282 /* Ignore LSB */
282 inb(0x42); 283 inb(0x42);
283 if (inb(0x42) != val) 284 if (inb(0x42) != val)
284 break; 285 break;
286 tsc = get_cycles();
285 } 287 }
286 return count > 50; 288 *deltap = get_cycles() - tsc;
289 *tscp = tsc;
290
291 /*
292 * We require _some_ success, but the quality control
293 * will be based on the error terms on the TSC values.
294 */
295 return count > 5;
287} 296}
288 297
289/* 298/*
290 * How many MSB values do we want to see? We aim for a 299 * How many MSB values do we want to see? We aim for
291 * 15ms calibration, which assuming a 2us counter read 300 * a maximum error rate of 500ppm (in practice the
292 * error should give us roughly 150 ppm precision for 301 * real error is much smaller), but refuse to spend
293 * the calibration. 302 * more than 25ms on it.
294 */ 303 */
295#define QUICK_PIT_MS 15 304#define MAX_QUICK_PIT_MS 25
296#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) 305#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
297 306
298static unsigned long quick_pit_calibrate(void) 307static unsigned long quick_pit_calibrate(void)
299{ 308{
309 int i;
310 u64 tsc, delta;
311 unsigned long d1, d2;
312
300 /* Set the Gate high, disable speaker */ 313 /* Set the Gate high, disable speaker */
301 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 314 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
302 315
@@ -315,45 +328,52 @@ static unsigned long quick_pit_calibrate(void)
315 outb(0xff, 0x42); 328 outb(0xff, 0x42);
316 outb(0xff, 0x42); 329 outb(0xff, 0x42);
317 330
318 if (pit_expect_msb(0xff)) { 331 /*
319 int i; 332 * The PIT starts counting at the next edge, so we
320 u64 t1, t2, delta; 333 * need to delay for a microsecond. The easiest way
321 unsigned char expect = 0xfe; 334 * to do that is to just read back the 16-bit counter
322 335 * once from the PIT.
323 t1 = get_cycles(); 336 */
324 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { 337 inb(0x42);
325 if (!pit_expect_msb(expect)) 338 inb(0x42);
326 goto failed; 339
340 if (pit_expect_msb(0xff, &tsc, &d1)) {
341 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
342 if (!pit_expect_msb(0xff-i, &delta, &d2))
343 break;
344
345 /*
346 * Iterate until the error is less than 500 ppm
347 */
348 delta -= tsc;
349 if (d1+d2 < delta >> 11)
350 goto success;
327 } 351 }
328 t2 = get_cycles();
329
330 /*
331 * Make sure we can rely on the second TSC timestamp:
332 */
333 if (!pit_expect_msb(expect))
334 goto failed;
335
336 /*
337 * Ok, if we get here, then we've seen the
338 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
339 * times, and each MSB had many hits, so we never
340 * had any sudden jumps.
341 *
342 * As a result, we can depend on there not being
343 * any odd delays anywhere, and the TSC reads are
344 * reliable.
345 *
346 * kHz = ticks / time-in-seconds / 1000;
347 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
348 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
349 */
350 delta = (t2 - t1)*PIT_TICK_RATE;
351 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
352 printk("Fast TSC calibration using PIT\n");
353 return delta;
354 } 352 }
355failed: 353 printk("Fast TSC calibration failed\n");
356 return 0; 354 return 0;
355
356success:
357 /*
358 * Ok, if we get here, then we've seen the
359 * MSB of the PIT decrement 'i' times, and the
360 * error has shrunk to less than 500 ppm.
361 *
362 * As a result, we can depend on there not being
363 * any odd delays anywhere, and the TSC reads are
364 * reliable (within the error). We also adjust the
365 * delta to the middle of the error bars, just
366 * because it looks nicer.
367 *
368 * kHz = ticks / time-in-seconds / 1000;
369 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
370 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
371 */
372 delta += (long)(d2 - d1)/2;
373 delta *= PIT_TICK_RATE;
374 do_div(delta, i*256*1000);
375 printk("Fast TSC calibration using PIT\n");
376 return delta;
357} 377}
358 378
359/** 379/**
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 00000000000..2ffb6c53326
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
1/*
2 * SGI RTC clock/timer routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Dimitri Sivanich
20 */
21#include <linux/clockchips.h>
22
23#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv.h>
27#include <asm/apic.h>
28#include <asm/cpu.h>
29
30#define RTC_NAME "sgi_rtc"
31
32static cycle_t uv_read_rtc(void);
33static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
34static void uv_rtc_timer_setup(enum clock_event_mode,
35 struct clock_event_device *);
36
37static struct clocksource clocksource_uv = {
38 .name = RTC_NAME,
39 .rating = 400,
40 .read = uv_read_rtc,
41 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
42 .shift = 10,
43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
44};
45
46static struct clock_event_device clock_event_device_uv = {
47 .name = RTC_NAME,
48 .features = CLOCK_EVT_FEAT_ONESHOT,
49 .shift = 20,
50 .rating = 400,
51 .irq = -1,
52 .set_next_event = uv_rtc_next_event,
53 .set_mode = uv_rtc_timer_setup,
54 .event_handler = NULL,
55};
56
57static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
58
59/* There is one of these allocated per node */
60struct uv_rtc_timer_head {
61 spinlock_t lock;
62 /* next cpu waiting for timer, local node relative: */
63 int next_cpu;
64 /* number of cpus on this node: */
65 int ncpus;
66 struct {
67 int lcpu; /* systemwide logical cpu number */
68 u64 expires; /* next timer expiration for this cpu */
69 } cpu[1];
70};
71
72/*
73 * Access to uv_rtc_timer_head via blade id.
74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly;
76
77static int uv_rtc_enable;
78
79/*
80 * Hardware interface routines
81 */
82
83/* Send IPIs to another node */
84static void uv_rtc_send_IPI(int cpu)
85{
86 unsigned long apicid, val;
87 int pnode;
88
89 apicid = cpu_physical_id(cpu);
90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96}
97
98/* Check for an RTC interrupt pending */
99static int uv_intr_pending(int pnode)
100{
101 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
102 UVH_EVENT_OCCURRED0_RTC1_MASK;
103}
104
105/* Setup interrupt and return non-zero if early expiration occurred. */
106static int uv_setup_intr(int cpu, u64 expires)
107{
108 u64 val;
109 int pnode = uv_cpu_to_pnode(cpu);
110
111 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
112 UVH_RTC1_INT_CONFIG_M_MASK);
113 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
114
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120
121 /* Set configuration */
122 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125
126 return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
127}
128
129/*
130 * Per-cpu timer tracking routines
131 */
132
133static __init void uv_rtc_deallocate_timers(void)
134{
135 int bid;
136
137 for_each_possible_blade(bid) {
138 kfree(blade_info[bid]);
139 }
140 kfree(blade_info);
141}
142
143/* Allocate per-node list of cpu timer expiration times. */
144static __init int uv_rtc_allocate_timers(void)
145{
146 int cpu;
147
148 blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
149 if (!blade_info)
150 return -ENOMEM;
151 memset(blade_info, 0, uv_possible_blades * sizeof(void *));
152
153 for_each_present_cpu(cpu) {
154 int nid = cpu_to_node(cpu);
155 int bid = uv_cpu_to_blade_id(cpu);
156 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
157 struct uv_rtc_timer_head *head = blade_info[bid];
158
159 if (!head) {
160 head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
161 (uv_blade_nr_possible_cpus(bid) *
162 2 * sizeof(u64)),
163 GFP_KERNEL, nid);
164 if (!head) {
165 uv_rtc_deallocate_timers();
166 return -ENOMEM;
167 }
168 spin_lock_init(&head->lock);
169 head->ncpus = uv_blade_nr_possible_cpus(bid);
170 head->next_cpu = -1;
171 blade_info[bid] = head;
172 }
173
174 head->cpu[bcpu].lcpu = cpu;
175 head->cpu[bcpu].expires = ULLONG_MAX;
176 }
177
178 return 0;
179}
180
181/* Find and set the next expiring timer. */
182static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
183{
184 u64 lowest = ULLONG_MAX;
185 int c, bcpu = -1;
186
187 head->next_cpu = -1;
188 for (c = 0; c < head->ncpus; c++) {
189 u64 exp = head->cpu[c].expires;
190 if (exp < lowest) {
191 bcpu = c;
192 lowest = exp;
193 }
194 }
195 if (bcpu >= 0) {
196 head->next_cpu = bcpu;
197 c = head->cpu[bcpu].lcpu;
198 if (uv_setup_intr(c, lowest))
199 /* If we didn't set it up in time, trigger */
200 uv_rtc_send_IPI(c);
201 } else {
202 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
203 UVH_RTC1_INT_CONFIG_M_MASK);
204 }
205}
206
207/*
208 * Set expiration time for current cpu.
209 *
210 * Returns 1 if we missed the expiration time.
211 */
212static int uv_rtc_set_timer(int cpu, u64 expires)
213{
214 int pnode = uv_cpu_to_pnode(cpu);
215 int bid = uv_cpu_to_blade_id(cpu);
216 struct uv_rtc_timer_head *head = blade_info[bid];
217 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
218 u64 *t = &head->cpu[bcpu].expires;
219 unsigned long flags;
220 int next_cpu;
221
222 spin_lock_irqsave(&head->lock, flags);
223
224 next_cpu = head->next_cpu;
225 *t = expires;
226 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) {
229 head->next_cpu = bcpu;
230 if (uv_setup_intr(cpu, expires)) {
231 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags);
234 return 1;
235 }
236 }
237
238 spin_unlock_irqrestore(&head->lock, flags);
239 return 0;
240}
241
242/*
243 * Unset expiration time for current cpu.
244 *
245 * Returns 1 if this timer was pending.
246 */
247static int uv_rtc_unset_timer(int cpu)
248{
249 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu);
251 struct uv_rtc_timer_head *head = blade_info[bid];
252 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
253 u64 *t = &head->cpu[bcpu].expires;
254 unsigned long flags;
255 int rc = 0;
256
257 spin_lock_irqsave(&head->lock, flags);
258
259 if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
260 rc = 1;
261
262 *t = ULLONG_MAX;
263
264 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode);
267
268 spin_unlock_irqrestore(&head->lock, flags);
269
270 return rc;
271}
272
273
274/*
275 * Kernel interface routines.
276 */
277
278/*
279 * Read the RTC.
280 */
281static cycle_t uv_read_rtc(void)
282{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC);
284}
285
286/*
287 * Program the next event, relative to now
288 */
289static int uv_rtc_next_event(unsigned long delta,
290 struct clock_event_device *ced)
291{
292 int ced_cpu = cpumask_first(ced->cpumask);
293
294 return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
295}
296
297/*
298 * Setup the RTC timer in oneshot mode
299 */
300static void uv_rtc_timer_setup(enum clock_event_mode mode,
301 struct clock_event_device *evt)
302{
303 int ced_cpu = cpumask_first(evt->cpumask);
304
305 switch (mode) {
306 case CLOCK_EVT_MODE_PERIODIC:
307 case CLOCK_EVT_MODE_ONESHOT:
308 case CLOCK_EVT_MODE_RESUME:
309 /* Nothing to do here yet */
310 break;
311 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu);
314 break;
315 }
316}
317
318static void uv_rtc_interrupt(void)
319{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id();
322
323 if (!ced || !ced->event_handler)
324 return;
325
326 if (uv_rtc_unset_timer(cpu) != 1)
327 return;
328
329 ced->event_handler(ced);
330}
331
332static int __init uv_enable_rtc(char *str)
333{
334 uv_rtc_enable = 1;
335
336 return 1;
337}
338__setup("uvrtc", uv_enable_rtc);
339
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{
342 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
343
344 *ced = clock_event_device_uv;
345 ced->cpumask = cpumask_of(smp_processor_id());
346 clockevents_register_device(ced);
347}
348
349static __init int uv_rtc_setup_clock(void)
350{
351 int rc;
352
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
354 return -ENODEV;
355
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift);
360
361 rc = clocksource_register(&clocksource_uv);
362 if (rc) {
363 generic_interrupt_extension = NULL;
364 return rc;
365 }
366
367 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers();
369 if (rc) {
370 clocksource_unregister(&clocksource_uv);
371 generic_interrupt_extension = NULL;
372 return rc;
373 }
374
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift);
377
378 clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
379 sn_rtc_cycles_per_second;
380
381 clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
382 (NSEC_PER_SEC / sn_rtc_cycles_per_second);
383
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) {
386 clocksource_unregister(&clocksource_uv);
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers();
389 }
390
391 return rc;
392}
393arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d860963f26..62ad500d55f 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,15 +189,24 @@ SECTIONS
189 *(.bss) 189 *(.bss)
190 . = ALIGN(4); 190 . = ALIGN(4);
191 __bss_stop = .; 191 __bss_stop = .;
192 _end = . ; 192 }
193 /* This is where the kernel creates the early boot page tables */ 193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
194 . = ALIGN(PAGE_SIZE); 195 . = ALIGN(PAGE_SIZE);
195 pg0 = . ; 196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
196 } 204 }
197 205
198 /* Sections to be discarded */ 206 /* Sections to be discarded */
199 /DISCARD/ : { 207 /DISCARD/ : {
200 *(.exitcall.exit) 208 *(.exitcall.exit)
209 *(.discard)
201 } 210 }
202 211
203 STABS_DEBUG 212 STABS_DEBUG
@@ -205,6 +214,12 @@ SECTIONS
205 DWARF_DEBUG 214 DWARF_DEBUG
206} 215}
207 216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
208#ifdef CONFIG_KEXEC 223#ifdef CONFIG_KEXEC
209/* Link time checks */ 224/* Link time checks */
210#include <asm/kexec.h> 225#include <asm/kexec.h>
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f680..c8742507b03 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -29,8 +29,8 @@ SECTIONS
29{ 29{
30 . = __START_KERNEL; 30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET; 31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 _text = .; /* Text and read-only data */
33 .text : AT(ADDR(.text) - LOAD_OFFSET) { 32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */ 34 /* First the code that has to be first for bootstrapping */
35 *(.text.head) 35 *(.text.head)
36 _stext = .; 36 _stext = .;
@@ -61,13 +61,13 @@ SECTIONS
61 .data : AT(ADDR(.data) - LOAD_OFFSET) { 61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA 62 DATA_DATA
63 CONSTRUCTORS 63 CONSTRUCTORS
64 _edata = .; /* End of data section */
64 } :data 65 } :data
65 66
66 _edata = .; /* End of data section */
67 67
68 . = ALIGN(PAGE_SIZE);
69 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
70 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { 68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned) 71 *(.data.cacheline_aligned)
72 } 72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); 73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
125#undef VVIRT_OFFSET 125#undef VVIRT_OFFSET
126#undef VVIRT 126#undef VVIRT
127 127
128 . = ALIGN(THREAD_SIZE); /* init_task */
129 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task) 130 *(.data.init_task)
131 }:data.init 131 }:data.init
132 132
133 . = ALIGN(PAGE_SIZE);
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned) 135 *(.data.page_aligned)
136 } 136 }
137 137
138 /* might get freed after init */
139 . = ALIGN(PAGE_SIZE);
140 __smp_alt_begin = .;
141 __smp_locks = .;
142 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks) 143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
144 } 147 }
145 __smp_locks_end = .;
146 . = ALIGN(PAGE_SIZE);
147 __smp_alt_end = .;
148 148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */ 149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; 150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .; 152 _sinittext = .;
153 INIT_TEXT 153 INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
159 __initdata_end = .; 159 __initdata_end = .;
160 } 160 }
161 161
162 . = ALIGN(16); 162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 __setup_start = .; 163 . = ALIGN(16);
164 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 164 __setup_start = .;
165 __setup_end = .; 165 *(.init.setup)
166 __initcall_start = .; 166 __setup_end = .;
167 }
167 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
168 INITCALLS 170 INITCALLS
171 __initcall_end = .;
169 } 172 }
170 __initcall_end = .;
171 __con_initcall_start = .;
172 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
173 *(.con_initcall.init) 175 *(.con_initcall.init)
176 __con_initcall_end = .;
174 } 177 }
175 __con_initcall_end = .;
176 __x86_cpu_dev_start = .;
177 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
178 *(.x86_cpu_dev.init) 180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
179 } 182 }
180 __x86_cpu_dev_end = .;
181 SECURITY_INIT 183 SECURITY_INIT
182 184
183 . = ALIGN(8); 185 . = ALIGN(8);
184 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
185 __parainstructions = .; 187 __parainstructions = .;
186 *(.parainstructions) 188 *(.parainstructions)
187 __parainstructions_end = .; 189 __parainstructions_end = .;
188 } 190 }
189 191
190 . = ALIGN(8);
191 __alt_instructions = .;
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
193 *(.altinstructions) 195 *(.altinstructions)
196 __alt_instructions_end = .;
194 } 197 }
195 __alt_instructions_end = .;
196 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
197 *(.altinstr_replacement) 199 *(.altinstr_replacement)
198 } 200 }
@@ -207,9 +209,11 @@ SECTIONS
207 209
208#ifdef CONFIG_BLK_DEV_INITRD 210#ifdef CONFIG_BLK_DEV_INITRD
209 . = ALIGN(PAGE_SIZE); 211 . = ALIGN(PAGE_SIZE);
210 __initramfs_start = .; 212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
211 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 213 __initramfs_start = .;
212 __initramfs_end = .; 214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
213#endif 217#endif
214 218
215#ifdef CONFIG_SMP 219#ifdef CONFIG_SMP
@@ -229,20 +233,29 @@ SECTIONS
229 . = ALIGN(PAGE_SIZE); 233 . = ALIGN(PAGE_SIZE);
230 __init_end = .; 234 __init_end = .;
231 235
232 . = ALIGN(PAGE_SIZE);
233 __nosave_begin = .;
234 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { 236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
235 *(.data.nosave) 237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
236 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ 242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
237 . = ALIGN(PAGE_SIZE);
238 __nosave_end = .;
239 243
240 __bss_start = .; /* BSS */
241 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
242 *(.bss.page_aligned) 247 *(.bss.page_aligned)
243 *(.bss) 248 *(.bss)
244 } 249 __bss_stop = .;
245 __bss_stop = .; 250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
246 259
247 _end = . ; 260 _end = . ;
248 261
@@ -250,6 +263,7 @@ SECTIONS
250 /DISCARD/ : { 263 /DISCARD/ : {
251 *(.exitcall.exit) 264 *(.exitcall.exit)
252 *(.eh_frame) 265 *(.eh_frame)
266 *(.discard)
253 } 267 }
254 268
255 STABS_DEBUG 269 STABS_DEBUG
@@ -275,3 +289,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
275ASSERT((per_cpu__irq_stack_union == 0), 289ASSERT((per_cpu__irq_stack_union == 0),
276 "irq_stack_union is not at start of per-cpu area"); 290 "irq_stack_union is not at start of per-cpu area");
277#endif 291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f3a5305b8ad..90e44a10e68 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -348,6 +348,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
348 * flush_tlb_user() for both user and kernel mappings unless 348 * flush_tlb_user() for both user and kernel mappings unless
349 * the Page Global Enable (PGE) feature bit is set. */ 349 * the Page Global Enable (PGE) feature bit is set. */
350 *dx |= 0x00002000; 350 *dx |= 0x00002000;
351 /* We also lie, and say we're family id 5. 6 or greater
352 * leads to a rdmsr in early_init_intel which we can't handle.
353 * Family ID is returned as bits 8-12 in ax. */
354 *ax &= 0xFFFFF0FF;
355 *ax |= 0x00000500;
351 break; 356 break;
352 case 0x80000000: 357 case 0x80000000:
353 /* Futureproof this a little: if they ask how much extended 358 /* Futureproof this a little: if they ask how much extended
@@ -594,19 +599,21 @@ static void __init lguest_init_IRQ(void)
594 /* Some systems map "vectors" to interrupts weirdly. Lguest has 599 /* Some systems map "vectors" to interrupts weirdly. Lguest has
595 * a straightforward 1 to 1 mapping, so force that here. */ 600 * a straightforward 1 to 1 mapping, so force that here. */
596 __get_cpu_var(vector_irq)[vector] = i; 601 __get_cpu_var(vector_irq)[vector] = i;
597 if (vector != SYSCALL_VECTOR) { 602 if (vector != SYSCALL_VECTOR)
598 set_intr_gate(vector, 603 set_intr_gate(vector, interrupt[i]);
599 interrupt[vector-FIRST_EXTERNAL_VECTOR]);
600 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
601 handle_level_irq,
602 "level");
603 }
604 } 604 }
605 /* This call is required to set up for 4k stacks, where we have 605 /* This call is required to set up for 4k stacks, where we have
606 * separate stacks for hard and soft interrupts. */ 606 * separate stacks for hard and soft interrupts. */
607 irq_ctx_init(smp_processor_id()); 607 irq_ctx_init(smp_processor_id());
608} 608}
609 609
610void lguest_setup_irq(unsigned int irq)
611{
612 irq_to_desc_alloc_cpu(irq, 0);
613 set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
614 handle_level_irq, "level");
615}
616
610/* 617/*
611 * Time. 618 * Time.
612 * 619 *
@@ -1051,14 +1058,6 @@ __init void lguest_init(void)
1051 * lguest_init() where the rest of the fairly chaotic boot setup 1058 * lguest_init() where the rest of the fairly chaotic boot setup
1052 * occurs. */ 1059 * occurs. */
1053 1060
1054 /* The native boot code sets up initial page tables immediately after
1055 * the kernel itself, and sets init_pg_tables_end so they're not
1056 * clobbered. The Launcher places our initial pagetables somewhere at
1057 * the top of our physical memory, so we don't need extra space: set
1058 * init_pg_tables_end to the end of the kernel. */
1059 init_pg_tables_start = __pa(pg0);
1060 init_pg_tables_end = __pa(pg0);
1061
1062 /* As described in head_32.S, we map the first 128M of memory. */ 1061 /* As described in head_32.S, we map the first 128M of memory. */
1063 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1062 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1064 1063
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3..ad5441ed1b5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/dwarf2.h>
6 7
7/* 8/*
8 * memcpy - Copy a memory block. 9 * memcpy - Copy a memory block.
9 * 10 *
10 * Input: 11 * Input:
11 * rdi destination 12 * rdi destination
12 * rsi source 13 * rsi source
13 * rdx count 14 * rdx count
14 * 15 *
15 * Output: 16 * Output:
16 * rax original destination 17 * rax original destination
17 */ 18 */
18 19
20/*
21 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
22 *
23 * Calls to this get patched into the kernel image via the
24 * alternative instructions framework:
25 */
19 ALIGN 26 ALIGN
20memcpy_c: 27memcpy_c:
21 CFI_STARTPROC 28 CFI_STARTPROC
22 movq %rdi,%rax 29 movq %rdi, %rax
23 movl %edx,%ecx 30
24 shrl $3,%ecx 31 movl %edx, %ecx
25 andl $7,%edx 32 shrl $3, %ecx
33 andl $7, %edx
26 rep movsq 34 rep movsq
27 movl %edx,%ecx 35 movl %edx, %ecx
28 rep movsb 36 rep movsb
29 ret 37 ret
30 CFI_ENDPROC 38 CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
33ENTRY(__memcpy) 41ENTRY(__memcpy)
34ENTRY(memcpy) 42ENTRY(memcpy)
35 CFI_STARTPROC 43 CFI_STARTPROC
36 pushq %rbx
37 CFI_ADJUST_CFA_OFFSET 8
38 CFI_REL_OFFSET rbx, 0
39 movq %rdi,%rax
40 44
41 movl %edx,%ecx 45 /*
42 shrl $6,%ecx 46 * Put the number of full 64-byte blocks into %ecx.
47 * Tail portion is handled at the end:
48 */
49 movq %rdi, %rax
50 movl %edx, %ecx
51 shrl $6, %ecx
43 jz .Lhandle_tail 52 jz .Lhandle_tail
44 53
45 .p2align 4 54 .p2align 4
46.Lloop_64: 55.Lloop_64:
56 /*
57 * We decrement the loop index here - and the zero-flag is
58 * checked at the end of the loop (instructions inbetween do
59 * not change the zero flag):
60 */
47 decl %ecx 61 decl %ecx
48 62
49 movq (%rsi),%r11 63 /*
50 movq 8(%rsi),%r8 64 * Move in blocks of 4x16 bytes:
65 */
66 movq 0*8(%rsi), %r11
67 movq 1*8(%rsi), %r8
68 movq %r11, 0*8(%rdi)
69 movq %r8, 1*8(%rdi)
51 70
52 movq %r11,(%rdi) 71 movq 2*8(%rsi), %r9
53 movq %r8,1*8(%rdi) 72 movq 3*8(%rsi), %r10
73 movq %r9, 2*8(%rdi)
74 movq %r10, 3*8(%rdi)
54 75
55 movq 2*8(%rsi),%r9 76 movq 4*8(%rsi), %r11
56 movq 3*8(%rsi),%r10 77 movq 5*8(%rsi), %r8
78 movq %r11, 4*8(%rdi)
79 movq %r8, 5*8(%rdi)
57 80
58 movq %r9,2*8(%rdi) 81 movq 6*8(%rsi), %r9
59 movq %r10,3*8(%rdi) 82 movq 7*8(%rsi), %r10
83 movq %r9, 6*8(%rdi)
84 movq %r10, 7*8(%rdi)
60 85
61 movq 4*8(%rsi),%r11 86 leaq 64(%rsi), %rsi
62 movq 5*8(%rsi),%r8 87 leaq 64(%rdi), %rdi
63 88
64 movq %r11,4*8(%rdi)
65 movq %r8,5*8(%rdi)
66
67 movq 6*8(%rsi),%r9
68 movq 7*8(%rsi),%r10
69
70 movq %r9,6*8(%rdi)
71 movq %r10,7*8(%rdi)
72
73 leaq 64(%rsi),%rsi
74 leaq 64(%rdi),%rdi
75 jnz .Lloop_64 89 jnz .Lloop_64
76 90
77.Lhandle_tail: 91.Lhandle_tail:
78 movl %edx,%ecx 92 movl %edx, %ecx
79 andl $63,%ecx 93 andl $63, %ecx
80 shrl $3,%ecx 94 shrl $3, %ecx
81 jz .Lhandle_7 95 jz .Lhandle_7
96
82 .p2align 4 97 .p2align 4
83.Lloop_8: 98.Lloop_8:
84 decl %ecx 99 decl %ecx
85 movq (%rsi),%r8 100 movq (%rsi), %r8
86 movq %r8,(%rdi) 101 movq %r8, (%rdi)
87 leaq 8(%rdi),%rdi 102 leaq 8(%rdi), %rdi
88 leaq 8(%rsi),%rsi 103 leaq 8(%rsi), %rsi
89 jnz .Lloop_8 104 jnz .Lloop_8
90 105
91.Lhandle_7: 106.Lhandle_7:
92 movl %edx,%ecx 107 movl %edx, %ecx
93 andl $7,%ecx 108 andl $7, %ecx
94 jz .Lende 109 jz .Lend
110
95 .p2align 4 111 .p2align 4
96.Lloop_1: 112.Lloop_1:
97 movb (%rsi),%r8b 113 movb (%rsi), %r8b
98 movb %r8b,(%rdi) 114 movb %r8b, (%rdi)
99 incq %rdi 115 incq %rdi
100 incq %rsi 116 incq %rsi
101 decl %ecx 117 decl %ecx
102 jnz .Lloop_1 118 jnz .Lloop_1
103 119
104.Lende: 120.Lend:
105 popq %rbx
106 CFI_ADJUST_CFA_OFFSET -8
107 CFI_RESTORE rbx
108 ret 121 ret
109.Lfinal:
110 CFI_ENDPROC 122 CFI_ENDPROC
111ENDPROC(memcpy) 123ENDPROC(memcpy)
112ENDPROC(__memcpy) 124ENDPROC(__memcpy)
113 125
114 /* Some CPUs run faster using the string copy instructions. 126 /*
115 It is also a lot simpler. Use this when possible */ 127 * Some CPUs run faster using the string copy instructions.
128 * It is also a lot simpler. Use this when possible:
129 */
116 130
117 .section .altinstr_replacement,"ax" 131 .section .altinstr_replacement, "ax"
1181: .byte 0xeb /* jmp <disp8> */ 1321: .byte 0xeb /* jmp <disp8> */
119 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ 133 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1202: 1342:
121 .previous 135 .previous
122 .section .altinstructions,"a" 136
137 .section .altinstructions, "a"
123 .align 8 138 .align 8
124 .quad memcpy 139 .quad memcpy
125 .quad 1b 140 .quad 1b
126 .byte X86_FEATURE_REP_GOOD 141 .byte X86_FEATURE_REP_GOOD
127 /* Replace only beginning, memcpy is used to apply alternatives, so it 142
128 * is silly to overwrite itself with nops - reboot is only outcome... */ 143 /*
144 * Replace only beginning, memcpy is used to apply alternatives,
145 * so it is silly to overwrite itself with nops - reboot is the
146 * only outcome...
147 */
129 .byte 2b - 1b 148 .byte 2b - 1b
130 .byte 2b - 1b 149 .byte 2b - 1b
131 .previous 150 .previous
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce54..aa098708877 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit(void) 33void finit_task(struct task_struct *tsk)
34{ 34{
35 control_word = 0x037f; 35 struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
36 partial_status = 0; 36 struct address *oaddr, *iaddr;
37 top = 0; /* We don't keep top in the status word internally. */ 37 soft->cwd = 0x037f;
38 fpu_tag_word = 0xffff; 38 soft->swd = 0;
39 soft->ftop = 0; /* We don't keep top in the status word internally. */
40 soft->twd = 0xffff;
39 /* The behaviour is different from that detailed in 41 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */ 42 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0; 43 oaddr = (struct address *)&soft->foo;
42 operand_address.selector = 0; 44 oaddr->offset = 0;
43 instruction_address.offset = 0; 45 oaddr->selector = 0;
44 instruction_address.selector = 0; 46 iaddr = (struct address *)&soft->fip;
45 instruction_address.opcode = 0; 47 iaddr->offset = 0;
46 no_ip_update = 1; 48 iaddr->selector = 0;
49 iaddr->opcode = 0;
50 soft->no_update = 1;
51}
52
53void finit(void)
54{
55 finit_task(current);
47} 56}
48 57
49/* 58/*
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0..522db5e3d0b 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -121,22 +121,13 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
121 pagefault_enable(); 121 pagefault_enable();
122} 122}
123 123
124/* This is the same as kmap_atomic() but can map memory that doesn't 124/*
125 * This is the same as kmap_atomic() but can map memory that doesn't
125 * have a struct page associated with it. 126 * have a struct page associated with it.
126 */ 127 */
127void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) 128void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
128{ 129{
129 enum fixed_addresses idx; 130 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
130 unsigned long vaddr;
131
132 pagefault_disable();
133
134 idx = type + KM_TYPE_NR*smp_processor_id();
135 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
136 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
137 arch_flush_lazy_mmu_mode();
138
139 return (void*) vaddr;
140} 131}
141EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ 132EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
142 133
@@ -158,7 +149,6 @@ EXPORT_SYMBOL(kunmap);
158EXPORT_SYMBOL(kmap_atomic); 149EXPORT_SYMBOL(kmap_atomic);
159EXPORT_SYMBOL(kunmap_atomic); 150EXPORT_SYMBOL(kunmap_atomic);
160 151
161#ifdef CONFIG_NUMA
162void __init set_highmem_pages_init(void) 152void __init set_highmem_pages_init(void)
163{ 153{
164 struct zone *zone; 154 struct zone *zone;
@@ -182,11 +172,3 @@ void __init set_highmem_pages_init(void)
182 } 172 }
183 totalram_pages += totalhigh_pages; 173 totalram_pages += totalhigh_pages;
184} 174}
185#else
186void __init set_highmem_pages_init(void)
187{
188 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
189
190 totalram_pages += totalhigh_pages;
191}
192#endif /* CONFIG_NUMA */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722587d..fd3da1dda1c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,8 +1,345 @@
1#include <linux/ioport.h>
1#include <linux/swap.h> 2#include <linux/swap.h>
3
2#include <asm/cacheflush.h> 4#include <asm/cacheflush.h>
5#include <asm/e820.h>
6#include <asm/init.h>
3#include <asm/page.h> 7#include <asm/page.h>
8#include <asm/page_types.h>
4#include <asm/sections.h> 9#include <asm/sections.h>
5#include <asm/system.h> 10#include <asm/system.h>
11#include <asm/tlbflush.h>
12
13unsigned long __initdata e820_table_start;
14unsigned long __meminitdata e820_table_end;
15unsigned long __meminitdata e820_table_top;
16
17int after_bootmem;
18
19int direct_gbpages
20#ifdef CONFIG_DIRECT_GBPAGES
21 = 1
22#endif
23;
24
25static void __init find_early_table_space(unsigned long end, int use_pse,
26 int use_gbpages)
27{
28 unsigned long puds, pmds, ptes, tables, start;
29
30 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
31 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
32
33 if (use_gbpages) {
34 unsigned long extra;
35
36 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
37 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
38 } else
39 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
40
41 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
42
43 if (use_pse) {
44 unsigned long extra;
45
46 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
47#ifdef CONFIG_X86_32
48 extra += PMD_SIZE;
49#endif
50 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
51 } else
52 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
53
54 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
55
56#ifdef CONFIG_X86_32
57 /* for fixmap */
58 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
59#endif
60
61 /*
62 * RED-PEN putting page tables only on node 0 could
63 * cause a hotspot and fill up ZONE_DMA. The page tables
64 * need roughly 0.5KB per GB.
65 */
66#ifdef CONFIG_X86_32
67 start = 0x7000;
68 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
69 tables, PAGE_SIZE);
70#else /* CONFIG_X86_64 */
71 start = 0x8000;
72 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
73#endif
74 if (e820_table_start == -1UL)
75 panic("Cannot find space for the kernel page tables");
76
77 e820_table_start >>= PAGE_SHIFT;
78 e820_table_end = e820_table_start;
79 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
80
81 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
82 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
83}
84
85struct map_range {
86 unsigned long start;
87 unsigned long end;
88 unsigned page_size_mask;
89};
90
91#ifdef CONFIG_X86_32
92#define NR_RANGE_MR 3
93#else /* CONFIG_X86_64 */
94#define NR_RANGE_MR 5
95#endif
96
97static int __meminit save_mr(struct map_range *mr, int nr_range,
98 unsigned long start_pfn, unsigned long end_pfn,
99 unsigned long page_size_mask)
100{
101 if (start_pfn < end_pfn) {
102 if (nr_range >= NR_RANGE_MR)
103 panic("run out of range for init_memory_mapping\n");
104 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
105 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
106 mr[nr_range].page_size_mask = page_size_mask;
107 nr_range++;
108 }
109
110 return nr_range;
111}
112
113#ifdef CONFIG_X86_64
114static void __init init_gbpages(void)
115{
116 if (direct_gbpages && cpu_has_gbpages)
117 printk(KERN_INFO "Using GB pages for direct mapping\n");
118 else
119 direct_gbpages = 0;
120}
121#else
122static inline void init_gbpages(void)
123{
124}
125#endif
126
127/*
128 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
129 * This runs before bootmem is initialized and gets pages directly from
130 * the physical memory. To access them they are temporarily mapped.
131 */
132unsigned long __init_refok init_memory_mapping(unsigned long start,
133 unsigned long end)
134{
135 unsigned long page_size_mask = 0;
136 unsigned long start_pfn, end_pfn;
137 unsigned long ret = 0;
138 unsigned long pos;
139
140 struct map_range mr[NR_RANGE_MR];
141 int nr_range, i;
142 int use_pse, use_gbpages;
143
144 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
145
146 if (!after_bootmem)
147 init_gbpages();
148
149#ifdef CONFIG_DEBUG_PAGEALLOC
150 /*
151 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
152 * This will simplify cpa(), which otherwise needs to support splitting
153 * large pages into small in interrupt context, etc.
154 */
155 use_pse = use_gbpages = 0;
156#else
157 use_pse = cpu_has_pse;
158 use_gbpages = direct_gbpages;
159#endif
160
161#ifdef CONFIG_X86_32
162#ifdef CONFIG_X86_PAE
163 set_nx();
164 if (nx_enabled)
165 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
166#endif
167
168 /* Enable PSE if available */
169 if (cpu_has_pse)
170 set_in_cr4(X86_CR4_PSE);
171
172 /* Enable PGE if available */
173 if (cpu_has_pge) {
174 set_in_cr4(X86_CR4_PGE);
175 __supported_pte_mask |= _PAGE_GLOBAL;
176 }
177#endif
178
179 if (use_gbpages)
180 page_size_mask |= 1 << PG_LEVEL_1G;
181 if (use_pse)
182 page_size_mask |= 1 << PG_LEVEL_2M;
183
184 memset(mr, 0, sizeof(mr));
185 nr_range = 0;
186
187 /* head if not big page alignment ? */
188 start_pfn = start >> PAGE_SHIFT;
189 pos = start_pfn << PAGE_SHIFT;
190#ifdef CONFIG_X86_32
191 /*
192 * Don't use a large page for the first 2/4MB of memory
193 * because there are often fixed size MTRRs in there
194 * and overlapping MTRRs into large pages can cause
195 * slowdowns.
196 */
197 if (pos == 0)
198 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
199 else
200 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
201 << (PMD_SHIFT - PAGE_SHIFT);
202#else /* CONFIG_X86_64 */
203 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
204 << (PMD_SHIFT - PAGE_SHIFT);
205#endif
206 if (end_pfn > (end >> PAGE_SHIFT))
207 end_pfn = end >> PAGE_SHIFT;
208 if (start_pfn < end_pfn) {
209 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
210 pos = end_pfn << PAGE_SHIFT;
211 }
212
213 /* big page (2M) range */
214 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
215 << (PMD_SHIFT - PAGE_SHIFT);
216#ifdef CONFIG_X86_32
217 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
218#else /* CONFIG_X86_64 */
219 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
220 << (PUD_SHIFT - PAGE_SHIFT);
221 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
222 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
223#endif
224
225 if (start_pfn < end_pfn) {
226 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
227 page_size_mask & (1<<PG_LEVEL_2M));
228 pos = end_pfn << PAGE_SHIFT;
229 }
230
231#ifdef CONFIG_X86_64
232 /* big page (1G) range */
233 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
234 << (PUD_SHIFT - PAGE_SHIFT);
235 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
236 if (start_pfn < end_pfn) {
237 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
238 page_size_mask &
239 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
240 pos = end_pfn << PAGE_SHIFT;
241 }
242
243 /* tail is not big page (1G) alignment */
244 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
245 << (PMD_SHIFT - PAGE_SHIFT);
246 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
247 if (start_pfn < end_pfn) {
248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
249 page_size_mask & (1<<PG_LEVEL_2M));
250 pos = end_pfn << PAGE_SHIFT;
251 }
252#endif
253
254 /* tail is not big page (2M) alignment */
255 start_pfn = pos>>PAGE_SHIFT;
256 end_pfn = end>>PAGE_SHIFT;
257 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
258
259 /* try to merge same page size and continuous */
260 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
261 unsigned long old_start;
262 if (mr[i].end != mr[i+1].start ||
263 mr[i].page_size_mask != mr[i+1].page_size_mask)
264 continue;
265 /* move it */
266 old_start = mr[i].start;
267 memmove(&mr[i], &mr[i+1],
268 (nr_range - 1 - i) * sizeof(struct map_range));
269 mr[i--].start = old_start;
270 nr_range--;
271 }
272
273 for (i = 0; i < nr_range; i++)
274 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
275 mr[i].start, mr[i].end,
276 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
277 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
278
279 /*
280 * Find space for the kernel direct mapping tables.
281 *
282 * Later we should allocate these tables in the local node of the
283 * memory mapped. Unfortunately this is done currently before the
284 * nodes are discovered.
285 */
286 if (!after_bootmem)
287 find_early_table_space(end, use_pse, use_gbpages);
288
289#ifdef CONFIG_X86_32
290 for (i = 0; i < nr_range; i++)
291 kernel_physical_mapping_init(mr[i].start, mr[i].end,
292 mr[i].page_size_mask);
293 ret = end;
294#else /* CONFIG_X86_64 */
295 for (i = 0; i < nr_range; i++)
296 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
297 mr[i].page_size_mask);
298#endif
299
300#ifdef CONFIG_X86_32
301 early_ioremap_page_table_range_init();
302
303 load_cr3(swapper_pg_dir);
304#endif
305
306#ifdef CONFIG_X86_64
307 if (!after_bootmem)
308 mmu_cr4_features = read_cr4();
309#endif
310 __flush_tlb_all();
311
312 if (!after_bootmem && e820_table_end > e820_table_start)
313 reserve_early(e820_table_start << PAGE_SHIFT,
314 e820_table_end << PAGE_SHIFT, "PGTABLE");
315
316 if (!after_bootmem)
317 early_memtest(start, end);
318
319 return ret >> PAGE_SHIFT;
320}
321
322
323/*
324 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
325 * is valid. The argument is a physical page number.
326 *
327 *
328 * On x86, access has to be given to the first megabyte of ram because that area
329 * contains bios code and data regions used by X and dosemu and similar apps.
330 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
331 * mmio resources as well as potential bios/acpi data regions.
332 */
333int devmem_is_allowed(unsigned long pagenr)
334{
335 if (pagenr <= 256)
336 return 1;
337 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
338 return 0;
339 if (!page_is_ram(pagenr))
340 return 1;
341 return 0;
342}
6 343
7void free_init_pages(char *what, unsigned long begin, unsigned long end) 344void free_init_pages(char *what, unsigned long begin, unsigned long end)
8{ 345{
@@ -47,3 +384,10 @@ void free_initmem(void)
47 (unsigned long)(&__init_begin), 384 (unsigned long)(&__init_begin),
48 (unsigned long)(&__init_end)); 385 (unsigned long)(&__init_end));
49} 386}
387
388#ifdef CONFIG_BLK_DEV_INITRD
389void free_initrd_mem(unsigned long start, unsigned long end)
390{
391 free_init_pages("initrd memory", start, end);
392}
393#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0b087dcd2c1..db81e9a8556 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/init.h>
52 53
53unsigned long max_low_pfn_mapped; 54unsigned long max_low_pfn_mapped;
54unsigned long max_pfn_mapped; 55unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
58 59
59static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
60 61
61 62bool __read_mostly __vmalloc_start_set = false;
62static unsigned long __initdata table_start;
63static unsigned long __meminitdata table_end;
64static unsigned long __meminitdata table_top;
65
66static int __initdata after_init_bootmem;
67 63
68static __init void *alloc_low_page(void) 64static __init void *alloc_low_page(void)
69{ 65{
70 unsigned long pfn = table_end++; 66 unsigned long pfn = e820_table_end++;
71 void *adr; 67 void *adr;
72 68
73 if (pfn >= table_top) 69 if (pfn >= e820_table_top)
74 panic("alloc_low_page: ran out of memory"); 70 panic("alloc_low_page: ran out of memory");
75 71
76 adr = __va(pfn * PAGE_SIZE); 72 adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
90 86
91#ifdef CONFIG_X86_PAE 87#ifdef CONFIG_X86_PAE
92 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 88 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
93 if (after_init_bootmem) 89 if (after_bootmem)
94 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 90 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
95 else 91 else
96 pmd_table = (pmd_t *)alloc_low_page(); 92 pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
117 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 113 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
118 pte_t *page_table = NULL; 114 pte_t *page_table = NULL;
119 115
120 if (after_init_bootmem) { 116 if (after_bootmem) {
121#ifdef CONFIG_DEBUG_PAGEALLOC 117#ifdef CONFIG_DEBUG_PAGEALLOC
122 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
123#endif 119#endif
@@ -135,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
135 return pte_offset_kernel(pmd, 0); 131 return pte_offset_kernel(pmd, 0);
136} 132}
137 133
134pmd_t * __init populate_extra_pmd(unsigned long vaddr)
135{
136 int pgd_idx = pgd_index(vaddr);
137 int pmd_idx = pmd_index(vaddr);
138
139 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
140}
141
142pte_t * __init populate_extra_pte(unsigned long vaddr)
143{
144 int pte_idx = pte_index(vaddr);
145 pmd_t *pmd;
146
147 pmd = populate_extra_pmd(vaddr);
148 return one_page_table_init(pmd) + pte_idx;
149}
150
138static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 151static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
139 unsigned long vaddr, pte_t *lastpte) 152 unsigned long vaddr, pte_t *lastpte)
140{ 153{
@@ -151,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
151 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 164 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
152 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 165 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
153 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 166 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
154 && ((__pa(pte) >> PAGE_SHIFT) < table_start 167 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
155 || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { 168 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
156 pte_t *newpte; 169 pte_t *newpte;
157 int i; 170 int i;
158 171
159 BUG_ON(after_init_bootmem); 172 BUG_ON(after_bootmem);
160 newpte = alloc_low_page(); 173 newpte = alloc_low_page();
161 for (i = 0; i < PTRS_PER_PTE; i++) 174 for (i = 0; i < PTRS_PER_PTE; i++)
162 set_pte(newpte + i, pte[i]); 175 set_pte(newpte + i, pte[i]);
@@ -225,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
225 * of max_low_pfn pages, by creating page tables starting from address 238 * of max_low_pfn pages, by creating page tables starting from address
226 * PAGE_OFFSET: 239 * PAGE_OFFSET:
227 */ 240 */
228static void __init kernel_physical_mapping_init(pgd_t *pgd_base, 241unsigned long __init
229 unsigned long start_pfn, 242kernel_physical_mapping_init(unsigned long start,
230 unsigned long end_pfn, 243 unsigned long end,
231 int use_pse) 244 unsigned long page_size_mask)
232{ 245{
246 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
247 unsigned long start_pfn, end_pfn;
248 pgd_t *pgd_base = swapper_pg_dir;
233 int pgd_idx, pmd_idx, pte_ofs; 249 int pgd_idx, pmd_idx, pte_ofs;
234 unsigned long pfn; 250 unsigned long pfn;
235 pgd_t *pgd; 251 pgd_t *pgd;
@@ -238,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
238 unsigned pages_2m, pages_4k; 254 unsigned pages_2m, pages_4k;
239 int mapping_iter; 255 int mapping_iter;
240 256
257 start_pfn = start >> PAGE_SHIFT;
258 end_pfn = end >> PAGE_SHIFT;
259
241 /* 260 /*
242 * First iteration will setup identity mapping using large/small pages 261 * First iteration will setup identity mapping using large/small pages
243 * based on use_pse, with other attributes same as set by 262 * based on use_pse, with other attributes same as set by
@@ -352,26 +371,6 @@ repeat:
352 mapping_iter = 2; 371 mapping_iter = 2;
353 goto repeat; 372 goto repeat;
354 } 373 }
355}
356
357/*
358 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
359 * is valid. The argument is a physical page number.
360 *
361 *
362 * On x86, access has to be given to the first megabyte of ram because that area
363 * contains bios code and data regions used by X and dosemu and similar apps.
364 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
365 * mmio resources as well as potential bios/acpi data regions.
366 */
367int devmem_is_allowed(unsigned long pagenr)
368{
369 if (pagenr <= 256)
370 return 1;
371 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
372 return 0;
373 if (!page_is_ram(pagenr))
374 return 1;
375 return 0; 374 return 0;
376} 375}
377 376
@@ -528,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
528 * be partially populated, and so it avoids stomping on any existing 527 * be partially populated, and so it avoids stomping on any existing
529 * mappings. 528 * mappings.
530 */ 529 */
531static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) 530void __init early_ioremap_page_table_range_init(void)
532{ 531{
532 pgd_t *pgd_base = swapper_pg_dir;
533 unsigned long vaddr, end; 533 unsigned long vaddr, end;
534 534
535 /* 535 /*
@@ -624,7 +624,7 @@ static int __init noexec_setup(char *str)
624} 624}
625early_param("noexec", noexec_setup); 625early_param("noexec", noexec_setup);
626 626
627static void __init set_nx(void) 627void __init set_nx(void)
628{ 628{
629 unsigned int v[4], l, h; 629 unsigned int v[4], l, h;
630 630
@@ -776,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
776#ifdef CONFIG_FLATMEM 776#ifdef CONFIG_FLATMEM
777 max_mapnr = num_physpages; 777 max_mapnr = num_physpages;
778#endif 778#endif
779 __vmalloc_start_set = true;
780
779 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 781 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
780 pages_to_mb(max_low_pfn)); 782 pages_to_mb(max_low_pfn));
781 783
@@ -797,176 +799,66 @@ static void __init zone_sizes_init(void)
797 free_area_init_nodes(max_zone_pfns); 799 free_area_init_nodes(max_zone_pfns);
798} 800}
799 801
802static unsigned long __init setup_node_bootmem(int nodeid,
803 unsigned long start_pfn,
804 unsigned long end_pfn,
805 unsigned long bootmap)
806{
807 unsigned long bootmap_size;
808
809 /* don't touch min_low_pfn */
810 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
811 bootmap >> PAGE_SHIFT,
812 start_pfn, end_pfn);
813 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
814 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
815 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
816 nodeid, bootmap, bootmap + bootmap_size);
817 free_bootmem_with_active_regions(nodeid, end_pfn);
818 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
819
820 return bootmap + bootmap_size;
821}
822
800void __init setup_bootmem_allocator(void) 823void __init setup_bootmem_allocator(void)
801{ 824{
802 int i; 825 int nodeid;
803 unsigned long bootmap_size, bootmap; 826 unsigned long bootmap_size, bootmap;
804 /* 827 /*
805 * Initialize the boot-time allocator (with low memory only): 828 * Initialize the boot-time allocator (with low memory only):
806 */ 829 */
807 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; 830 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
808 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, 831 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
809 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
810 PAGE_SIZE); 832 PAGE_SIZE);
811 if (bootmap == -1L) 833 if (bootmap == -1L)
812 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 834 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
813 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 835 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
814 836
815 /* don't touch min_low_pfn */
816 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
817 min_low_pfn, max_low_pfn);
818 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 837 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
819 max_pfn_mapped<<PAGE_SHIFT); 838 max_pfn_mapped<<PAGE_SHIFT);
820 printk(KERN_INFO " low ram: %08lx - %08lx\n", 839 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
821 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
822 printk(KERN_INFO " bootmap %08lx - %08lx\n",
823 bootmap, bootmap + bootmap_size);
824 for_each_online_node(i)
825 free_bootmem_with_active_regions(i, max_low_pfn);
826 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
827
828 after_init_bootmem = 1;
829}
830
831static void __init find_early_table_space(unsigned long end, int use_pse)
832{
833 unsigned long puds, pmds, ptes, tables, start;
834
835 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
836 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
837
838 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
839 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
840
841 if (use_pse) {
842 unsigned long extra;
843 840
844 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 841 for_each_online_node(nodeid) {
845 extra += PMD_SIZE; 842 unsigned long start_pfn, end_pfn;
846 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
847 } else
848 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
849 843
850 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); 844#ifdef CONFIG_NEED_MULTIPLE_NODES
851 845 start_pfn = node_start_pfn[nodeid];
852 /* for fixmap */ 846 end_pfn = node_end_pfn[nodeid];
853 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 847 if (start_pfn > max_low_pfn)
854 848 continue;
855 /* 849 if (end_pfn > max_low_pfn)
856 * RED-PEN putting page tables only on node 0 could 850 end_pfn = max_low_pfn;
857 * cause a hotspot and fill up ZONE_DMA. The page tables
858 * need roughly 0.5KB per GB.
859 */
860 start = 0x7000;
861 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
862 tables, PAGE_SIZE);
863 if (table_start == -1UL)
864 panic("Cannot find space for the kernel page tables");
865
866 table_start >>= PAGE_SHIFT;
867 table_end = table_start;
868 table_top = table_start + (tables>>PAGE_SHIFT);
869
870 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
871 end, table_start << PAGE_SHIFT,
872 (table_start << PAGE_SHIFT) + tables);
873}
874
875unsigned long __init_refok init_memory_mapping(unsigned long start,
876 unsigned long end)
877{
878 pgd_t *pgd_base = swapper_pg_dir;
879 unsigned long start_pfn, end_pfn;
880 unsigned long big_page_start;
881#ifdef CONFIG_DEBUG_PAGEALLOC
882 /*
883 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
884 * This will simplify cpa(), which otherwise needs to support splitting
885 * large pages into small in interrupt context, etc.
886 */
887 int use_pse = 0;
888#else 851#else
889 int use_pse = cpu_has_pse; 852 start_pfn = 0;
890#endif 853 end_pfn = max_low_pfn;
891
892 /*
893 * Find space for the kernel direct mapping tables.
894 */
895 if (!after_init_bootmem)
896 find_early_table_space(end, use_pse);
897
898#ifdef CONFIG_X86_PAE
899 set_nx();
900 if (nx_enabled)
901 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
902#endif 854#endif
903 855 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
904 /* Enable PSE if available */ 856 bootmap);
905 if (cpu_has_pse)
906 set_in_cr4(X86_CR4_PSE);
907
908 /* Enable PGE if available */
909 if (cpu_has_pge) {
910 set_in_cr4(X86_CR4_PGE);
911 __supported_pte_mask |= _PAGE_GLOBAL;
912 }
913
914 /*
915 * Don't use a large page for the first 2/4MB of memory
916 * because there are often fixed size MTRRs in there
917 * and overlapping MTRRs into large pages can cause
918 * slowdowns.
919 */
920 big_page_start = PMD_SIZE;
921
922 if (start < big_page_start) {
923 start_pfn = start >> PAGE_SHIFT;
924 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
925 } else {
926 /* head is not big page alignment ? */
927 start_pfn = start >> PAGE_SHIFT;
928 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
929 << (PMD_SHIFT - PAGE_SHIFT);
930 }
931 if (start_pfn < end_pfn)
932 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
933
934 /* big page range */
935 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
936 << (PMD_SHIFT - PAGE_SHIFT);
937 if (start_pfn < (big_page_start >> PAGE_SHIFT))
938 start_pfn = big_page_start >> PAGE_SHIFT;
939 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
940 if (start_pfn < end_pfn)
941 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
942 use_pse);
943
944 /* tail is not big page alignment ? */
945 start_pfn = end_pfn;
946 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
947 end_pfn = end >> PAGE_SHIFT;
948 if (start_pfn < end_pfn)
949 kernel_physical_mapping_init(pgd_base, start_pfn,
950 end_pfn, 0);
951 } 857 }
952 858
953 early_ioremap_page_table_range_init(pgd_base); 859 after_bootmem = 1;
954
955 load_cr3(swapper_pg_dir);
956
957 __flush_tlb_all();
958
959 if (!after_init_bootmem)
960 reserve_early(table_start << PAGE_SHIFT,
961 table_end << PAGE_SHIFT, "PGTABLE");
962
963 if (!after_init_bootmem)
964 early_memtest(start, end);
965
966 return end >> PAGE_SHIFT;
967} 860}
968 861
969
970/* 862/*
971 * paging_init() sets up the page tables - note that the first 8MB are 863 * paging_init() sets up the page tables - note that the first 8MB are
972 * already mapped by head.S. 864 * already mapped by head.S.
@@ -1200,13 +1092,6 @@ void mark_rodata_ro(void)
1200} 1092}
1201#endif 1093#endif
1202 1094
1203#ifdef CONFIG_BLK_DEV_INITRD
1204void free_initrd_mem(unsigned long start, unsigned long end)
1205{
1206 free_init_pages("initrd memory", start, end);
1207}
1208#endif
1209
1210int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 1095int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1211 int flags) 1096 int flags)
1212{ 1097{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 724e537432e..54efa57d1c0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
48#include <asm/kdebug.h> 48#include <asm/kdebug.h>
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h>
51 52
52/* 53/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
61 62
62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
63 64
64int direct_gbpages
65#ifdef CONFIG_DIRECT_GBPAGES
66 = 1
67#endif
68;
69
70static int __init parse_direct_gbpages_off(char *arg) 65static int __init parse_direct_gbpages_off(char *arg)
71{ 66{
72 direct_gbpages = 0; 67 direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
87 * around without checking the pgd every time. 82 * around without checking the pgd every time.
88 */ 83 */
89 84
90int after_bootmem;
91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask); 86EXPORT_SYMBOL_GPL(__supported_pte_mask);
94 87
95static int do_not_nx __cpuinitdata; 88static int disable_nx __cpuinitdata;
96 89
97/* 90/*
98 * noexec=on|off 91 * noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
107 return -EINVAL; 100 return -EINVAL;
108 if (!strncmp(str, "on", 2)) { 101 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX; 102 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0; 103 disable_nx = 0;
111 } else if (!strncmp(str, "off", 3)) { 104 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1; 105 disable_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX; 106 __supported_pte_mask &= ~_PAGE_NX;
114 } 107 }
115 return 0; 108 return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
121 unsigned long efer; 114 unsigned long efer;
122 115
123 rdmsrl(MSR_EFER, efer); 116 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx) 117 if (!(efer & EFER_NX) || disable_nx)
125 __supported_pte_mask &= ~_PAGE_NX; 118 __supported_pte_mask &= ~_PAGE_NX;
126} 119}
127 120
@@ -168,34 +161,51 @@ static __ref void *spp_getpage(void)
168 return ptr; 161 return ptr;
169} 162}
170 163
171void 164static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
173{ 165{
174 pud_t *pud; 166 if (pgd_none(*pgd)) {
175 pmd_t *pmd; 167 pud_t *pud = (pud_t *)spp_getpage();
176 pte_t *pte; 168 pgd_populate(&init_mm, pgd, pud);
169 if (pud != pud_offset(pgd, 0))
170 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
171 pud, pud_offset(pgd, 0));
172 }
173 return pud_offset(pgd, vaddr);
174}
177 175
178 pud = pud_page + pud_index(vaddr); 176static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
177{
179 if (pud_none(*pud)) { 178 if (pud_none(*pud)) {
180 pmd = (pmd_t *) spp_getpage(); 179 pmd_t *pmd = (pmd_t *) spp_getpage();
181 pud_populate(&init_mm, pud, pmd); 180 pud_populate(&init_mm, pud, pmd);
182 if (pmd != pmd_offset(pud, 0)) { 181 if (pmd != pmd_offset(pud, 0))
183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 182 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
184 pmd, pmd_offset(pud, 0)); 183 pmd, pmd_offset(pud, 0));
185 return;
186 }
187 } 184 }
188 pmd = pmd_offset(pud, vaddr); 185 return pmd_offset(pud, vaddr);
186}
187
188static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
189{
189 if (pmd_none(*pmd)) { 190 if (pmd_none(*pmd)) {
190 pte = (pte_t *) spp_getpage(); 191 pte_t *pte = (pte_t *) spp_getpage();
191 pmd_populate_kernel(&init_mm, pmd, pte); 192 pmd_populate_kernel(&init_mm, pmd, pte);
192 if (pte != pte_offset_kernel(pmd, 0)) { 193 if (pte != pte_offset_kernel(pmd, 0))
193 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 194 printk(KERN_ERR "PAGETABLE BUG #02!\n");
194 return;
195 }
196 } 195 }
196 return pte_offset_kernel(pmd, vaddr);
197}
198
199void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
200{
201 pud_t *pud;
202 pmd_t *pmd;
203 pte_t *pte;
204
205 pud = pud_page + pud_index(vaddr);
206 pmd = fill_pmd(pud, vaddr);
207 pte = fill_pte(pmd, vaddr);
197 208
198 pte = pte_offset_kernel(pmd, vaddr);
199 set_pte(pte, new_pte); 209 set_pte(pte, new_pte);
200 210
201 /* 211 /*
@@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
205 __flush_tlb_one(vaddr); 215 __flush_tlb_one(vaddr);
206} 216}
207 217
208void 218void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{ 219{
211 pgd_t *pgd; 220 pgd_t *pgd;
212 pud_t *pud_page; 221 pud_t *pud_page;
@@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
223 set_pte_vaddr_pud(pud_page, vaddr, pteval); 232 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224} 233}
225 234
235pmd_t * __init populate_extra_pmd(unsigned long vaddr)
236{
237 pgd_t *pgd;
238 pud_t *pud;
239
240 pgd = pgd_offset_k(vaddr);
241 pud = fill_pud(pgd, vaddr);
242 return fill_pmd(pud, vaddr);
243}
244
245pte_t * __init populate_extra_pte(unsigned long vaddr)
246{
247 pmd_t *pmd;
248
249 pmd = populate_extra_pmd(vaddr);
250 return fill_pte(pmd, vaddr);
251}
252
226/* 253/*
227 * Create large page table mappings for a range of physical addresses. 254 * Create large page table mappings for a range of physical addresses.
228 */ 255 */
@@ -291,13 +318,9 @@ void __init cleanup_highmap(void)
291 } 318 }
292} 319}
293 320
294static unsigned long __initdata table_start;
295static unsigned long __meminitdata table_end;
296static unsigned long __meminitdata table_top;
297
298static __ref void *alloc_low_page(unsigned long *phys) 321static __ref void *alloc_low_page(unsigned long *phys)
299{ 322{
300 unsigned long pfn = table_end++; 323 unsigned long pfn = e820_table_end++;
301 void *adr; 324 void *adr;
302 325
303 if (after_bootmem) { 326 if (after_bootmem) {
@@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
307 return adr; 330 return adr;
308 } 331 }
309 332
310 if (pfn >= table_top) 333 if (pfn >= e820_table_top)
311 panic("alloc_low_page: ran out of memory"); 334 panic("alloc_low_page: ran out of memory");
312 335
313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 336 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
547 return phys_pud_init(pud, addr, end, page_size_mask); 570 return phys_pud_init(pud, addr, end, page_size_mask);
548} 571}
549 572
550static void __init find_early_table_space(unsigned long end, int use_pse, 573unsigned long __init
551 int use_gbpages) 574kernel_physical_mapping_init(unsigned long start,
552{ 575 unsigned long end,
553 unsigned long puds, pmds, ptes, tables, start; 576 unsigned long page_size_mask)
554
555 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
556 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
557 if (use_gbpages) {
558 unsigned long extra;
559 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
560 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
561 } else
562 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
563 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
564
565 if (use_pse) {
566 unsigned long extra;
567 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
568 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
569 } else
570 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
571 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
572
573 /*
574 * RED-PEN putting page tables only on node 0 could
575 * cause a hotspot and fill up ZONE_DMA. The page tables
576 * need roughly 0.5KB per GB.
577 */
578 start = 0x8000;
579 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
580 if (table_start == -1UL)
581 panic("Cannot find space for the kernel page tables");
582
583 table_start >>= PAGE_SHIFT;
584 table_end = table_start;
585 table_top = table_start + (tables >> PAGE_SHIFT);
586
587 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
588 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
589}
590
591static void __init init_gbpages(void)
592{
593 if (direct_gbpages && cpu_has_gbpages)
594 printk(KERN_INFO "Using GB pages for direct mapping\n");
595 else
596 direct_gbpages = 0;
597}
598
599static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
600 unsigned long end,
601 unsigned long page_size_mask)
602{ 577{
603 578
604 unsigned long next, last_map_addr = end; 579 unsigned long next, last_map_addr = end;
@@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
635 return last_map_addr; 610 return last_map_addr;
636} 611}
637 612
638struct map_range {
639 unsigned long start;
640 unsigned long end;
641 unsigned page_size_mask;
642};
643
644#define NR_RANGE_MR 5
645
646static int save_mr(struct map_range *mr, int nr_range,
647 unsigned long start_pfn, unsigned long end_pfn,
648 unsigned long page_size_mask)
649{
650
651 if (start_pfn < end_pfn) {
652 if (nr_range >= NR_RANGE_MR)
653 panic("run out of range for init_memory_mapping\n");
654 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
655 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
656 mr[nr_range].page_size_mask = page_size_mask;
657 nr_range++;
658 }
659
660 return nr_range;
661}
662
663/*
664 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
665 * This runs before bootmem is initialized and gets pages directly from
666 * the physical memory. To access them they are temporarily mapped.
667 */
668unsigned long __init_refok init_memory_mapping(unsigned long start,
669 unsigned long end)
670{
671 unsigned long last_map_addr = 0;
672 unsigned long page_size_mask = 0;
673 unsigned long start_pfn, end_pfn;
674 unsigned long pos;
675
676 struct map_range mr[NR_RANGE_MR];
677 int nr_range, i;
678 int use_pse, use_gbpages;
679
680 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
681
682 /*
683 * Find space for the kernel direct mapping tables.
684 *
685 * Later we should allocate these tables in the local node of the
686 * memory mapped. Unfortunately this is done currently before the
687 * nodes are discovered.
688 */
689 if (!after_bootmem)
690 init_gbpages();
691
692#ifdef CONFIG_DEBUG_PAGEALLOC
693 /*
694 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
695 * This will simplify cpa(), which otherwise needs to support splitting
696 * large pages into small in interrupt context, etc.
697 */
698 use_pse = use_gbpages = 0;
699#else
700 use_pse = cpu_has_pse;
701 use_gbpages = direct_gbpages;
702#endif
703
704 if (use_gbpages)
705 page_size_mask |= 1 << PG_LEVEL_1G;
706 if (use_pse)
707 page_size_mask |= 1 << PG_LEVEL_2M;
708
709 memset(mr, 0, sizeof(mr));
710 nr_range = 0;
711
712 /* head if not big page alignment ?*/
713 start_pfn = start >> PAGE_SHIFT;
714 pos = start_pfn << PAGE_SHIFT;
715 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
716 << (PMD_SHIFT - PAGE_SHIFT);
717 if (end_pfn > (end >> PAGE_SHIFT))
718 end_pfn = end >> PAGE_SHIFT;
719 if (start_pfn < end_pfn) {
720 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
721 pos = end_pfn << PAGE_SHIFT;
722 }
723
724 /* big page (2M) range*/
725 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
726 << (PMD_SHIFT - PAGE_SHIFT);
727 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
728 << (PUD_SHIFT - PAGE_SHIFT);
729 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
730 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
731 if (start_pfn < end_pfn) {
732 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
733 page_size_mask & (1<<PG_LEVEL_2M));
734 pos = end_pfn << PAGE_SHIFT;
735 }
736
737 /* big page (1G) range */
738 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
739 << (PUD_SHIFT - PAGE_SHIFT);
740 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
741 if (start_pfn < end_pfn) {
742 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
743 page_size_mask &
744 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
745 pos = end_pfn << PAGE_SHIFT;
746 }
747
748 /* tail is not big page (1G) alignment */
749 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
750 << (PMD_SHIFT - PAGE_SHIFT);
751 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
752 if (start_pfn < end_pfn) {
753 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
754 page_size_mask & (1<<PG_LEVEL_2M));
755 pos = end_pfn << PAGE_SHIFT;
756 }
757
758 /* tail is not big page (2M) alignment */
759 start_pfn = pos>>PAGE_SHIFT;
760 end_pfn = end>>PAGE_SHIFT;
761 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
762
763 /* try to merge same page size and continuous */
764 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
765 unsigned long old_start;
766 if (mr[i].end != mr[i+1].start ||
767 mr[i].page_size_mask != mr[i+1].page_size_mask)
768 continue;
769 /* move it */
770 old_start = mr[i].start;
771 memmove(&mr[i], &mr[i+1],
772 (nr_range - 1 - i) * sizeof (struct map_range));
773 mr[i--].start = old_start;
774 nr_range--;
775 }
776
777 for (i = 0; i < nr_range; i++)
778 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
779 mr[i].start, mr[i].end,
780 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
781 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
782
783 if (!after_bootmem)
784 find_early_table_space(end, use_pse, use_gbpages);
785
786 for (i = 0; i < nr_range; i++)
787 last_map_addr = kernel_physical_mapping_init(
788 mr[i].start, mr[i].end,
789 mr[i].page_size_mask);
790
791 if (!after_bootmem)
792 mmu_cr4_features = read_cr4();
793 __flush_tlb_all();
794
795 if (!after_bootmem && table_end > table_start)
796 reserve_early(table_start << PAGE_SHIFT,
797 table_end << PAGE_SHIFT, "PGTABLE");
798
799 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
800 last_map_addr, end);
801
802 if (!after_bootmem)
803 early_memtest(start, end);
804
805 return last_map_addr >> PAGE_SHIFT;
806}
807
808#ifndef CONFIG_NUMA 613#ifndef CONFIG_NUMA
809void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 614void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
810{ 615{
@@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
876 681
877#endif /* CONFIG_MEMORY_HOTPLUG */ 682#endif /* CONFIG_MEMORY_HOTPLUG */
878 683
879/*
880 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
881 * is valid. The argument is a physical page number.
882 *
883 *
884 * On x86, access has to be given to the first megabyte of ram because that area
885 * contains bios code and data regions used by X and dosemu and similar apps.
886 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
887 * mmio resources as well as potential bios/acpi data regions.
888 */
889int devmem_is_allowed(unsigned long pagenr)
890{
891 if (pagenr <= 256)
892 return 1;
893 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
894 return 0;
895 if (!page_is_ram(pagenr))
896 return 1;
897 return 0;
898}
899
900
901static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 684static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
902 kcore_modules, kcore_vsyscall; 685 kcore_modules, kcore_vsyscall;
903 686
@@ -985,13 +768,6 @@ void mark_rodata_ro(void)
985 768
986#endif 769#endif
987 770
988#ifdef CONFIG_BLK_DEV_INITRD
989void free_initrd_mem(unsigned long start, unsigned long end)
990{
991 free_init_pages("initrd memory", start, end);
992}
993#endif
994
995int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 771int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
996 int flags) 772 int flags)
997{ 773{
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff4..699c9b2895a 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -31,16 +31,27 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
31} 31}
32EXPORT_SYMBOL_GPL(is_io_mapping_possible); 32EXPORT_SYMBOL_GPL(is_io_mapping_possible);
33 33
34/* Map 'pfn' using fixed map 'type' and protections 'prot' 34void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
35 */
36void *
37iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
38{ 35{
39 enum fixed_addresses idx; 36 enum fixed_addresses idx;
40 unsigned long vaddr; 37 unsigned long vaddr;
41 38
42 pagefault_disable(); 39 pagefault_disable();
43 40
41 idx = type + KM_TYPE_NR * smp_processor_id();
42 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
43 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
44 arch_flush_lazy_mmu_mode();
45
46 return (void *)vaddr;
47}
48
49/*
50 * Map 'pfn' using fixed map 'type' and protections 'prot'
51 */
52void *
53iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
54{
44 /* 55 /*
45 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 56 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
46 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the 57 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +61,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
50 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 61 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
51 prot = PAGE_KERNEL_UC_MINUS; 62 prot = PAGE_KERNEL_UC_MINUS;
52 63
53 idx = type + KM_TYPE_NR*smp_processor_id(); 64 return kmap_atomic_prot_pfn(pfn, type, prot);
54 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
55 set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
56 arch_flush_lazy_mmu_mode();
57
58 return (void*) vaddr;
59} 65}
60EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 66EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
61 67
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648..55e127f71ed 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,13 +22,17 @@
22#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
23#include <asm/pat.h> 23#include <asm/pat.h>
24 24
25#ifdef CONFIG_X86_64 25static inline int phys_addr_valid(resource_size_t addr)
26
27static inline int phys_addr_valid(unsigned long addr)
28{ 26{
29 return addr < (1UL << boot_cpu_data.x86_phys_bits); 27#ifdef CONFIG_PHYS_ADDR_T_64BIT
28 return !(addr >> boot_cpu_data.x86_phys_bits);
29#else
30 return 1;
31#endif
30} 32}
31 33
34#ifdef CONFIG_X86_64
35
32unsigned long __phys_addr(unsigned long x) 36unsigned long __phys_addr(unsigned long x)
33{ 37{
34 if (x >= __START_KERNEL_map) { 38 if (x >= __START_KERNEL_map) {
@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long x)
38 } else { 42 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 43 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET; 44 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : 45 VIRTUAL_BUG_ON(!phys_addr_valid(x));
42 !phys_addr_valid(x));
43 } 46 }
44 return x; 47 return x;
45} 48}
@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x)
56 if (x < PAGE_OFFSET) 59 if (x < PAGE_OFFSET)
57 return false; 60 return false;
58 x -= PAGE_OFFSET; 61 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ? 62 if (!phys_addr_valid(x))
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false; 63 return false;
62 }
63 } 64 }
64 65
65 return pfn_valid(x >> PAGE_SHIFT); 66 return pfn_valid(x >> PAGE_SHIFT);
@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
68 69
69#else 70#else
70 71
71static inline int phys_addr_valid(unsigned long addr)
72{
73 return 1;
74}
75
76#ifdef CONFIG_DEBUG_VIRTUAL 72#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x) 73unsigned long __phys_addr(unsigned long x)
78{ 74{
79 /* VMALLOC_* aren't constants; not available at the boot time */ 75 /* VMALLOC_* aren't constants */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 76 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && 77 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET; 78 return x - PAGE_OFFSET;
84} 79}
85EXPORT_SYMBOL(__phys_addr); 80EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x)
89{ 84{
90 if (x < PAGE_OFFSET) 85 if (x < PAGE_OFFSET)
91 return false; 86 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) 87 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
88 return false;
89 if (x >= FIXADDR_START)
93 return false; 90 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); 91 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95} 92}
@@ -490,7 +487,12 @@ static int __init early_ioremap_debug_setup(char *str)
490early_param("early_ioremap_debug", early_ioremap_debug_setup); 487early_param("early_ioremap_debug", early_ioremap_debug_setup);
491 488
492static __initdata int after_paging_init; 489static __initdata int after_paging_init;
493static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; 490#define __FIXADDR_TOP (-PAGE_SIZE)
491static pte_t bm_pte[(__fix_to_virt(FIX_DBGP_BASE)
492 ^ __fix_to_virt(FIX_BTMAP_BEGIN)) >> PMD_SHIFT
493 ? PAGE_SIZE / sizeof(pte_t) : 0] __page_aligned_bss;
494#undef __FIXADDR_TOP
495static __initdata pte_t *bm_ptep;
494 496
495static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 497static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
496{ 498{
@@ -505,19 +507,33 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
505 507
506static inline pte_t * __init early_ioremap_pte(unsigned long addr) 508static inline pte_t * __init early_ioremap_pte(unsigned long addr)
507{ 509{
510 if (!sizeof(bm_pte))
511 return &bm_ptep[pte_index(addr)];
508 return &bm_pte[pte_index(addr)]; 512 return &bm_pte[pte_index(addr)];
509} 513}
510 514
515static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
516
511void __init early_ioremap_init(void) 517void __init early_ioremap_init(void)
512{ 518{
513 pmd_t *pmd; 519 pmd_t *pmd;
520 int i;
514 521
515 if (early_ioremap_debug) 522 if (early_ioremap_debug)
516 printk(KERN_INFO "early_ioremap_init()\n"); 523 printk(KERN_INFO "early_ioremap_init()\n");
517 524
525 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
526 slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
527
518 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); 528 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
519 memset(bm_pte, 0, sizeof(bm_pte)); 529 if (sizeof(bm_pte)) {
520 pmd_populate_kernel(&init_mm, pmd, bm_pte); 530 memset(bm_pte, 0, sizeof(bm_pte));
531 pmd_populate_kernel(&init_mm, pmd, bm_pte);
532 } else {
533 bm_ptep = pte_offset_kernel(pmd, 0);
534 if (early_ioremap_debug)
535 printk(KERN_INFO "bm_ptep=%p\n", bm_ptep);
536 }
521 537
522 /* 538 /*
523 * The boot-ioremap range spans multiple pmds, for which 539 * The boot-ioremap range spans multiple pmds, for which
@@ -581,6 +597,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
581 597
582static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; 598static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
583static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; 599static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
600
584static int __init check_early_ioremap_leak(void) 601static int __init check_early_ioremap_leak(void)
585{ 602{
586 int count = 0; 603 int count = 0;
@@ -602,7 +619,8 @@ static int __init check_early_ioremap_leak(void)
602} 619}
603late_initcall(check_early_ioremap_leak); 620late_initcall(check_early_ioremap_leak);
604 621
605static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) 622static void __init __iomem *
623__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
606{ 624{
607 unsigned long offset, last_addr; 625 unsigned long offset, last_addr;
608 unsigned int nrpages; 626 unsigned int nrpages;
@@ -668,9 +686,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
668 --nrpages; 686 --nrpages;
669 } 687 }
670 if (early_ioremap_debug) 688 if (early_ioremap_debug)
671 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 689 printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
672 690
673 prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); 691 prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
674 return prev_map[slot]; 692 return prev_map[slot];
675} 693}
676 694
@@ -738,8 +756,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
738 } 756 }
739 prev_map[slot] = NULL; 757 prev_map[slot] = NULL;
740} 758}
741
742void __this_fixmap_does_not_exist(void)
743{
744 WARN_ON(1);
745}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 93d82038af4..4f115e00486 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,11 +32,14 @@ struct kmmio_fault_page {
32 struct list_head list; 32 struct list_head list;
33 struct kmmio_fault_page *release_next; 33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */ 34 unsigned long page; /* location of the fault page */
35 bool old_presence; /* page presence prior to arming */
36 bool armed;
35 37
36 /* 38 /*
37 * Number of times this page has been registered as a part 39 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed. 40 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU). 41 * Used only by writers (RCU) and post_kmmio_handler().
42 * Protected by kmmio_lock, when linked into kmmio_page_table.
40 */ 43 */
41 int count; 44 int count;
42}; 45};
@@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
105 return NULL; 108 return NULL;
106} 109}
107 110
108static void set_page_present(unsigned long addr, bool present, 111static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
109 unsigned int *pglevel) 112{
113 pmdval_t v = pmd_val(*pmd);
114 *old = !!(v & _PAGE_PRESENT);
115 v &= ~_PAGE_PRESENT;
116 if (present)
117 v |= _PAGE_PRESENT;
118 set_pmd(pmd, __pmd(v));
119}
120
121static void set_pte_presence(pte_t *pte, bool present, bool *old)
122{
123 pteval_t v = pte_val(*pte);
124 *old = !!(v & _PAGE_PRESENT);
125 v &= ~_PAGE_PRESENT;
126 if (present)
127 v |= _PAGE_PRESENT;
128 set_pte_atomic(pte, __pte(v));
129}
130
131static int set_page_presence(unsigned long addr, bool present, bool *old)
110{ 132{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level; 133 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level); 134 pte_t *pte = lookup_address(addr, &level);
116 135
117 if (!pte) { 136 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr); 137 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return; 138 return -1;
120 } 139 }
121 140
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) { 141 switch (level) {
126 case PG_LEVEL_2M: 142 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte; 143 set_pmd_presence((pmd_t *)pte, present, old);
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break; 144 break;
133
134 case PG_LEVEL_4K: 145 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT; 146 set_pte_presence(pte, present, old);
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break; 147 break;
140
141 default: 148 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level); 149 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return; 150 return -1;
144 } 151 }
145 152
146 __flush_tlb_one(addr); 153 __flush_tlb_one(addr);
154 return 0;
147} 155}
148 156
149/** Mark the given page as not present. Access to it will trigger a fault. */ 157/*
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) 158 * Mark the given page as not present. Access to it will trigger a fault.
159 *
160 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
161 * protection is ignored here. RCU read lock is assumed held, so the struct
162 * will not disappear unexpectedly. Furthermore, the caller must guarantee,
163 * that double arming the same virtual address (page) cannot occur.
164 *
165 * Double disarming on the other hand is allowed, and may occur when a fault
166 * and mmiotrace shutdown happen simultaneously.
167 */
168static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
151{ 169{
152 set_page_present(page & PAGE_MASK, false, pglevel); 170 int ret;
171 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
172 if (f->armed) {
173 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
174 f->page, f->count, f->old_presence);
175 }
176 ret = set_page_presence(f->page, false, &f->old_presence);
177 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
178 f->armed = true;
179 return ret;
153} 180}
154 181
155/** Mark the given page as present. */ 182/** Restore the given page to saved presence state. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) 183static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
157{ 184{
158 set_page_present(page & PAGE_MASK, true, pglevel); 185 bool tmp;
186 int ret = set_page_presence(f->page, f->old_presence, &tmp);
187 WARN_ONCE(ret < 0,
188 KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
189 f->armed = false;
159} 190}
160 191
161/* 192/*
@@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
202 233
203 ctx = &get_cpu_var(kmmio_ctx); 234 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) { 235 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) { 236 if (addr == ctx->addr) {
207 /* 237 /*
208 * On SMP we sometimes get recursive probe hits on the 238 * A second fault on the same page means some other
209 * same address. Context is already saved, fall out. 239 * condition needs handling by do_page_fault(), the
240 * page really not being present is the most common.
210 */ 241 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for " 242 pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
212 "address 0x%08lx.\n", 243 addr, smp_processor_id());
213 smp_processor_id(), addr); 244
214 ret = 1; 245 if (!faultpage->old_presence)
215 goto no_kmmio_ctx; 246 pr_info("kmmio: unexpected secondary hit for "
216 } 247 "address 0x%08lx on CPU %d.\n", addr,
217 /* 248 smp_processor_id());
218 * Prevent overwriting already in-flight context. 249 } else {
219 * This should not happen, let's hope disarming at least 250 /*
220 * prevents a panic. 251 * Prevent overwriting already in-flight context.
221 */ 252 * This should not happen, let's hope disarming at
222 pr_emerg("kmmio: recursive probe hit on CPU %d, " 253 * least prevents a panic.
254 */
255 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n", 256 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr); 257 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n", 258 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr); 259 ctx->addr);
260 disarm_kmmio_fault_page(faultpage);
261 }
227 goto no_kmmio_ctx; 262 goto no_kmmio_ctx;
228 } 263 }
229 ctx->active++; 264 ctx->active++;
@@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
244 regs->flags &= ~X86_EFLAGS_IF; 279 regs->flags &= ~X86_EFLAGS_IF;
245 280
246 /* Now we set present bit in PTE and single step. */ 281 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL); 282 disarm_kmmio_fault_page(ctx->fpage);
248 283
249 /* 284 /*
250 * If another cpu accesses the same page while we are stepping, 285 * If another cpu accesses the same page while we are stepping,
@@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
283 if (ctx->probe && ctx->probe->post_handler) 318 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs); 319 ctx->probe->post_handler(ctx->probe, condition, regs);
285 320
286 arm_kmmio_fault_page(ctx->fpage->page, NULL); 321 /* Prevent racing against release_kmmio_fault_page(). */
322 spin_lock(&kmmio_lock);
323 if (ctx->fpage->count)
324 arm_kmmio_fault_page(ctx->fpage);
325 spin_unlock(&kmmio_lock);
287 326
288 regs->flags &= ~X86_EFLAGS_TF; 327 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags; 328 regs->flags |= ctx->saved_flags;
@@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page)
315 f = get_kmmio_fault_page(page); 354 f = get_kmmio_fault_page(page);
316 if (f) { 355 if (f) {
317 if (!f->count) 356 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL); 357 arm_kmmio_fault_page(f);
319 f->count++; 358 f->count++;
320 return 0; 359 return 0;
321 } 360 }
322 361
323 f = kmalloc(sizeof(*f), GFP_ATOMIC); 362 f = kzalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f) 363 if (!f)
325 return -1; 364 return -1;
326 365
327 f->count = 1; 366 f->count = 1;
328 f->page = page; 367 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330 368
331 arm_kmmio_fault_page(f->page, NULL); 369 if (arm_kmmio_fault_page(f)) {
370 kfree(f);
371 return -1;
372 }
373
374 list_add_rcu(&f->list, kmmio_page_list(f->page));
332 375
333 return 0; 376 return 0;
334} 377}
@@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page,
347 f->count--; 390 f->count--;
348 BUG_ON(f->count < 0); 391 BUG_ON(f->count < 0);
349 if (!f->count) { 392 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL); 393 disarm_kmmio_fault_page(f);
351 f->release_next = *release_list; 394 f->release_next = *release_list;
352 *release_list = f; 395 *release_list = f;
353 } 396 }
@@ -408,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
408 451
409static void remove_kmmio_fault_pages(struct rcu_head *head) 452static void remove_kmmio_fault_pages(struct rcu_head *head)
410{ 453{
411 struct kmmio_delayed_release *dr = container_of( 454 struct kmmio_delayed_release *dr =
412 head, 455 container_of(head, struct kmmio_delayed_release, rcu);
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list; 456 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list; 457 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags; 458 unsigned long flags;
459
418 spin_lock_irqsave(&kmmio_lock, flags); 460 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) { 461 while (p) {
420 if (!p->count) 462 if (!p->count) {
421 list_del_rcu(&p->list); 463 list_del_rcu(&p->list);
422 else 464 prevp = &p->release_next;
465 } else {
423 *prevp = p->release_next; 466 *prevp = p->release_next;
424 prevp = &p->release_next; 467 }
425 p = p->release_next; 468 p = p->release_next;
426 } 469 }
427 spin_unlock_irqrestore(&kmmio_lock, flags); 470 spin_unlock_irqrestore(&kmmio_lock, flags);
471
428 /* This is the real RCU destroy call. */ 472 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); 473 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430} 474}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd7883d03..605c8be0621 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
100{ 100{
101 if (arg) 101 if (arg)
102 memtest_pattern = simple_strtoul(arg, NULL, 0); 102 memtest_pattern = simple_strtoul(arg, NULL, 0);
103 else
104 memtest_pattern = ARRAY_SIZE(patterns);
105
103 return 0; 106 return 0;
104} 107}
105 108
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95a035..3daefa04ace 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
416 for_each_online_node(nid) 416 for_each_online_node(nid)
417 propagate_e820_map_node(nid); 417 propagate_e820_map_node(nid);
418 418
419 for_each_online_node(nid) 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
422 }
421 423
422 NODE_DATA(0)->bdata = &bootmem_node_data[0];
423 setup_bootmem_allocator(); 424 setup_bootmem_allocator();
424} 425}
425 426
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8253bc97587..1280565670e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
16#include <asm/processor.h> 16#include <asm/processor.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/sections.h> 18#include <asm/sections.h>
19#include <asm/setup.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include <asm/pgalloc.h> 21#include <asm/pgalloc.h>
21#include <asm/proto.h> 22#include <asm/proto.h>
@@ -95,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
95 96
96static inline unsigned long highmap_end_pfn(void) 97static inline unsigned long highmap_end_pfn(void)
97{ 98{
98 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 99 return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
99} 100}
100 101
101#endif 102#endif
@@ -522,6 +523,17 @@ static int split_large_page(pte_t *kpte, unsigned long address)
522 * primary protection behavior: 523 * primary protection behavior:
523 */ 524 */
524 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 525 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
526
527 /*
528 * Intel Atom errata AAH41 workaround.
529 *
530 * The real fix should be in hw or in a microcode update, but
531 * we also probabilistically try to reduce the window of having
532 * a large TLB mixed with 4K TLBs while instruction fetches are
533 * going on.
534 */
535 __flush_tlb_all();
536
525 base = NULL; 537 base = NULL;
526 538
527out_unlock: 539out_unlock:
@@ -700,7 +712,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
700 * No need to redo, when the primary call touched the high 712 * No need to redo, when the primary call touched the high
701 * mapping already: 713 * mapping already:
702 */ 714 */
703 if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) 715 if (within(vaddr, (unsigned long) _text, _brk_end))
704 return 0; 716 return 0;
705 717
706 /* 718 /*
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2ed37158012..640339ee4fb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -677,10 +677,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
677 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 677 is_ram = pat_pagerange_is_ram(paddr, paddr + size);
678 678
679 /* 679 /*
680 * reserve_pfn_range() doesn't support RAM pages. 680 * reserve_pfn_range() doesn't support RAM pages. Maintain the current
681 * behavior with RAM pages by returning success.
681 */ 682 */
682 if (is_ram != 0) 683 if (is_ram != 0)
683 return -EINVAL; 684 return 0;
684 685
685 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); 686 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
686 if (ret) 687 if (ret)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index ab50a8d7402..427fd1b56df 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi> 2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
3 */ 3 */
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/io.h> 5#include <linux/io.h>
@@ -9,35 +9,74 @@
9 9
10static unsigned long mmio_address; 10static unsigned long mmio_address;
11module_param(mmio_address, ulong, 0); 11module_param(mmio_address, ulong, 0);
12MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); 12MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
13 "(or 8 MB if read_far is non-zero).");
14
15static unsigned long read_far = 0x400100;
16module_param(read_far, ulong, 0);
17MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
18 "(default: 0x400100).");
19
20static unsigned v16(unsigned i)
21{
22 return i * 12 + 7;
23}
24
25static unsigned v32(unsigned i)
26{
27 return i * 212371 + 13;
28}
13 29
14static void do_write_test(void __iomem *p) 30static void do_write_test(void __iomem *p)
15{ 31{
16 unsigned int i; 32 unsigned int i;
33 pr_info(MODULE_NAME ": write test.\n");
17 mmiotrace_printk("Write test.\n"); 34 mmiotrace_printk("Write test.\n");
35
18 for (i = 0; i < 256; i++) 36 for (i = 0; i < 256; i++)
19 iowrite8(i, p + i); 37 iowrite8(i, p + i);
38
20 for (i = 1024; i < (5 * 1024); i += 2) 39 for (i = 1024; i < (5 * 1024); i += 2)
21 iowrite16(i * 12 + 7, p + i); 40 iowrite16(v16(i), p + i);
41
22 for (i = (5 * 1024); i < (16 * 1024); i += 4) 42 for (i = (5 * 1024); i < (16 * 1024); i += 4)
23 iowrite32(i * 212371 + 13, p + i); 43 iowrite32(v32(i), p + i);
24} 44}
25 45
26static void do_read_test(void __iomem *p) 46static void do_read_test(void __iomem *p)
27{ 47{
28 unsigned int i; 48 unsigned int i;
49 unsigned errs[3] = { 0 };
50 pr_info(MODULE_NAME ": read test.\n");
29 mmiotrace_printk("Read test.\n"); 51 mmiotrace_printk("Read test.\n");
52
30 for (i = 0; i < 256; i++) 53 for (i = 0; i < 256; i++)
31 ioread8(p + i); 54 if (ioread8(p + i) != i)
55 ++errs[0];
56
32 for (i = 1024; i < (5 * 1024); i += 2) 57 for (i = 1024; i < (5 * 1024); i += 2)
33 ioread16(p + i); 58 if (ioread16(p + i) != v16(i))
59 ++errs[1];
60
34 for (i = (5 * 1024); i < (16 * 1024); i += 4) 61 for (i = (5 * 1024); i < (16 * 1024); i += 4)
35 ioread32(p + i); 62 if (ioread32(p + i) != v32(i))
63 ++errs[2];
64
65 mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
66 errs[0], errs[1], errs[2]);
36} 67}
37 68
38static void do_test(void) 69static void do_read_far_test(void __iomem *p)
39{ 70{
40 void __iomem *p = ioremap_nocache(mmio_address, 0x4000); 71 pr_info(MODULE_NAME ": read far test.\n");
72 mmiotrace_printk("Read far test.\n");
73
74 ioread32(p + read_far);
75}
76
77static void do_test(unsigned long size)
78{
79 void __iomem *p = ioremap_nocache(mmio_address, size);
41 if (!p) { 80 if (!p) {
42 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 81 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
43 return; 82 return;
@@ -45,11 +84,15 @@ static void do_test(void)
45 mmiotrace_printk("ioremap returned %p.\n", p); 84 mmiotrace_printk("ioremap returned %p.\n", p);
46 do_write_test(p); 85 do_write_test(p);
47 do_read_test(p); 86 do_read_test(p);
87 if (read_far && read_far < size - 4)
88 do_read_far_test(p);
48 iounmap(p); 89 iounmap(p);
49} 90}
50 91
51static int __init init(void) 92static int __init init(void)
52{ 93{
94 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
95
53 if (mmio_address == 0) { 96 if (mmio_address == 0) {
54 pr_err(MODULE_NAME ": you have to use the module argument " 97 pr_err(MODULE_NAME ": you have to use the module argument "
55 "mmio_address.\n"); 98 "mmio_address.\n");
@@ -58,10 +101,11 @@ static int __init init(void)
58 return -ENXIO; 101 return -ENXIO;
59 } 102 }
60 103
61 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " 104 pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
62 "in PCI address space, and writing " 105 "address space, and writing 16 kB of rubbish in there.\n",
63 "rubbish in there.\n", mmio_address); 106 size >> 10, mmio_address);
64 do_test(); 107 do_test(size);
108 pr_info(MODULE_NAME ": All done.\n");
65 return 0; 109 return 0;
66} 110}
67 111
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a654d59e448..821e97017e9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,11 +187,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
187 cpumask, cpumask_of(smp_processor_id())); 187 cpumask, cpumask_of(smp_processor_id()));
188 188
189 /* 189 /*
190 * Make the above memory operations globally visible before
191 * sending the IPI.
192 */
193 smp_mb();
194 /*
195 * We have to send the IPI only to 190 * We have to send the IPI only to
196 * CPUs affected. 191 * CPUs affected.
197 */ 192 */
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 82d22fc601a..8c362b96b64 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
90 return 0; 90 return 0;
91} 91}
92 92
93static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = { 93static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
94/* 94/*
95 * Systems where PCI IO resource ISA alignment can be skipped 95 * Systems where PCI IO resource ISA alignment can be skipped
96 * when the ISA enable bit in the bridge control is not set 96 * when the ISA enable bit in the bridge control is not set
@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
183} 183}
184#endif 184#endif
185 185
186static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { 186static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
187#ifdef __i386__ 187#ifdef __i386__
188/* 188/*
189 * Laptops which need pci=assign-busses to see Cardbus cards 189 * Laptops which need pci=assign-busses to see Cardbus cards
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 7d388d5cf54..9c49919e4d1 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
356DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); 356DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
357 357
358 358
359static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = { 359static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
360 { 360 {
361 .ident = "MSI-K8T-Neo2Fir", 361 .ident = "MSI-K8T-Neo2Fir",
362 .matches = { 362 .matches = {
@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
413 */ 413 */
414static u16 toshiba_line_size; 414static u16 toshiba_line_size;
415 415
416static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = { 416static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = {
417 { 417 {
418 .ident = "Toshiba PS5 based laptop", 418 .ident = "Toshiba PS5 based laptop",
419 .matches = { 419 .matches = {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f4034c7f..82cd39a6cbd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
103 103
104 vcpup = &per_cpu(xen_vcpu_info, cpu); 104 vcpup = &per_cpu(xen_vcpu_info, cpu);
105 105
106 info.mfn = virt_to_mfn(vcpup); 106 info.mfn = arbitrary_virt_to_mfn(vcpup);
107 info.offset = offset_in_page(vcpup); 107 info.offset = offset_in_page(vcpup);
108 108
109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", 109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
301 frames = mcs.args; 301 frames = mcs.args;
302 302
303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
304 frames[f] = virt_to_mfn(va); 304 frames[f] = arbitrary_virt_to_mfn((void *)va);
305
305 make_lowmem_page_readonly((void *)va); 306 make_lowmem_page_readonly((void *)va);
307 make_lowmem_page_readonly(mfn_to_virt(frames[f]));
306 } 308 }
307 309
308 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); 310 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
314 unsigned int cpu, unsigned int i) 316 unsigned int cpu, unsigned int i)
315{ 317{
316 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 318 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
317 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 319 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
318 struct multicall_space mc = __xen_mc_entry(0); 320 struct multicall_space mc = __xen_mc_entry(0);
319 321
320 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 322 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
488 break; 490 break;
489 491
490 default: { 492 default: {
491 xmaddr_t maddr = virt_to_machine(&dt[entry]); 493 xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
492 494
493 xen_mc_flush(); 495 xen_mc_flush();
494 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) 496 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c..72f6a76dbfb 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
276 p2m_top[topidx][idx] = mfn; 276 p2m_top[topidx][idx] = mfn;
277} 277}
278 278
279unsigned long arbitrary_virt_to_mfn(void *vaddr)
280{
281 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
282
283 return PFN_DOWN(maddr.maddr);
284}
285
279xmaddr_t arbitrary_virt_to_machine(void *vaddr) 286xmaddr_t arbitrary_virt_to_machine(void *vaddr)
280{ 287{
281 unsigned long address = (unsigned long)vaddr; 288 unsigned long address = (unsigned long)vaddr;
@@ -1716,9 +1723,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1716{ 1723{
1717 pmd_t *kernel_pmd; 1724 pmd_t *kernel_pmd;
1718 1725
1719 init_pg_tables_start = __pa(pgd); 1726 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1720 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1727 xen_start_info->nr_pt_frames * PAGE_SIZE +
1721 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); 1728 512*1024);
1722 1729
1723 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1724 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815..8d470562ffc 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 219{
220 struct vcpu_guest_context *ctxt; 220 struct vcpu_guest_context *ctxt;
221 struct desc_struct *gdt; 221 struct desc_struct *gdt;
222 unsigned long gdt_mfn;
222 223
223 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) 224 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
224 return 0; 225 return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
248 ctxt->ldt_ents = 0; 249 ctxt->ldt_ents = 0;
249 250
250 BUG_ON((unsigned long)gdt & ~PAGE_MASK); 251 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
252
253 gdt_mfn = arbitrary_virt_to_mfn(gdt);
251 make_lowmem_page_readonly(gdt); 254 make_lowmem_page_readonly(gdt);
255 make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
252 256
253 ctxt->gdt_frames[0] = virt_to_mfn(gdt); 257 ctxt->gdt_frames[0] = gdt_mfn;
254 ctxt->gdt_ents = GDT_ENTRIES; 258 ctxt->gdt_ents = GDT_ENTRIES;
255 259
256 ctxt->user_regs.cs = __KERNEL_CS; 260 ctxt->user_regs.cs = __KERNEL_CS;