-rw-r--r--Documentation/x86/earlyprintk.txt101
-rw-r--r--arch/x86/Kconfig15
-rw-r--r--arch/x86/Kconfig.cpu17
-rw-r--r--arch/x86/Makefile29
-rw-r--r--arch/x86/boot/Makefile53
-rw-r--r--arch/x86/boot/header.S29
-rw-r--r--arch/x86/boot/pm.c44
-rw-r--r--arch/x86/boot/pmjump.S1
-rw-r--r--arch/x86/boot/setup.ld3
-rw-r--r--arch/x86/boot/tools/build.c9
-rw-r--r--arch/x86/boot/video-vga.c22
-rw-r--r--arch/x86/include/asm/apic.h37
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/boot.h4
-rw-r--r--arch/x86/include/asm/cacheflush.h3
-rwxr-xr-xarch/x86/include/asm/cpu_debug.h226
-rw-r--r--arch/x86/include/asm/desc.h3
-rw-r--r--arch/x86/include/asm/dmi.h19
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/include/asm/init.h18
-rw-r--r--arch/x86/include/asm/io_apic.h5
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h5
-rw-r--r--arch/x86/include/asm/kexec.h13
-rw-r--r--arch/x86/include/asm/linkage.h19
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/msidef.h1
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/page_32_types.h5
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h19
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pgtable-2level.h7
-rw-r--r--arch/x86/include/asm/pgtable-3level.h17
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_32.h3
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h5
-rw-r--r--arch/x86/include/asm/sections.h7
-rw-r--r--arch/x86/include/asm/setup.h39
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h4
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/alternative.c17
-rw-r--r--arch/x86/kernel/apic/apic.c35
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c18
-rw-r--r--arch/x86/kernel/apic/io_apic.c292
-rw-r--r--arch/x86/kernel/apic/probe_64.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c6
-rw-r--r--arch/x86/kernel/check.c8
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c54
-rw-r--r--arch/x86/kernel/cpu/centaur.c36
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c37
-rw-r--r--arch/x86/kernel/cpu/common.c396
-rw-r--r--arch/x86/kernel/cpu/cpu.h25
-rwxr-xr-xarch/x86/kernel/cpu/cpu_debug.c901
-rw-r--r--arch/x86/kernel/cpu/cyrix.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c32
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c62
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c1101
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c202
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c1069
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h4
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/umc.c2
-rw-r--r--arch/x86/kernel/e820.c142
-rw-r--r--arch/x86/kernel/early_printk.c20
-rw-r--r--arch/x86/kernel/entry_32.S18
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/head32.c5
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S76
-rw-r--r--arch/x86/kernel/i8253.c68
-rw-r--r--arch/x86/kernel/io_delay.c27
-rw-r--r--arch/x86/kernel/irq.c88
-rw-r--r--arch/x86/kernel/irqinit_32.c3
-rw-r--r--arch/x86/kernel/irqinit_64.c3
-rw-r--r--arch/x86/kernel/kdebugfs.c82
-rw-r--r--arch/x86/kernel/kprobes.c2
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/machine_kexec_32.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c99
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/mpparse.c384
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-nommu.c20
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/quirks.c3
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S24
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S189
-rw-r--r--arch/x86/kernel/rtc.c20
-rw-r--r--arch/x86/kernel/setup.c58
-rw-r--r--arch/x86/kernel/signal.c48
-rw-r--r--arch/x86/kernel/smpboot.c78
-rw-r--r--arch/x86/kernel/tlb_uv.c3
-rw-r--r--arch/x86/kernel/topology.c14
-rw-r--r--arch/x86/kernel/uv_time.c393
-rw-r--r--arch/x86/kernel/visws_quirks.c2
-rw-r--r--arch/x86/kernel/vmi_32.c6
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S21
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S101
-rw-r--r--arch/x86/kernel/vsmp_64.c12
-rw-r--r--arch/x86/lguest/boot.c8
-rw-r--r--arch/x86/lib/memcpy_64.S143
-rw-r--r--arch/x86/mm/highmem_32.c24
-rw-r--r--arch/x86/mm/init.c344
-rw-r--r--arch/x86/mm/init_32.c256
-rw-r--r--arch/x86/mm/init_64.c280
-rw-r--r--arch/x86/mm/iomap_32.c26
-rw-r--r--arch/x86/mm/ioremap.c52
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/memtest.c3
-rw-r--r--arch/x86/mm/numa_32.c5
-rw-r--r--arch/x86/mm/pageattr.c147
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/pgtable_32.c2
-rw-r--r--arch/x86/mm/tlb.c5
-rw-r--r--arch/x86/pci/common.c4
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/pci/i386.c3
-rw-r--r--arch/x86/xen/mmu.c7
-rw-r--r--drivers/pci/dmar.c296
-rw-r--r--drivers/pci/intel-iommu.c220
-rw-r--r--drivers/pci/intr_remapping.c110
-rw-r--r--include/linux/dmar.h52
-rw-r--r--include/linux/intel-iommu.h5
-rw-r--r--include/linux/mm.h3
-rw-r--r--mm/memory.c6
145 files changed, 6465 insertions, 3583 deletions
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 000000000000..607b1a016064
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
1
2Mini-HOWTO for using the earlyprintk=dbgp boot option with a
3USB2 Debug port key and a debug cable, on x86 systems.
4
 5You need two computers, the 'USB debug key' special gadget,
 6and two USB cables, connected like this:
7
8 [host/target] <-------> [USB debug key] <-------> [client/console]
9
101. There are three specific hardware requirements:
11
12 a.) Host/target system needs to have USB debug port capability.
13
14 You can check this capability by looking at a 'Debug port' bit in
15 the lspci -vvv output:
16
17 # lspci -vvv
18 ...
19 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
20 Subsystem: Lenovo ThinkPad T61
21 Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
22 Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
23 Latency: 0
24 Interrupt: pin D routed to IRQ 19
25 Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
26 Capabilities: [50] Power Management version 2
27 Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
28 Status: D0 PME-Enable- DSel=0 DScale=0 PME+
29 Capabilities: [58] Debug port: BAR=1 offset=00a0
30 ^^^^^^^^^^^ <==================== [ HERE ]
31 Kernel driver in use: ehci_hcd
32 Kernel modules: ehci-hcd
33 ...
34
35( If your system does not list a debug port capability then you probably
 36 won't be able to use the USB debug key. )
37
38 b.) You also need a Netchip USB debug cable/key:
39
40 http://www.plxtech.com/products/NET2000/NET20DC/default.asp
41
42 This is a small blue plastic connector with two USB connections,
43 it draws power from its USB connections.
44
45 c.) Thirdly, you need a second client/console system with a regular USB port.
46
472. Software requirements:
48
49 a.) On the host/target system:
50
51 You need to enable the following kernel config option:
52
53 CONFIG_EARLY_PRINTK_DBGP=y
54
55 And you need to add the boot command line: "earlyprintk=dbgp".
56 (If you are using Grub, append it to the 'kernel' line in
57 /etc/grub.conf)
58
 59 NOTE: normally the earlyprintk console gets turned off once the
60 regular console is alive - use "earlyprintk=dbgp,keep" to keep
61 this channel open beyond early bootup. This can be useful for
62 debugging crashes under Xorg, etc.
63
64 b.) On the client/console system:
65
66 You should enable the following kernel config option:
67
68 CONFIG_USB_SERIAL_DEBUG=y
69
70 On the next bootup with the modified kernel you should
 71 get one or more /dev/ttyUSBx devices.
72
73 Now this channel of kernel messages is ready to be used: start
74 your favorite terminal emulator (minicom, etc.) and set
75 it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
76 see the raw output.
77
78 c.) On Nvidia Southbridge based systems: the kernel will try to probe
 79 and find out which port has the debug device connected.
80
813. Testing that it works fine:
82
83 You can test the output by using earlyprintk=dbgp,keep and provoking
84 kernel messages on the host/target system. You can provoke a harmless
 85 kernel message, for example, by doing:
86
87 echo h > /proc/sysrq-trigger
88
89 On the host/target system you should see this help line in "dmesg" output:
90
91 SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
92
93 On the client/console system do:
94
95 cat /dev/ttyUSB0
96
97 And you should see the help line above displayed shortly after you've
98 provoked it on the host system.
99
 100If it does not work, please ask about it on the linux-kernel@vger.kernel.org
101mailing list or contact the x86 maintainers.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 06c02c00d7d9..34bc3a89228b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -786,6 +786,11 @@ config X86_MCE_AMD
786 Additional support for AMD specific MCE features such as 786 Additional support for AMD specific MCE features such as
787 the DRAM Error Threshold. 787 the DRAM Error Threshold.
788 788
789config X86_MCE_THRESHOLD
790 depends on X86_MCE_AMD || X86_MCE_INTEL
791 bool
792 default y
793
789config X86_MCE_NONFATAL 794config X86_MCE_NONFATAL
790 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 795 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
791 depends on X86_32 && X86_MCE 796 depends on X86_32 && X86_MCE
@@ -929,6 +934,12 @@ config X86_CPUID
929 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to 934 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
930 /dev/cpu/31/cpuid. 935 /dev/cpu/31/cpuid.
931 936
937config X86_CPU_DEBUG
938 tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
939 ---help---
940 If you select this option, this will provide various x86 CPUs
941 information through debugfs.
942
932choice 943choice
933 prompt "High Memory Support" 944 prompt "High Memory Support"
934 default HIGHMEM4G if !X86_NUMAQ 945 default HIGHMEM4G if !X86_NUMAQ
@@ -1121,7 +1132,7 @@ config NUMA_EMU
1121 1132
1122config NODES_SHIFT 1133config NODES_SHIFT
1123 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP 1134 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
1124 range 1 9 if X86_64 1135 range 1 9
1125 default "9" if MAXSMP 1136 default "9" if MAXSMP
1126 default "6" if X86_64 1137 default "6" if X86_64
1127 default "4" if X86_NUMAQ 1138 default "4" if X86_NUMAQ
@@ -1429,7 +1440,7 @@ config CRASH_DUMP
1429config KEXEC_JUMP 1440config KEXEC_JUMP
1430 bool "kexec jump (EXPERIMENTAL)" 1441 bool "kexec jump (EXPERIMENTAL)"
1431 depends on EXPERIMENTAL 1442 depends on EXPERIMENTAL
1432 depends on KEXEC && HIBERNATION && X86_32 1443 depends on KEXEC && HIBERNATION
1433 ---help--- 1444 ---help---
1434 Jump between original kernel and kexeced kernel and invoke 1445 Jump between original kernel and kexeced kernel and invoke
1435 code in physical address mode via KEXEC 1446 code in physical address mode via KEXEC
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a95eaf0e582a..924e156a85ab 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -456,24 +456,9 @@ config CPU_SUP_AMD
456 456
457 If unsure, say N. 457 If unsure, say N.
458 458
459config CPU_SUP_CENTAUR_32 459config CPU_SUP_CENTAUR
460 default y 460 default y
461 bool "Support Centaur processors" if PROCESSOR_SELECT 461 bool "Support Centaur processors" if PROCESSOR_SELECT
462 depends on !64BIT
463 ---help---
464 This enables detection, tunings and quirks for Centaur processors
465
466 You need this enabled if you want your kernel to run on a
467 Centaur CPU. Disabling this option on other types of CPUs
468 makes the kernel a tiny bit smaller. Disabling it on a Centaur
469 CPU might render the kernel unbootable.
470
471 If unsure, say N.
472
473config CPU_SUP_CENTAUR_64
474 default y
475 bool "Support Centaur processors" if PROCESSOR_SELECT
476 depends on 64BIT
477 ---help--- 462 ---help---
478 This enables detection, tunings and quirks for Centaur processors 463 This enables detection, tunings and quirks for Centaur processors
479 464
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1836191839ee..f05d8c91d9e5 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,34 +153,23 @@ endif
153 153
154boot := arch/x86/boot 154boot := arch/x86/boot
155 155
156PHONY += zImage bzImage compressed zlilo bzlilo \ 156BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
157 zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install 157
158PHONY += bzImage $(BOOT_TARGETS)
158 159
159# Default kernel to build 160# Default kernel to build
160all: bzImage 161all: bzImage
161 162
162# KBUILD_IMAGE specify target image being built 163# KBUILD_IMAGE specify target image being built
163 KBUILD_IMAGE := $(boot)/bzImage 164KBUILD_IMAGE := $(boot)/bzImage
164zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
165 165
166zImage bzImage: vmlinux 166bzImage: vmlinux
167 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 167 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
168 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 168 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
169 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ 169 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
170 170
171compressed: zImage 171$(BOOT_TARGETS): vmlinux
172 172 $(Q)$(MAKE) $(build)=$(boot) $@
173zlilo bzlilo: vmlinux
174 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
175
176zdisk bzdisk: vmlinux
177 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
178
179fdimage fdimage144 fdimage288 isoimage: vmlinux
180 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
181
182install:
183 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
184 173
185PHONY += vdso_install 174PHONY += vdso_install
186vdso_install: 175vdso_install:
@@ -205,7 +194,3 @@ define archhelp
205 echo ' FDARGS="..." arguments for the booted kernel' 194 echo ' FDARGS="..." arguments for the booted kernel'
206 echo ' FDINITRD=file initrd for the booted kernel' 195 echo ' FDINITRD=file initrd for the booted kernel'
207endef 196endef
208
209CLEAN_FILES += arch/x86/boot/fdimage \
210 arch/x86/boot/image.iso \
211 arch/x86/boot/mtools.conf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index c70eff69a1fb..fb737ce5888d 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -6,26 +6,24 @@
6# for more details. 6# for more details.
7# 7#
8# Copyright (C) 1994 by Linus Torvalds 8# Copyright (C) 1994 by Linus Torvalds
9# Changed by many, many contributors over the years.
9# 10#
10 11
11# ROOT_DEV specifies the default root-device when making the image. 12# ROOT_DEV specifies the default root-device when making the image.
12# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case 13# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
13# the default of FLOPPY is used by 'build'. 14# the default of FLOPPY is used by 'build'.
14 15
15ROOT_DEV := CURRENT 16ROOT_DEV := CURRENT
16 17
17# If you want to preset the SVGA mode, uncomment the next line and 18# If you want to preset the SVGA mode, uncomment the next line and
18# set SVGA_MODE to whatever number you want. 19# set SVGA_MODE to whatever number you want.
19# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 20# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
20# The number is the same as you would ordinarily press at bootup. 21# The number is the same as you would ordinarily press at bootup.
21 22
22SVGA_MODE := -DSVGA_MODE=NORMAL_VGA 23SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
23 24
24# If you want the RAM disk device, define this to be the size in blocks. 25targets := vmlinux.bin setup.bin setup.elf bzImage
25 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
26#RAMDISK := -DRAMDISK=512
27
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed 27subdir- := compressed
30 28
31setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
@@ -71,17 +69,13 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
71KBUILD_CFLAGS += $(call cc-option,-m32) 69KBUILD_CFLAGS += $(call cc-option,-m32)
72KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 70KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
73 71
74$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 72$(obj)/bzImage: asflags-y := $(SVGA_MODE)
75$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
76$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
77$(obj)/bzImage: BUILDFLAGS := -b
78 73
79quiet_cmd_image = BUILD $@ 74quiet_cmd_image = BUILD $@
80cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \ 75cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
81 $(obj)/vmlinux.bin $(ROOT_DEV) > $@ 76 $(ROOT_DEV) > $@
82 77
83$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ 78$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
84 $(obj)/vmlinux.bin $(obj)/tools/build FORCE
85 $(call if_changed,image) 79 $(call if_changed,image)
86 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' 80 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
87 81
@@ -116,9 +110,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
116$(obj)/compressed/vmlinux: FORCE 110$(obj)/compressed/vmlinux: FORCE
117 $(Q)$(MAKE) $(build)=$(obj)/compressed $@ 111 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
118 112
119# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 113# Set this if you want to pass append arguments to the
114# bzdisk/fdimage/isoimage kernel
120FDARGS = 115FDARGS =
121# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel 116# Set this if you want an initrd included with the
117# bzdisk/fdimage/isoimage kernel
122FDINITRD = 118FDINITRD =
123 119
124image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) 120image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
@@ -127,7 +123,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.in
127 sed -e 's|@OBJ@|$(obj)|g' < $< > $@ 123 sed -e 's|@OBJ@|$(obj)|g' < $< > $@
128 124
129# This requires write access to /dev/fd0 125# This requires write access to /dev/fd0
130zdisk: $(BOOTIMAGE) $(obj)/mtools.conf 126bzdisk: $(obj)/bzImage $(obj)/mtools.conf
131 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync 127 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
132 syslinux /dev/fd0 ; sync 128 syslinux /dev/fd0 ; sync
133 echo '$(image_cmdline)' | \ 129 echo '$(image_cmdline)' | \
@@ -135,10 +131,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
135 if [ -f '$(FDINITRD)' ] ; then \ 131 if [ -f '$(FDINITRD)' ] ; then \
136 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ 132 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
137 fi 133 fi
138 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync 134 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
139 135
140# These require being root or having syslinux 2.02 or higher installed 136# These require being root or having syslinux 2.02 or higher installed
141fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf 137fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
142 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 138 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
143 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync 139 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
144 syslinux $(obj)/fdimage ; sync 140 syslinux $(obj)/fdimage ; sync
@@ -147,9 +143,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
147 if [ -f '$(FDINITRD)' ] ; then \ 143 if [ -f '$(FDINITRD)' ] ; then \
148 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ 144 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
149 fi 145 fi
150 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync 146 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
151 147
152fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf 148fdimage288: $(obj)/bzImage $(obj)/mtools.conf
153 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 149 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
154 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync 150 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
155 syslinux $(obj)/fdimage ; sync 151 syslinux $(obj)/fdimage ; sync
@@ -158,9 +154,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
158 if [ -f '$(FDINITRD)' ] ; then \ 154 if [ -f '$(FDINITRD)' ] ; then \
159 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ 155 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
160 fi 156 fi
161 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync 157 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
162 158
163isoimage: $(BOOTIMAGE) 159isoimage: $(obj)/bzImage
164 -rm -rf $(obj)/isoimage 160 -rm -rf $(obj)/isoimage
165 mkdir $(obj)/isoimage 161 mkdir $(obj)/isoimage
166 for i in lib lib64 share end ; do \ 162 for i in lib lib64 share end ; do \
@@ -170,7 +166,7 @@ isoimage: $(BOOTIMAGE)
170 fi ; \ 166 fi ; \
171 if [ $$i = end ] ; then exit 1 ; fi ; \ 167 if [ $$i = end ] ; then exit 1 ; fi ; \
172 done 168 done
173 cp $(BOOTIMAGE) $(obj)/isoimage/linux 169 cp $(obj)/bzImage $(obj)/isoimage/linux
174 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg 170 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
175 if [ -f '$(FDINITRD)' ] ; then \ 171 if [ -f '$(FDINITRD)' ] ; then \
176 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ 172 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
@@ -181,12 +177,13 @@ isoimage: $(BOOTIMAGE)
181 isohybrid $(obj)/image.iso 2>/dev/null || true 177 isohybrid $(obj)/image.iso 2>/dev/null || true
182 rm -rf $(obj)/isoimage 178 rm -rf $(obj)/isoimage
183 179
184zlilo: $(BOOTIMAGE) 180bzlilo: $(obj)/bzImage
185 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi 181 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
186 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi 182 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
187 cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz 183 cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
188 cp System.map $(INSTALL_PATH)/ 184 cp System.map $(INSTALL_PATH)/
189 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi 185 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
190 186
191install: 187install:
192 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" 188 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
189 System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 7ccff4884a23..5d84d1c74e4c 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -24,12 +24,8 @@
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "offsets.h"
26 26
27SETUPSECTS = 4 /* default nr of setup-sectors */
28BOOTSEG = 0x07C0 /* original address of boot-sector */ 27BOOTSEG = 0x07C0 /* original address of boot-sector */
29SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ 28SYSSEG = 0x1000 /* historical load address >> 4 */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33 29
34#ifndef SVGA_MODE 30#ifndef SVGA_MODE
35#define SVGA_MODE ASK_VGA 31#define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
97 .section ".header", "a" 93 .section ".header", "a"
98 .globl hdr 94 .globl hdr
99hdr: 95hdr:
100setup_sects: .byte SETUPSECTS 96setup_sects: .byte 0 /* Filled in by build.c */
101root_flags: .word ROOT_RDONLY 97root_flags: .word ROOT_RDONLY
102syssize: .long SYSSIZE 98syssize: .long 0 /* Filled in by build.c */
103ram_size: .word RAMDISK 99ram_size: .word 0 /* Obsolete */
104vid_mode: .word SVGA_MODE 100vid_mode: .word SVGA_MODE
105root_dev: .word ROOT_DEV 101root_dev: .word 0 /* Filled in by build.c */
106boot_flag: .word 0xAA55 102boot_flag: .word 0xAA55
107 103
108 # offset 512, entry point 104 # offset 512, entry point
@@ -123,14 +119,15 @@ _start:
123 # or else old loadlin-1.5 will fail) 119 # or else old loadlin-1.5 will fail)
124 .globl realmode_swtch 120 .globl realmode_swtch
125realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
126start_sys_seg: .word SYSSEG 122start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
123 # in case something decided to "use" it
127 .word kernel_version-512 # pointing to kernel version string 124 .word kernel_version-512 # pointing to kernel version string
128 # above section of header is compatible 125 # above section of header is compatible
129 # with loadlin-1.5 (header v1.5). Don't 126 # with loadlin-1.5 (header v1.5). Don't
130 # change it. 127 # change it.
131 128
132type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, 129type_of_loader: .byte 0 # 0 means ancient bootloader, newer
133 # Bootlin, SYSLX, bootsect...) 130 # bootloaders know to change this.
134 # See Documentation/i386/boot.txt for 131 # See Documentation/i386/boot.txt for
135 # assigned ids 132 # assigned ids
136 133
@@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the loader also has set
142 # space behind setup.S can be used for 139 # space behind setup.S can be used for
143 # heap purposes. 140 # heap purposes.
144 # Only the loader knows what is free 141 # Only the loader knows what is free
145#ifndef __BIG_KERNEL__
146 .byte 0
147#else
148 .byte LOADED_HIGH 142 .byte LOADED_HIGH
149#endif
150 143
151setup_move_size: .word 0x8000 # size to move, when setup is not 144setup_move_size: .word 0x8000 # size to move, when setup is not
152 # loaded at 0x90000. We will move setup 145 # loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size to move, when setup is not
157 150
158code32_start: # here loaders can put a different 151code32_start: # here loaders can put a different
159 # start address for 32-bit code. 152 # start address for 32-bit code.
160#ifndef __BIG_KERNEL__
161 .long 0x1000 # 0x1000 = default for zImage
162#else
163 .long 0x100000 # 0x100000 = default for big kernel 153 .long 0x100000 # 0x100000 = default for big kernel
164#endif
165 154
166ramdisk_image: .long 0 # address of loaded ramdisk image 155ramdisk_image: .long 0 # address of loaded ramdisk image
167 # Here the loader puts the 32-bit 156 # Here the loader puts the 32-bit
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 85a1cd8a8ff8..8062f8915250 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -33,47 +33,6 @@ static void realmode_switch_hook(void)
33} 33}
34 34
35/* 35/*
36 * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
37 * A bzImage kernel is loaded and runs at 0x100000.
38 */
39static void move_kernel_around(void)
40{
41 /* Note: rely on the compile-time option here rather than
42 the LOADED_HIGH flag. The Qemu kernel loader unconditionally
43 sets the loadflags to zero. */
44#ifndef __BIG_KERNEL__
45 u16 dst_seg, src_seg;
46 u32 syssize;
47
48 dst_seg = 0x1000 >> 4;
49 src_seg = 0x10000 >> 4;
50 syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
51
52 while (syssize) {
53 int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
54 int dwords = paras << 2;
55
56 asm volatile("pushw %%es ; "
57 "pushw %%ds ; "
58 "movw %1,%%es ; "
59 "movw %2,%%ds ; "
60 "xorw %%di,%%di ; "
61 "xorw %%si,%%si ; "
62 "rep;movsl ; "
63 "popw %%ds ; "
64 "popw %%es"
65 : "+c" (dwords)
66 : "r" (dst_seg), "r" (src_seg)
67 : "esi", "edi");
68
69 syssize -= paras;
70 dst_seg += paras;
71 src_seg += paras;
72 }
73#endif
74}
75
76/*
77 * Disable all interrupts at the legacy PIC. 36 * Disable all interrupts at the legacy PIC.
78 */ 37 */
79static void mask_all_interrupts(void) 38static void mask_all_interrupts(void)
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
147 /* Hook before leaving real mode, also disables interrupts */ 106 /* Hook before leaving real mode, also disables interrupts */
148 realmode_switch_hook(); 107 realmode_switch_hook();
149 108
150 /* Move the kernel/setup to their final resting places */
151 move_kernel_around();
152
153 /* Enable the A20 gate */ 109 /* Enable the A20 gate */
154 if (enable_a20()) { 110 if (enable_a20()) {
155 puts("A20 gate not responding, unable to boot...\n"); 111 puts("A20 gate not responding, unable to boot...\n");
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index 019c17a75851..3e0edc6d2a20 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -47,6 +47,7 @@ GLOBAL(protected_mode_jump)
47ENDPROC(protected_mode_jump) 47ENDPROC(protected_mode_jump)
48 48
49 .code32 49 .code32
50 .section ".text32","ax"
50GLOBAL(in_pm32) 51GLOBAL(in_pm32)
51 # Set up data segments for flat 32-bit mode 52 # Set up data segments for flat 32-bit mode
52 movl %ecx, %ds 53 movl %ecx, %ds
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index df9234b3a5e0..bb8dc2de7969 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -17,7 +17,8 @@ SECTIONS
17 .header : { *(.header) } 17 .header : { *(.header) }
18 .inittext : { *(.inittext) } 18 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) } 19 .initdata : { *(.initdata) }
20 .text : { *(.text*) } 20 .text : { *(.text) }
21 .text32 : { *(.text32) }
21 22
22 . = ALIGN(16); 23 . = ALIGN(16);
23 .rodata : { *(.rodata*) } 24 .rodata : { *(.rodata*) }
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 44dc1923c0e3..ee3a4ea923ac 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
130 130
131static void usage(void) 131static void usage(void)
132{ 132{
133 die("Usage: build [-b] setup system [rootdev] [> image]"); 133 die("Usage: build setup system [rootdev] [> image]");
134} 134}
135 135
136int main(int argc, char ** argv) 136int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
145 void *kernel; 145 void *kernel;
146 u32 crc = 0xffffffffUL; 146 u32 crc = 0xffffffffUL;
147 147
148 if (argc > 2 && !strcmp(argv[1], "-b"))
149 {
150 is_big_kernel = 1;
151 argc--, argv++;
152 }
153 if ((argc < 3) || (argc > 4)) 148 if ((argc < 3) || (argc > 4))
154 usage(); 149 usage();
155 if (argc > 3) { 150 if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
216 die("Unable to mmap '%s': %m", argv[2]); 211 die("Unable to mmap '%s': %m", argv[2]);
217 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ 212 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
218 sys_size = (sz + 15 + 4) / 16; 213 sys_size = (sz + 15 + 4) / 16;
219 if (!is_big_kernel && sys_size > DEF_SYSSIZE)
220 die("System is too big. Try using bzImage or modules.");
221 214
222 /* Patch the setup code with the appropriate size parameters */ 215 /* Patch the setup code with the appropriate size parameters */
223 buf[0x1f1] = setup_sectors-1; 216 buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 5d4742ed4aa2..95d86ce0421c 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -129,41 +129,45 @@ u16 vga_crtc(void)
129 return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4; 129 return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
130} 130}
131 131
132static void vga_set_480_scanlines(int end) 132static void vga_set_480_scanlines(int lines)
133{ 133{
134 u16 crtc; 134 u16 crtc; /* CRTC base address */
135 u8 csel; 135 u8 csel; /* CRTC miscellaneous output register */
136 u8 ovfw; /* CRTC overflow register */
137 int end = lines-1;
136 138
137 crtc = vga_crtc(); 139 crtc = vga_crtc();
138 140
141 ovfw = 0x3c | ((end >> (8-1)) & 0x02) | ((end >> (9-6)) & 0x40);
142
139 out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */ 143 out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
140 out_idx(0x0b, crtc, 0x06); /* Vertical total */ 144 out_idx(0x0b, crtc, 0x06); /* Vertical total */
141 out_idx(0x3e, crtc, 0x07); /* Vertical overflow */ 145 out_idx(ovfw, crtc, 0x07); /* Vertical overflow */
142 out_idx(0xea, crtc, 0x10); /* Vertical sync start */ 146 out_idx(0xea, crtc, 0x10); /* Vertical sync start */
143 out_idx(end, crtc, 0x12); /* Vertical display end */ 147 out_idx(end, crtc, 0x12); /* Vertical display end */
144 out_idx(0xe7, crtc, 0x15); /* Vertical blank start */ 148 out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
145 out_idx(0x04, crtc, 0x16); /* Vertical blank end */ 149 out_idx(0x04, crtc, 0x16); /* Vertical blank end */
146 csel = inb(0x3cc); 150 csel = inb(0x3cc);
147 csel &= 0x0d; 151 csel &= 0x0d;
148 csel |= 0xe2; 152 csel |= 0xe2;
149 outb(csel, 0x3cc); 153 outb(csel, 0x3c2);
150} 154}
151 155
152static void vga_set_80x30(void) 156static void vga_set_80x30(void)
153{ 157{
154 vga_set_480_scanlines(0xdf); 158 vga_set_480_scanlines(30*16);
155} 159}
156 160
157static void vga_set_80x34(void) 161static void vga_set_80x34(void)
158{ 162{
159 vga_set_14font(); 163 vga_set_14font();
160 vga_set_480_scanlines(0xdb); 164 vga_set_480_scanlines(34*14);
161} 165}
162 166
163static void vga_set_80x60(void) 167static void vga_set_80x60(void)
164{ 168{
165 vga_set_8font(); 169 vga_set_8font();
166 vga_set_480_scanlines(0xdf); 170 vga_set_480_scanlines(60*8);
167} 171}
168 172
169static int vga_set_mode(struct mode_info *mode) 173static int vga_set_mode(struct mode_info *mode)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4ef949c1972e..df8a300dfe6c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -75,7 +75,7 @@ static inline void default_inquire_remote_apic(int apicid)
75#define setup_secondary_clock setup_secondary_APIC_clock 75#define setup_secondary_clock setup_secondary_APIC_clock
76#endif 76#endif
77 77
78#ifdef CONFIG_X86_VSMP 78#ifdef CONFIG_X86_64
79extern int is_vsmp_box(void); 79extern int is_vsmp_box(void);
80#else 80#else
81static inline int is_vsmp_box(void) 81static inline int is_vsmp_box(void)
@@ -108,6 +108,16 @@ extern void native_apic_icr_write(u32 low, u32 id);
108extern u64 native_apic_icr_read(void); 108extern u64 native_apic_icr_read(void);
109 109
110#ifdef CONFIG_X86_X2APIC 110#ifdef CONFIG_X86_X2APIC
111/*
112 * Make previous memory operations globally visible before
113 * sending the IPI through x2apic wrmsr. We need a serializing instruction or
114 * mfence for this.
115 */
116static inline void x2apic_wrmsr_fence(void)
117{
118 asm volatile("mfence" : : : "memory");
119}
120
111static inline void native_apic_msr_write(u32 reg, u32 v) 121static inline void native_apic_msr_write(u32 reg, u32 v)
112{ 122{
113 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || 123 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -184,6 +194,9 @@ static inline int x2apic_enabled(void)
184{ 194{
185 return 0; 195 return 0;
186} 196}
197
198#define x2apic 0
199
187#endif 200#endif
188 201
189extern int get_physical_broadcast(void); 202extern int get_physical_broadcast(void);
@@ -379,6 +392,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
379 392
380static inline void ack_APIC_irq(void) 393static inline void ack_APIC_irq(void)
381{ 394{
395#ifdef CONFIG_X86_LOCAL_APIC
382 /* 396 /*
383 * ack_APIC_irq() actually gets compiled as a single instruction 397 * ack_APIC_irq() actually gets compiled as a single instruction
384 * ... yummie. 398 * ... yummie.
@@ -386,6 +400,7 @@ static inline void ack_APIC_irq(void)
386 400
387 /* Docs say use 0 for future compatibility */ 401 /* Docs say use 0 for future compatibility */
388 apic_write(APIC_EOI, 0); 402 apic_write(APIC_EOI, 0);
403#endif
389} 404}
390 405
391static inline unsigned default_get_apic_id(unsigned long x) 406static inline unsigned default_get_apic_id(unsigned long x)
@@ -474,10 +489,19 @@ static inline int default_apic_id_registered(void)
474 return physid_isset(read_apic_id(), phys_cpu_present_map); 489 return physid_isset(read_apic_id(), phys_cpu_present_map);
475} 490}
476 491
492static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
493{
494 return cpuid_apic >> index_msb;
495}
496
497extern int default_apicid_to_node(int logical_apicid);
498
499#endif
500
477static inline unsigned int 501static inline unsigned int
478default_cpu_mask_to_apicid(const struct cpumask *cpumask) 502default_cpu_mask_to_apicid(const struct cpumask *cpumask)
479{ 503{
480 return cpumask_bits(cpumask)[0]; 504 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
481} 505}
482 506
483static inline unsigned int 507static inline unsigned int
@@ -491,15 +515,6 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
491 return (unsigned int)(mask1 & mask2 & mask3); 515 return (unsigned int)(mask1 & mask2 & mask3);
492} 516}
493 517
494static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
495{
496 return cpuid_apic >> index_msb;
497}
498
499extern int default_apicid_to_node(int logical_apicid);
500
501#endif
502
503static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) 518static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
504{ 519{
505 return physid_isset(apicid, bitmap); 520 return physid_isset(apicid, bitmap);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..bc9514fb3b13 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
53#define APIC_ESR_SENDILL 0x00020 53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040 54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080 55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_LVTCMCI 0x2f0
56#define APIC_ICR 0x300 57#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000 58#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000 59#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6526cf08b0e4..6ba23dd9fc92 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,10 +1,6 @@
1#ifndef _ASM_X86_BOOT_H 1#ifndef _ASM_X86_BOOT_H
2#define _ASM_X86_BOOT_H 2#define _ASM_X86_BOOT_H
3 3
4/* Don't touch these, unless you really know what you're doing. */
5#define DEF_SYSSEG 0x1000
6#define DEF_SYSSIZE 0x7F00
7
8/* Internal svga startup constants */ 4/* Internal svga startup constants */
9#define NORMAL_VGA 0xffff /* 80x25 mode */ 5#define NORMAL_VGA 0xffff /* 80x25 mode */
10#define EXTENDED_VGA 0xfffe /* 80x50 mode */ 6#define EXTENDED_VGA 0xfffe /* 80x50 mode */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 5b301b7ff5f4..b3894bf52fcd 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -90,6 +90,9 @@ int set_memory_4k(unsigned long addr, int numpages);
90int set_memory_array_uc(unsigned long *addr, int addrinarray); 90int set_memory_array_uc(unsigned long *addr, int addrinarray);
91int set_memory_array_wb(unsigned long *addr, int addrinarray); 91int set_memory_array_wb(unsigned long *addr, int addrinarray);
92 92
93int set_pages_array_uc(struct page **pages, int addrinarray);
94int set_pages_array_wb(struct page **pages, int addrinarray);
95
93/* 96/*
94 * For legacy compatibility with the old APIs, a few functions 97 * For legacy compatibility with the old APIs, a few functions
95 * are provided that work on a "struct page". 98 * are provided that work on a "struct page".
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755
index 000000000000..222802029fa6
--- /dev/null
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -0,0 +1,226 @@
1#ifndef _ASM_X86_CPU_DEBUG_H
2#define _ASM_X86_CPU_DEBUG_H
3
4/*
5 * CPU x86 architecture debug
6 *
7 * Copyright(C) 2009 Jaswinder Singh Rajput
8 */
9
10/* Register flags */
11enum cpu_debug_bit {
12/* Model Specific Registers (MSRs) */
13 CPU_MC_BIT, /* Machine Check */
14 CPU_MONITOR_BIT, /* Monitor */
15 CPU_TIME_BIT, /* Time */
16 CPU_PMC_BIT, /* Performance Monitor */
17 CPU_PLATFORM_BIT, /* Platform */
18 CPU_APIC_BIT, /* APIC */
19 CPU_POWERON_BIT, /* Power-on */
20 CPU_CONTROL_BIT, /* Control */
21 CPU_FEATURES_BIT, /* Features control */
22 CPU_LBRANCH_BIT, /* Last Branch */
23 CPU_BIOS_BIT, /* BIOS */
24 CPU_FREQ_BIT, /* Frequency */
25 CPU_MTTR_BIT, /* MTRR */
26 CPU_PERF_BIT, /* Performance */
27 CPU_CACHE_BIT, /* Cache */
28 CPU_SYSENTER_BIT, /* Sysenter */
29 CPU_THERM_BIT, /* Thermal */
30 CPU_MISC_BIT, /* Miscellaneous */
31 CPU_DEBUG_BIT, /* Debug */
32 CPU_PAT_BIT, /* PAT */
33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */
36 CPU_VER_BIT, /* Version ID */
37 CPU_CONF_BIT, /* Configuration */
38 CPU_SMM_BIT, /* System mgmt mode */
 39	CPU_SVM_BIT,			/* Secure Virtual Machine	*/
40 CPU_OSVM_BIT, /* OS-Visible Workaround*/
41/* Standard Registers */
42 CPU_TSS_BIT, /* Task Stack Segment */
43 CPU_CR_BIT, /* Control Registers */
44 CPU_DT_BIT, /* Descriptor Table */
45/* End of Registers flags */
46 CPU_REG_ALL_BIT, /* Select all Registers */
47};
48
49#define CPU_REG_ALL (~0) /* Select all Registers */
50
51#define CPU_MC (1 << CPU_MC_BIT)
52#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
53#define CPU_TIME (1 << CPU_TIME_BIT)
54#define CPU_PMC (1 << CPU_PMC_BIT)
55#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
56#define CPU_APIC (1 << CPU_APIC_BIT)
57#define CPU_POWERON (1 << CPU_POWERON_BIT)
58#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
59#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
60#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
61#define CPU_BIOS (1 << CPU_BIOS_BIT)
62#define CPU_FREQ (1 << CPU_FREQ_BIT)
63#define CPU_MTRR (1 << CPU_MTTR_BIT)
64#define CPU_PERF (1 << CPU_PERF_BIT)
65#define CPU_CACHE (1 << CPU_CACHE_BIT)
66#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
67#define CPU_THERM (1 << CPU_THERM_BIT)
68#define CPU_MISC (1 << CPU_MISC_BIT)
69#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
70#define CPU_PAT (1 << CPU_PAT_BIT)
71#define CPU_VMX (1 << CPU_VMX_BIT)
72#define CPU_CALL (1 << CPU_CALL_BIT)
73#define CPU_BASE (1 << CPU_BASE_BIT)
74#define CPU_VER (1 << CPU_VER_BIT)
75#define CPU_CONF (1 << CPU_CONF_BIT)
76#define CPU_SMM (1 << CPU_SMM_BIT)
77#define CPU_SVM (1 << CPU_SVM_BIT)
78#define CPU_OSVM (1 << CPU_OSVM_BIT)
79#define CPU_TSS (1 << CPU_TSS_BIT)
80#define CPU_CR (1 << CPU_CR_BIT)
81#define CPU_DT (1 << CPU_DT_BIT)
82
83/* Register file flags */
84enum cpu_file_bit {
85 CPU_INDEX_BIT, /* index */
86 CPU_VALUE_BIT, /* value */
87};
88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91/*
92 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
93 * -------------------------- ------------------------------------------
94 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
95 *
96 * 06_01 Pentium Pro
97 * 06_03, 06_05 Pentium II Xeon, Pentium II
 98 * 06_07, 06_08, 06_0A, 06_0B	Pentium III Xeon, Pentium III
99 *
 100 * 06_09, 06_0D			Pentium M
101 *
102 * 06_0E Core Duo, Core Solo
103 *
104 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
105 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
106 * Pentium dual-core
107 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
108 *
109 * 06_1C Atom
110 *
111 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
112 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
113 *
114 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
115 * Pentium 4, Pentium D
116 */
117
118/* Register processors bits */
119enum cpu_processor_bit {
120 CPU_NONE,
121/* Intel */
122 CPU_INTEL_PENTIUM_BIT,
123 CPU_INTEL_P6_BIT,
124 CPU_INTEL_PENTIUM_M_BIT,
125 CPU_INTEL_CORE_BIT,
126 CPU_INTEL_CORE2_BIT,
127 CPU_INTEL_ATOM_BIT,
128 CPU_INTEL_XEON_P4_BIT,
129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
137};
138
139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
142#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
143#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
144#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
145#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
146#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
147
148#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
149#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
150#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
151#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
152#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
153#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
154#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
155#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
156#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
157#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
158#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
159#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
160#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
161#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
162#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
163#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
164#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
167
168/* Select all supported Intel CPUs */
169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188
189#define MAX_CPU_FILES 512
190
191struct cpu_private {
192 unsigned cpu;
193 unsigned type;
194 unsigned reg;
195 unsigned file;
196};
197
198struct cpu_debug_base {
199 char *name; /* Register name */
200 unsigned flag; /* Register flag */
201 unsigned write; /* Register write flag */
202};
203
204/*
205 * Currently it looks similar to cpu_debug_base but once we add more files
206 * cpu_file_base will go in different direction
207 */
208struct cpu_file_base {
209 char *name; /* Register file name */
210 unsigned flag; /* Register file flag */
211 unsigned write; /* Register write flag */
212};
213
214struct cpu_cpuX_base {
215 struct dentry *dentry; /* Register dentry */
216 int init; /* Register index file */
217};
218
219struct cpu_debug_range {
220 unsigned min; /* Register range min */
221 unsigned max; /* Register range max */
222 unsigned flag; /* Supported flags */
223 unsigned model; /* Supported models */
224};
225
226#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705f5443..5623c50d67b2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
91#define store_gdt(dtr) native_store_gdt(dtr) 91#define store_gdt(dtr) native_store_gdt(dtr)
92#define store_idt(dtr) native_store_idt(dtr) 92#define store_idt(dtr) native_store_idt(dtr)
93#define store_tr(tr) (tr = native_store_tr()) 93#define store_tr(tr) (tr = native_store_tr())
94#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
95 94
96#define load_TLS(t, cpu) native_load_tls(t, cpu) 95#define load_TLS(t, cpu) native_load_tls(t, cpu)
97#define set_ldt native_set_ldt 96#define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
112} 111}
113#endif /* CONFIG_PARAVIRT */ 112#endif /* CONFIG_PARAVIRT */
114 113
114#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
115
115static inline void native_write_idt_entry(gate_desc *idt, int entry, 116static inline void native_write_idt_entry(gate_desc *idt, int entry,
116 const gate_desc *gate) 117 const gate_desc *gate)
117{ 118{
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index bc68212c6bc0..fd8f9e2ca35f 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -1,22 +1,15 @@
1#ifndef _ASM_X86_DMI_H 1#ifndef _ASM_X86_DMI_H
2#define _ASM_X86_DMI_H 2#define _ASM_X86_DMI_H
3 3
4#include <asm/io.h> 4#include <linux/compiler.h>
5 5#include <linux/init.h>
6#define DMI_MAX_DATA 2048
7 6
8extern int dmi_alloc_index; 7#include <asm/io.h>
9extern char dmi_alloc_data[DMI_MAX_DATA]; 8#include <asm/setup.h>
10 9
11/* This is so early that there is no good way to allocate dynamic memory. 10static __always_inline __init void *dmi_alloc(unsigned len)
12 Allocate data in an BSS array. */
13static inline void *dmi_alloc(unsigned len)
14{ 11{
15 int idx = dmi_alloc_index; 12 return extend_brk(len, sizeof(int));
16 if ((dmi_alloc_index + len) > DMI_MAX_DATA)
17 return NULL;
18 dmi_alloc_index += len;
19 return dmi_alloc_data + idx;
20} 13}
21 14
22/* Use early IO mappings for DMI because it's initialized early */ 15/* Use early IO mappings for DMI because it's initialized early */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 00d41ce4c844..7ecba4d85089 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,7 +72,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type);
72extern void e820_add_region(u64 start, u64 size, int type); 72extern void e820_add_region(u64 start, u64 size, int type);
73extern void e820_print_map(char *who); 73extern void e820_print_map(char *who);
74extern int 74extern int
75sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); 75sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
76extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, 76extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
77 unsigned new_type); 77 unsigned new_type);
78extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, 78extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae857..c2e6bedaf258 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
33 smp_invalidate_interrupt) 33 smp_invalidate_interrupt)
34#endif 34#endif
35 35
36BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
37
36/* 38/*
37 * every pentium local APIC has two 'local interrupts', with a 39 * every pentium local APIC has two 'local interrupts', with a
38 * soft-definable vector attached to both interrupts, one of 40 * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 176f058e7159..039db6aa8e02 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */
15#ifdef CONFIG_SMP 16#ifdef CONFIG_SMP
16 unsigned int irq_resched_count; 17 unsigned int irq_resched_count;
17 unsigned int irq_call_count; 18 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index bf9276bea660..014c2b85ae45 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic(struct page *page, enum km_type type);
64void kunmap_atomic(void *kvaddr, enum km_type type); 64void kunmap_atomic(void *kvaddr, enum km_type type);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
66struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
67 68
68#ifndef CONFIG_PARAVIRT 69#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 370e1c83bb49..b762ea49bd70 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void);
30extern void error_interrupt(void); 31extern void error_interrupt(void);
31extern void spurious_interrupt(void); 32extern void spurious_interrupt(void);
32extern void thermal_interrupt(void); 33extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 000000000000..36fb1a6a5109
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_INIT_32_H
2#define _ASM_X86_INIT_32_H
3
4#ifdef CONFIG_X86_32
5extern void __init early_ioremap_page_table_range_init(void);
6#endif
7
8extern unsigned long __init
9kernel_physical_mapping_init(unsigned long start,
10 unsigned long end,
11 unsigned long page_size_mask);
12
13
14extern unsigned long __initdata e820_table_start;
15extern unsigned long __meminitdata e820_table_end;
16extern unsigned long __meminitdata e820_table_top;
17
18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 59cb4a1317b7..373cc2bbcad2 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -162,7 +162,8 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq);
162extern void ioapic_init_mappings(void); 162extern void ioapic_init_mappings(void);
163 163
164#ifdef CONFIG_X86_64 164#ifdef CONFIG_X86_64
165extern int save_mask_IO_APIC_setup(void); 165extern int save_IO_APIC_setup(void);
166extern void mask_IO_APIC_setup(void);
166extern void restore_IO_APIC_setup(void); 167extern void restore_IO_APIC_setup(void);
167extern void reinit_intr_remapped_IO_APIC(int); 168extern void reinit_intr_remapped_IO_APIC(int);
168#endif 169#endif
@@ -172,7 +173,7 @@ extern void probe_nr_irqs_gsi(void);
172extern int setup_ioapic_entry(int apic, int irq, 173extern int setup_ioapic_entry(int apic, int irq,
173 struct IO_APIC_route_entry *entry, 174 struct IO_APIC_route_entry *entry,
174 unsigned int destination, int trigger, 175 unsigned int destination, int trigger,
175 int polarity, int vector); 176 int polarity, int vector, int pin);
176extern void ioapic_write_entry(int apic, int pin, 177extern void ioapic_write_entry(int apic, int pin,
177 struct IO_APIC_route_entry e); 178 struct IO_APIC_route_entry e);
178#else /* !CONFIG_X86_IO_APIC */ 179#else /* !CONFIG_X86_IO_APIC */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb2196691..f38481bcd455 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
36extern void fixup_irqs(void); 36extern void fixup_irqs(void);
37#endif 37#endif
38 38
39extern void (*generic_interrupt_extension)(void);
39extern void init_IRQ(void); 40extern void init_IRQ(void);
40extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
41extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
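Together with the GENERIC_INTERRUPT_VECTOR and generic_interrupt() entries added below, this hook lets a platform driver claim the generic system vector by storing a handler in generic_interrupt_extension. A minimal sketch of the registration side, based only on the declaration above; the handler name and body are hypothetical, and it assumes the arch dispatcher calls the pointer when it is non-NULL:

/* Sketch only: hypothetical driver claiming the generic platform vector. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/irq.h>

static void example_platform_interrupt(void)
{
	/* acknowledge the platform device and do the real work here */
	pr_debug("generic platform vector handled\n");
}

static int __init example_platform_hook_init(void)
{
	generic_interrupt_extension = example_platform_interrupt;
	return 0;
}
early_initcall(example_platform_hook_init);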
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 20e1fd588dbf..0396760fccb8 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H 1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H 2#define _ASM_X86_IRQ_REMAPPING_H
3 3
4extern int x2apic;
5
6#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
7 5
8#endif /* _ASM_X86_IRQ_REMAPPING_H */ 6#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8a..3cbd79bbb47c 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
112#define LOCAL_PERF_VECTOR 0xee 112#define LOCAL_PERF_VECTOR 0xee
113 113
114/* 114/*
115 * Generic system vector for platform specific use
116 */
117#define GENERIC_INTERRUPT_VECTOR 0xed
118
119/*
115 * First APIC vector available to drivers: (vectors 0x30-0xee) we 120 * First APIC vector available to drivers: (vectors 0x30-0xee) we
116 * start at 0x31(0x41) to spread out vectors evenly between priority 121 * start at 0x31(0x41) to spread out vectors evenly between priority
117 * levels. (0x80 is the syscall vector) 122 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed30..317ff1703d0b 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
9# define PAGES_NR 4 9# define PAGES_NR 4
10#else 10#else
11# define PA_CONTROL_PAGE 0 11# define PA_CONTROL_PAGE 0
12# define PA_TABLE_PAGE 1 12# define VA_CONTROL_PAGE 1
13# define PAGES_NR 2 13# define PA_TABLE_PAGE 2
14# define PA_SWAP_PAGE 3
15# define PAGES_NR 4
14#endif 16#endif
15 17
16#ifdef CONFIG_X86_32
17# define KEXEC_CONTROL_CODE_MAX_SIZE 2048 18# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
18#endif
19 19
20#ifndef __ASSEMBLY__ 20#ifndef __ASSEMBLY__
21 21
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
136 unsigned int has_pae, 136 unsigned int has_pae,
137 unsigned int preserve_context); 137 unsigned int preserve_context);
138#else 138#else
139NORET_TYPE void 139unsigned long
140relocate_kernel(unsigned long indirection_page, 140relocate_kernel(unsigned long indirection_page,
141 unsigned long page_list, 141 unsigned long page_list,
142 unsigned long start_address) ATTRIB_NORET; 142 unsigned long start_address,
143 unsigned int preserve_context);
143#endif 144#endif
144 145
145#define ARCH_HAS_KIMAGE_ARCH 146#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26a..12d55e773eb6 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,14 +1,11 @@
1#ifndef _ASM_X86_LINKAGE_H 1#ifndef _ASM_X86_LINKAGE_H
2#define _ASM_X86_LINKAGE_H 2#define _ASM_X86_LINKAGE_H
3 3
4#include <linux/stringify.h>
5
4#undef notrace 6#undef notrace
5#define notrace __attribute__((no_instrument_function)) 7#define notrace __attribute__((no_instrument_function))
6 8
7#ifdef CONFIG_X86_64
8#define __ALIGN .p2align 4,,15
9#define __ALIGN_STR ".p2align 4,,15"
10#endif
11
12#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
13#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
14/* 11/*
@@ -50,16 +47,20 @@
50 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ 47 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
51 "g" (arg4), "g" (arg5), "g" (arg6)) 48 "g" (arg4), "g" (arg5), "g" (arg6))
52 49
53#endif 50#endif /* CONFIG_X86_32 */
51
52#ifdef __ASSEMBLY__
54 53
55#define GLOBAL(name) \ 54#define GLOBAL(name) \
56 .globl name; \ 55 .globl name; \
57 name: 56 name:
58 57
59#ifdef CONFIG_X86_ALIGNMENT_16 58#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
60#define __ALIGN .align 16,0x90 59#define __ALIGN .p2align 4, 0x90
61#define __ALIGN_STR ".align 16,0x90" 60#define __ALIGN_STR __stringify(__ALIGN)
62#endif 61#endif
63 62
63#endif /* __ASSEMBLY__ */
64
64#endif /* _ASM_X86_LINKAGE_H */ 65#endif /* _ASM_X86_LINKAGE_H */
65 66
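The unified __ALIGN pads function entry points to a 16-byte boundary on both 32-bit and 64-bit: .p2align 4 aligns to 2^4 bytes and the 0x90 fill is a NOP, which is what the old 32-bit spelling .align 16,0x90 did. A small standalone check of the directive; the symbol names are made up for illustration:

/* align_demo.c -- both labels should land on 16-byte boundaries, padded
 * with 0x90 (NOP).  Build: gcc -c align_demo.c && objdump -d align_demo.o
 */
__asm__(
"	.text\n"
"	.p2align 4, 0x90\n"
"	.globl demo_entry_a\n"
"demo_entry_a:\n"
"	ret\n"
"	.p2align 4, 0x90\n"
"	.globl demo_entry_b\n"
"demo_entry_b:\n"
"	ret\n"
);

int main(void)
{
	return 0;	/* nothing to execute; inspect the object file instead */
}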
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960b..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
11 */ 11 */
12 12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
14 16
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
90 92
91#include <asm/atomic.h> 93#include <asm/atomic.h>
92 94
95void mce_setup(struct mce *m);
93void mce_log(struct mce *m); 96void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce); 97DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96 99
100/*
101 * To support more than 128 banks we would first need to escape the predefined
102 * Linux-defined extended banks.
103 */
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105
97#ifdef CONFIG_X86_MCE_INTEL 106#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c); 107void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void);
109void cmci_reenable(void);
110void cmci_rediscover(int dying);
111void cmci_recheck(void);
99#else 112#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } 113static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
114static inline void cmci_clear(void) {}
115static inline void cmci_reenable(void) {}
116static inline void cmci_rediscover(int dying) {}
117static inline void cmci_recheck(void) {}
101#endif 118#endif
102 119
103#ifdef CONFIG_X86_MCE_AMD 120#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif 124#endif
108 125
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status); 126extern int mce_available(struct cpuinfo_x86 *c);
127
128void mce_log_therm_throt_event(__u64 status);
110 129
111extern atomic_t mce_entry; 130extern atomic_t mce_entry;
112 131
113extern void do_machine_check(struct pt_regs *, long); 132extern void do_machine_check(struct pt_regs *, long);
133
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
136
137enum mcp_flags {
138 MCP_TIMESTAMP = (1 << 0), /* log time stamp */
139 MCP_UC = (1 << 1), /* log uncorrected errors */
140};
141extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
142
114extern int mce_notify_user(void); 143extern int mce_notify_user(void);
115 144
116#endif /* !CONFIG_X86_32 */ 145#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
120#else 149#else
121#define mcheck_init(c) do { } while (0) 150#define mcheck_init(c) do { } while (0)
122#endif 151#endif
123extern void stop_mce(void); 152
124extern void restart_mce(void); 153extern void (*mce_threshold_vector)(void);
125 154
126#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
127#endif /* _ASM_X86_MCE_H */ 156#endif /* _ASM_X86_MCE_H */
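The two new MCG_CAP bits sit directly above MCG_CTL_P, and the low byte of MCG_CAP carries the bank count that MAX_NR_BANKS bounds. A standalone sketch of decoding a raw capability word; the sample value is invented, and in the kernel it would come from rdmsrl(MSR_IA32_MCG_CAP, cap):

/* mcg_cap_decode.c -- decode the MCG_CAP bits named above.
 * Build: cc -o mcg_cap_decode mcg_cap_decode.c
 */
#include <stdio.h>
#include <stdint.h>

#define MCG_BANKCNT_MASK 0xffULL	/* bank count lives in bits 0-7 */
#define MCG_CTL_P	(1ULL << 8)	/* MCG_CTL register available */
#define MCG_EXT_P	(1ULL << 9)	/* extended registers available */
#define MCG_CMCI_P	(1ULL << 10)	/* CMCI supported */

int main(void)
{
	uint64_t cap = 0x506ULL;	/* hypothetical MCG_CAP value */

	printf("banks:           %llu\n",
	       (unsigned long long)(cap & MCG_BANKCNT_MASK));
	printf("MCG_CTL present: %s\n", (cap & MCG_CTL_P)  ? "yes" : "no");
	printf("extended regs:   %s\n", (cap & MCG_EXT_P)  ? "yes" : "no");
	printf("CMCI supported:  %s\n", (cap & MCG_CMCI_P) ? "yes" : "no");
	return 0;
}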
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index 6706b3006f13..4cc48af23fef 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -47,6 +47,7 @@
47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ 48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
49 MSI_ADDR_DEST_ID_MASK) 49 MSI_ADDR_DEST_ID_MASK)
50#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
50 51
51#define MSI_ADDR_IR_EXT_INT (1 << 4) 52#define MSI_ADDR_IR_EXT_INT (1 << 4)
52#define MSI_ADDR_IR_SHV (1 << 3) 53#define MSI_ADDR_IR_SHV (1 << 3)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f4e505f286bc..ec41fc16c167 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,6 +81,11 @@
81#define MSR_IA32_MC0_ADDR 0x00000402 81#define MSR_IA32_MC0_ADDR 0x00000402
82#define MSR_IA32_MC0_MISC 0x00000403 82#define MSR_IA32_MC0_MISC 0x00000403
83 83
84/* These are consecutive and not in the normal 4er MCE bank block */
85#define MSR_IA32_MC0_CTL2 0x00000280
86#define CMCI_EN (1ULL << 30)
87#define CMCI_THRESHOLD_MASK 0xffffULL
88
84#define MSR_P6_PERFCTR0 0x000000c1 89#define MSR_P6_PERFCTR0 0x000000c1
85#define MSR_P6_PERFCTR1 0x000000c2 90#define MSR_P6_PERFCTR1 0x000000c2
86#define MSR_P6_EVNTSEL0 0x00000186 91#define MSR_P6_EVNTSEL0 0x00000186
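Because the CTL2 registers are consecutive, bank i's register is MSR_IA32_MC0_CTL2 + i; CMCI_EN turns on corrected-error signalling and the low threshold bits set how many corrected errors arm it. A standalone sketch of composing such a value; the bank number and threshold are illustrative, and the kernel would program it with wrmsrl():

/* cmci_ctl2.c -- compose a hypothetical IA32_MCi_CTL2 value.
 * Build: cc -o cmci_ctl2 cmci_ctl2.c
 */
#include <stdio.h>
#include <stdint.h>

#define MSR_IA32_MC0_CTL2	0x00000280
#define CMCI_EN			(1ULL << 30)
#define CMCI_THRESHOLD_MASK	0xffffULL

int main(void)
{
	int bank = 3;			/* illustrative bank number */
	uint64_t threshold = 1;		/* interrupt after one corrected error */
	uint32_t msr = MSR_IA32_MC0_CTL2 + bank;	/* CTL2 MSRs are consecutive */
	uint64_t val = CMCI_EN | (threshold & CMCI_THRESHOLD_MASK);

	printf("wrmsr 0x%x <- 0x%llx\n", msr, (unsigned long long)val);
	return 0;
}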
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f1e4a79a6e41..0f915ae649a7 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
39#define __VIRTUAL_MASK_SHIFT 32 39#define __VIRTUAL_MASK_SHIFT 32
40#endif /* CONFIG_X86_PAE */ 40#endif /* CONFIG_X86_PAE */
41 41
42/*
43 * Kernel image size is limited to 512 MB (see arch/x86/kernel/head_32.S)
44 */
45#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
46
42#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
43 48
44/* 49/*
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603c..826ad37006ab 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43struct pgprot;
44
45extern int page_is_ram(unsigned long pagenr); 43extern int page_is_ram(unsigned long pagenr);
46extern int devmem_is_allowed(unsigned long pagenr); 44extern int devmem_is_allowed(unsigned long pagenr);
47extern void map_devmem(unsigned long pfn, unsigned long size,
48 struct pgprot vma_prot);
49extern void unmap_devmem(unsigned long pfn, unsigned long size,
50 struct pgprot vma_prot);
51 45
52extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
53extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5cc9712..7727aa8b7dda 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -317,8 +317,6 @@ struct pv_mmu_ops {
317#if PAGETABLE_LEVELS >= 3 317#if PAGETABLE_LEVELS >= 3
318#ifdef CONFIG_X86_PAE 318#ifdef CONFIG_X86_PAE
319 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 319 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
320 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
321 pte_t *ptep, pte_t pte);
322 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, 320 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
323 pte_t *ptep); 321 pte_t *ptep);
324 void (*pmd_clear)(pmd_t *pmdp); 322 void (*pmd_clear)(pmd_t *pmdp);
@@ -389,7 +387,7 @@ extern struct pv_lock_ops pv_lock_ops;
389 387
390#define paravirt_type(op) \ 388#define paravirt_type(op) \
391 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ 389 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
392 [paravirt_opptr] "m" (op) 390 [paravirt_opptr] "i" (&(op))
393#define paravirt_clobber(clobber) \ 391#define paravirt_clobber(clobber) \
394 [paravirt_clobber] "i" (clobber) 392 [paravirt_clobber] "i" (clobber)
395 393
@@ -443,7 +441,7 @@ int paravirt_disable_iospace(void);
443 * offset into the paravirt_patch_template structure, and can therefore be 441 * offset into the paravirt_patch_template structure, and can therefore be
444 * freely converted back into a structure offset. 442 * freely converted back into a structure offset.
445 */ 443 */
446#define PARAVIRT_CALL "call *%[paravirt_opptr];" 444#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
447 445
448/* 446/*
449 * These macros are intended to wrap calls through one of the paravirt 447 * These macros are intended to wrap calls through one of the paravirt
@@ -1365,13 +1363,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1365 pte.pte, pte.pte >> 32); 1363 pte.pte, pte.pte >> 32);
1366} 1364}
1367 1365
1368static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1369 pte_t *ptep, pte_t pte)
1370{
1371 /* 5 arg words */
1372 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
1373}
1374
1375static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 1366static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1376 pte_t *ptep) 1367 pte_t *ptep)
1377{ 1368{
@@ -1388,12 +1379,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1388 set_pte(ptep, pte); 1379 set_pte(ptep, pte);
1389} 1380}
1390 1381
1391static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1392 pte_t *ptep, pte_t pte)
1393{
1394 set_pte(ptep, pte);
1395}
1396
1397static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 1382static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1398 pte_t *ptep) 1383 pte_t *ptep)
1399{ 1384{
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838e..2cd07b9422f4 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAT_H 2#define _ASM_X86_PAT_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/pgtable_types.h>
5 6
6#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
7extern int pat_enabled; 8extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
17 18
18extern int kernel_map_sync_memtype(u64 base, unsigned long size, 19extern int kernel_map_sync_memtype(u64 base, unsigned long size,
19 unsigned long flag); 20 unsigned long flag);
21extern void map_devmem(unsigned long pfn, unsigned long size,
22 struct pgprot vma_prot);
23extern void unmap_devmem(unsigned long pfn, unsigned long size,
24 struct pgprot vma_prot);
20 25
21#endif /* _ASM_X86_PAT_H */ 26#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index c1774ac9da7a..2334982b339e 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -26,13 +26,6 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
26 native_set_pte(ptep, pte); 26 native_set_pte(ptep, pte);
27} 27}
28 28
29static inline void native_set_pte_present(struct mm_struct *mm,
30 unsigned long addr,
31 pte_t *ptep, pte_t pte)
32{
33 native_set_pte(ptep, pte);
34}
35
36static inline void native_pmd_clear(pmd_t *pmdp) 29static inline void native_pmd_clear(pmd_t *pmdp)
37{ 30{
38 native_set_pmd(pmdp, __pmd(0)); 31 native_set_pmd(pmdp, __pmd(0));
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 3f13cdf61156..177b0165ea01 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,23 +31,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
31 ptep->pte_low = pte.pte_low; 31 ptep->pte_low = pte.pte_low;
32} 32}
33 33
34/*
35 * Since this is only called on user PTEs, and the page fault handler
36 * must handle the already racy situation of simultaneous page faults,
37 * we are justified in merely clearing the PTE present bit, followed
38 * by a set. The ordering here is important.
39 */
40static inline void native_set_pte_present(struct mm_struct *mm,
41 unsigned long addr,
42 pte_t *ptep, pte_t pte)
43{
44 ptep->pte_low = 0;
45 smp_wmb();
46 ptep->pte_high = pte.pte_high;
47 smp_wmb();
48 ptep->pte_low = pte.pte_low;
49}
50
51static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 34static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
52{ 35{
53 set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); 36 set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d0812e155f1d..29d96d168bc0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,8 +31,6 @@ extern struct list_head pgd_list;
31#define set_pte(ptep, pte) native_set_pte(ptep, pte) 31#define set_pte(ptep, pte) native_set_pte(ptep, pte)
32#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 32#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
33 33
34#define set_pte_present(mm, addr, ptep, pte) \
35 native_set_pte_present(mm, addr, ptep, pte)
36#define set_pte_atomic(ptep, pte) \ 34#define set_pte_atomic(ptep, pte) \
37 native_set_pte_atomic(ptep, pte) 35 native_set_pte_atomic(ptep, pte)
38 36
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 97612fc7632f..31bd120cf2a2 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
42 */ 42 */
43#undef TEST_ACCESS_OK 43#undef TEST_ACCESS_OK
44 44
45/* The boot page tables (all created as a single array) */
46extern unsigned long pg0[];
47
48#ifdef CONFIG_X86_PAE 45#ifdef CONFIG_X86_PAE
49# include <asm/pgtable-3level.h> 46# include <asm/pgtable-3level.h>
50#else 47#else
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe04..2733fad45f98 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
25 * area for the same reason. ;) 25 * area for the same reason. ;)
26 */ 26 */
27#define VMALLOC_OFFSET (8 * 1024 * 1024) 27#define VMALLOC_OFFSET (8 * 1024 * 1024)
28
29#ifndef __ASSEMBLER__
30extern bool __vmalloc_start_set; /* set once high_memory is set */
31#endif
32
28#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) 33#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
29#ifdef CONFIG_X86_PAE 34#ifdef CONFIG_X86_PAE
30#define LAST_PKMAP 512 35#define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..b8238dc8786d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
273 273
274extern pteval_t __supported_pte_mask; 274extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 275extern int nx_enabled;
276extern void set_nx(void);
276 277
277#define pgprot_writecombine pgprot_writecombine 278#define pgprot_writecombine pgprot_writecombine
278extern pgprot_t pgprot_writecombine(pgprot_t prot); 279extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 76139506c3e4..ae85a8d66a30 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,9 +75,9 @@ struct cpuinfo_x86 {
75#else 75#else
76 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ 76 /* Number of 4K pages in DTLB/ITLB combined(in pages): */
77 int x86_tlbsize; 77 int x86_tlbsize;
78#endif
78 __u8 x86_virt_bits; 79 __u8 x86_virt_bits;
79 __u8 x86_phys_bits; 80 __u8 x86_phys_bits;
80#endif
81 /* CPUID returned core id bits: */ 81 /* CPUID returned core id bits: */
82 __u8 x86_coreid_bits; 82 __u8 x86_coreid_bits;
83 /* Max extended CPUID function supported: */ 83 /* Max extended CPUID function supported: */
@@ -391,6 +391,9 @@ DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
391DECLARE_INIT_PER_CPU(irq_stack_union); 391DECLARE_INIT_PER_CPU(irq_stack_union);
392 392
393DECLARE_PER_CPU(char *, irq_stack_ptr); 393DECLARE_PER_CPU(char *, irq_stack_ptr);
394DECLARE_PER_CPU(unsigned int, irq_count);
395extern unsigned long kernel_eflags;
396extern asmlinkage void ignore_sysret(void);
394#else /* X86_64 */ 397#else /* X86_64 */
395#ifdef CONFIG_CC_STACKPROTECTOR 398#ifdef CONFIG_CC_STACKPROTECTOR
396DECLARE_PER_CPU(unsigned long, stack_canary); 399DECLARE_PER_CPU(unsigned long, stack_canary);
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 2b8c5160388f..1b7ee5d673c2 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -1 +1,8 @@
1#ifndef _ASM_X86_SECTIONS_H
2#define _ASM_X86_SECTIONS_H
3
1#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5
6extern char __brk_base[], __brk_limit[];
7
8#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 05c6f6b11fd5..bdc2ada05ae0 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -64,7 +64,7 @@ extern void x86_quirk_time_init(void);
64#include <asm/bootparam.h> 64#include <asm/bootparam.h>
65 65
66/* Interrupt control for vSMPowered x86_64 systems */ 66/* Interrupt control for vSMPowered x86_64 systems */
67#ifdef CONFIG_X86_VSMP 67#ifdef CONFIG_X86_64
68void vsmp_init(void); 68void vsmp_init(void);
69#else 69#else
70static inline void vsmp_init(void) { } 70static inline void vsmp_init(void) { }
@@ -100,20 +100,51 @@ extern struct boot_params boot_params;
100 */ 100 */
101#define LOWMEMSIZE() (0x9f000) 101#define LOWMEMSIZE() (0x9f000)
102 102
103/* exceedingly early brk-like allocator */
104extern unsigned long _brk_end;
105void *extend_brk(size_t size, size_t align);
106
107/*
108 * Reserve space in the brk section. The name must be unique within
109 * the file, and somewhat descriptive. The size is in bytes. Must be
110 * used at file scope.
111 *
112 * (This uses a temp function to wrap the asm so we can pass it the
113 * size parameter; otherwise we wouldn't be able to. We can't use a
114 * "section" attribute on a normal variable because it always ends up
115 * being @progbits, which ends up allocating space in the vmlinux
116 * executable.)
117 */
118#define RESERVE_BRK(name,sz) \
119 static void __section(.discard) __used \
120 __brk_reservation_fn_##name##__(void) { \
121 asm volatile ( \
122 ".pushsection .brk_reservation,\"aw\",@nobits;" \
123 ".brk." #name ":" \
124 " 1:.skip %c0;" \
125 " .size .brk." #name ", . - 1b;" \
126 " .popsection" \
127 : : "i" (sz)); \
128 }
129
103#ifdef __i386__ 130#ifdef __i386__
104 131
105void __init i386_start_kernel(void); 132void __init i386_start_kernel(void);
106extern void probe_roms(void); 133extern void probe_roms(void);
107 134
108extern unsigned long init_pg_tables_start;
109extern unsigned long init_pg_tables_end;
110
111#else 135#else
112void __init x86_64_start_kernel(char *real_mode); 136void __init x86_64_start_kernel(char *real_mode);
113void __init x86_64_start_reservations(char *real_mode_data); 137void __init x86_64_start_reservations(char *real_mode_data);
114 138
115#endif /* __i386__ */ 139#endif /* __i386__ */
116#endif /* _SETUP */ 140#endif /* _SETUP */
141#else
142#define RESERVE_BRK(name,sz) \
143 .pushsection .brk_reservation,"aw",@nobits; \
144.brk.name: \
1451: .skip sz; \
146 .size .brk.name,.-1b; \
147 .popsection
117#endif /* __ASSEMBLY__ */ 148#endif /* __ASSEMBLY__ */
118#endif /* __KERNEL__ */ 149#endif /* __KERNEL__ */
119 150
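RESERVE_BRK() grows the .brk_reservation section at build time and extend_brk() hands out aligned chunks of that space during very early boot, before the normal allocators are up. A hedged sketch of the intended pairing, based only on the declarations above; the mptable_copy name and the sizes are illustrative:

/* Sketch only: reserve 4 KB of brk space at file scope, then carve an
 * aligned buffer out of it from early setup code.  Names and sizes are
 * hypothetical.
 */
#include <linux/init.h>
#include <asm/setup.h>

RESERVE_BRK(mptable_copy, 4096);

static void * __init alloc_early_table_buffer(void)
{
	/* 16-byte aligned allocation out of the reserved brk area */
	return extend_brk(4096, 16);
}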
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c1..9f4dfba33b28 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ 199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ 200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
201 201
202/* Loop through all installed blades */
203#define for_each_possible_blade(bid) \
204 for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
205
202/* 206/*
203 * Macros for converting between kernel virtual addresses, socket local physical 207 * Macros for converting between kernel virtual addresses, socket local physical
204 * addresses, and UV global physical addresses. 208 * addresses, and UV global physical addresses.
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5e79ca694326..9c371e4a9fa6 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg)
296static inline int 296static inline int
297HYPERVISOR_update_descriptor(u64 ma, u64 desc) 297HYPERVISOR_update_descriptor(u64 ma, u64 desc)
298{ 298{
299 if (sizeof(u64) == sizeof(long))
300 return _hypercall2(int, update_descriptor, ma, desc);
299 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); 301 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
300} 302}
301 303
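The 64-bit path can pass ma and desc directly (a u64 fits in a machine word), while the 32-bit path still splits each into low/high halves for _hypercall4. A standalone sketch of that split outside the hypervisor interface; the values and the fake_hypercall4 stand-in are illustrative:

/* split_u64.c -- show the high/low split used by the 32-bit hypercall path.
 * Build: cc -o split_u64 split_u64.c
 */
#include <stdio.h>
#include <stdint.h>

static void fake_hypercall4(uint32_t ma_lo, uint32_t ma_hi,
			    uint32_t desc_lo, uint32_t desc_hi)
{
	/* stand-in for _hypercall4(int, update_descriptor, ...) */
	printf("4-arg call: %#x %#x %#x %#x\n", ma_lo, ma_hi, desc_lo, desc_hi);
}

int main(void)
{
	uint64_t ma   = 0x123456789abcdef0ULL;	/* illustrative machine address */
	uint64_t desc = 0x00cf9a000000ffffULL;	/* illustrative descriptor */

	if (sizeof(uint64_t) == sizeof(long)) {
		/* 64-bit: both values fit in registers, two arguments suffice */
		printf("2-arg call: %#llx %#llx\n",
		       (unsigned long long)ma, (unsigned long long)desc);
	} else {
		/* 32-bit: pass each 64-bit quantity as two 32-bit halves */
		fake_hypercall4((uint32_t)ma, (uint32_t)(ma >> 32),
				(uint32_t)desc, (uint32_t)(desc >> 32));
	}
	return 0;
}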
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216bbfaf1..6e9c1f320acf 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -70,7 +70,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
70obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 70obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
71obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 71obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
72obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 72obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
73obj-$(CONFIG_X86_VSMP) += vsmp_64.o
74obj-$(CONFIG_KPROBES) += kprobes.o 73obj-$(CONFIG_KPROBES) += kprobes.o
75obj-$(CONFIG_MODULES) += module_$(BITS).o 74obj-$(CONFIG_MODULES) += module_$(BITS).o
76obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 75obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -111,7 +110,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
111### 110###
112# 64 bit specific files 111# 64 bit specific files
113ifeq ($(CONFIG_X86_64),y) 112ifeq ($(CONFIG_X86_64),y)
114 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o 113 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
115 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 114 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
116 obj-$(CONFIG_AUDIT) += audit_64.o 115 obj-$(CONFIG_AUDIT) += audit_64.o
117 116
@@ -120,4 +119,5 @@ ifeq ($(CONFIG_X86_64),y)
120 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 119 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
121 120
122 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 121 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
122 obj-y += vsmp_64.o
123endif 123endif
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d52..4c80f1557433 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
414 that might execute the to be patched code. 414 that might execute the to be patched code.
415 Other CPUs are not running. */ 415 Other CPUs are not running. */
416 stop_nmi(); 416 stop_nmi();
417#ifdef CONFIG_X86_MCE 417
418 stop_mce(); 418 /*
419#endif 419 * Don't stop machine check exceptions while patching.
420 * MCEs only happen when something got corrupted and in this
421 * case we must do something about the corruption.
422 * Ignoring it is worse than an unlikely patching race.
423 * Also machine checks tend to be broadcast and if one CPU
424 * goes into machine check the others follow quickly, so we don't
425 * expect a machine check to cause undue problems during code
426 * patching.
427 */
420 428
421 apply_alternatives(__alt_instructions, __alt_instructions_end); 429 apply_alternatives(__alt_instructions, __alt_instructions_end);
422 430
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
456 (unsigned long)__smp_locks_end); 464 (unsigned long)__smp_locks_end);
457 465
458 restart_nmi(); 466 restart_nmi();
459#ifdef CONFIG_X86_MCE
460 restart_mce();
461#endif
462} 467}
463 468
464/** 469/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c5..85eb8e100818 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/mtrr.h> 47#include <asm/mtrr.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/mce.h>
49 50
50unsigned int num_processors; 51unsigned int num_processors;
51 52
@@ -808,7 +809,7 @@ void clear_local_APIC(void)
808 u32 v; 809 u32 v;
809 810
810 /* APIC hasn't been mapped yet */ 811 /* APIC hasn't been mapped yet */
811 if (!apic_phys) 812 if (!x2apic && !apic_phys)
812 return; 813 return;
813 814
814 maxlvt = lapic_get_maxlvt(); 815 maxlvt = lapic_get_maxlvt();
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
842 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 843 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
843 } 844 }
844#endif 845#endif
846#ifdef CONFIG_X86_MCE_INTEL
847 if (maxlvt >= 6) {
848 v = apic_read(APIC_LVTCMCI);
849 if (!(v & APIC_LVT_MASKED))
850 apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
851 }
852#endif
853
845 /* 854 /*
846 * Clean APIC state for other OSs: 855 * Clean APIC state for other OSs:
847 */ 856 */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
1241 apic_write(APIC_LVT1, value); 1250 apic_write(APIC_LVT1, value);
1242 1251
1243 preempt_enable(); 1252 preempt_enable();
1253
1254#ifdef CONFIG_X86_MCE_INTEL
1255 /* Recheck CMCI information after local APIC is up on CPU #0 */
1256 if (smp_processor_id() == 0)
1257 cmci_recheck();
1258#endif
1244} 1259}
1245 1260
1246void __cpuinit end_local_APIC_setup(void) 1261void __cpuinit end_local_APIC_setup(void)
@@ -1319,15 +1334,16 @@ void __init enable_IR_x2apic(void)
1319 return; 1334 return;
1320 } 1335 }
1321 1336
1322 local_irq_save(flags); 1337 ret = save_IO_APIC_setup();
1323 mask_8259A();
1324
1325 ret = save_mask_IO_APIC_setup();
1326 if (ret) { 1338 if (ret) {
1327 pr_info("Saving IO-APIC state failed: %d\n", ret); 1339 pr_info("Saving IO-APIC state failed: %d\n", ret);
1328 goto end; 1340 goto end;
1329 } 1341 }
1330 1342
1343 local_irq_save(flags);
1344 mask_IO_APIC_setup();
1345 mask_8259A();
1346
1331 ret = enable_intr_remapping(1); 1347 ret = enable_intr_remapping(1);
1332 1348
1333 if (ret && x2apic_preenabled) { 1349 if (ret && x2apic_preenabled) {
@@ -1352,10 +1368,10 @@ end_restore:
1352 else 1368 else
1353 reinit_intr_remapped_IO_APIC(x2apic_preenabled); 1369 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1354 1370
1355end:
1356 unmask_8259A(); 1371 unmask_8259A();
1357 local_irq_restore(flags); 1372 local_irq_restore(flags);
1358 1373
1374end:
1359 if (!ret) { 1375 if (!ret) {
1360 if (!x2apic_preenabled) 1376 if (!x2apic_preenabled)
1361 pr_info("Enabled x2apic and interrupt-remapping\n"); 1377 pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1508,12 +1524,10 @@ void __init early_init_lapic_mapping(void)
1508 */ 1524 */
1509void __init init_apic_mappings(void) 1525void __init init_apic_mappings(void)
1510{ 1526{
1511#ifdef CONFIG_X86_X2APIC
1512 if (x2apic) { 1527 if (x2apic) {
1513 boot_cpu_physical_apicid = read_apic_id(); 1528 boot_cpu_physical_apicid = read_apic_id();
1514 return; 1529 return;
1515 } 1530 }
1516#endif
1517 1531
1518 /* 1532 /*
1519 * If no local APIC can be found then set up a fake all 1533 * If no local APIC can be found then set up a fake all
@@ -1957,12 +1971,9 @@ static int lapic_resume(struct sys_device *dev)
1957 1971
1958 local_irq_save(flags); 1972 local_irq_save(flags);
1959 1973
1960#ifdef CONFIG_X86_X2APIC
1961 if (x2apic) 1974 if (x2apic)
1962 enable_x2apic(); 1975 enable_x2apic();
1963 else 1976 else {
1964#endif
1965 {
1966 /* 1977 /*
1967 * Make sure the APICBASE points to the right address 1978 * Make sure the APICBASE points to the right address
1968 * 1979 *
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f933822dba18..0014714ea97b 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -159,20 +159,6 @@ static int flat_apic_id_registered(void)
159 return physid_isset(read_xapic_id(), phys_cpu_present_map); 159 return physid_isset(read_xapic_id(), phys_cpu_present_map);
160} 160}
161 161
162static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
163{
164 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
165}
166
167static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
168 const struct cpumask *andmask)
169{
170 unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
171 unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
172
173 return mask1 & mask2;
174}
175
176static int flat_phys_pkg_id(int initial_apic_id, int index_msb) 162static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
177{ 163{
178 return hard_smp_processor_id() >> index_msb; 164 return hard_smp_processor_id() >> index_msb;
@@ -213,8 +199,8 @@ struct apic apic_flat = {
213 .set_apic_id = set_apic_id, 199 .set_apic_id = set_apic_id,
214 .apic_id_mask = 0xFFu << 24, 200 .apic_id_mask = 0xFFu << 24,
215 201
216 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 202 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
217 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, 203 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
218 204
219 .send_IPI_mask = flat_send_IPI_mask, 205 .send_IPI_mask = flat_send_IPI_mask,
220 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 206 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00e6071cefc4..da99ffcdfde6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -389,6 +389,8 @@ struct io_apic {
389 unsigned int index; 389 unsigned int index;
390 unsigned int unused[3]; 390 unsigned int unused[3];
391 unsigned int data; 391 unsigned int data;
392 unsigned int unused2[11];
393 unsigned int eoi;
392}; 394};
393 395
394static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 396static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
397 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 399 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
398} 400}
399 401
402static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
403{
404 struct io_apic __iomem *io_apic = io_apic_base(apic);
405 writel(vector, &io_apic->eoi);
406}
407
400static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 408static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
401{ 409{
402 struct io_apic __iomem *io_apic = io_apic_base(apic); 410 struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -546,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
546 554
547 apic = entry->apic; 555 apic = entry->apic;
548 pin = entry->pin; 556 pin = entry->pin;
549#ifdef CONFIG_INTR_REMAP
550 /* 557 /*
551 * With interrupt-remapping, destination information comes 558 * With interrupt-remapping, destination information comes
552 * from interrupt-remapping table entry. 559 * from interrupt-remapping table entry.
553 */ 560 */
554 if (!irq_remapped(irq)) 561 if (!irq_remapped(irq))
555 io_apic_write(apic, 0x11 + pin*2, dest); 562 io_apic_write(apic, 0x11 + pin*2, dest);
556#else
557 io_apic_write(apic, 0x11 + pin*2, dest);
558#endif
559 reg = io_apic_read(apic, 0x10 + pin*2); 563 reg = io_apic_read(apic, 0x10 + pin*2);
560 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
561 reg |= vector; 565 reg |= vector;
@@ -588,10 +592,12 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
588 if (assign_irq_vector(irq, cfg, mask)) 592 if (assign_irq_vector(irq, cfg, mask))
589 return BAD_APICID; 593 return BAD_APICID;
590 594
591 cpumask_and(desc->affinity, cfg->domain, mask); 595 /* check that before desc->affinity gets updated */
592 set_extra_move_desc(desc, mask); 596 set_extra_move_desc(desc, mask);
593 597
594 return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask); 598 cpumask_copy(desc->affinity, mask);
599
600 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
595} 601}
596 602
597static void 603static void
@@ -849,9 +855,9 @@ __setup("pirq=", ioapic_pirq_setup);
849static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; 855static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
850 856
851/* 857/*
852 * Saves and masks all the unmasked IO-APIC RTE's 858 * Saves all the IO-APIC RTE's
853 */ 859 */
854int save_mask_IO_APIC_setup(void) 860int save_IO_APIC_setup(void)
855{ 861{
856 union IO_APIC_reg_01 reg_01; 862 union IO_APIC_reg_01 reg_01;
857 unsigned long flags; 863 unsigned long flags;
@@ -876,16 +882,9 @@ int save_mask_IO_APIC_setup(void)
876 } 882 }
877 883
878 for (apic = 0; apic < nr_ioapics; apic++) 884 for (apic = 0; apic < nr_ioapics; apic++)
879 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 885 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
880 struct IO_APIC_route_entry entry; 886 early_ioapic_entries[apic][pin] =
881
882 entry = early_ioapic_entries[apic][pin] =
883 ioapic_read_entry(apic, pin); 887 ioapic_read_entry(apic, pin);
884 if (!entry.mask) {
885 entry.mask = 1;
886 ioapic_write_entry(apic, pin, entry);
887 }
888 }
889 888
890 return 0; 889 return 0;
891 890
@@ -898,6 +897,25 @@ nomem:
898 return -ENOMEM; 897 return -ENOMEM;
899} 898}
900 899
900void mask_IO_APIC_setup(void)
901{
902 int apic, pin;
903
904 for (apic = 0; apic < nr_ioapics; apic++) {
905 if (!early_ioapic_entries[apic])
906 break;
907 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
908 struct IO_APIC_route_entry entry;
909
910 entry = early_ioapic_entries[apic][pin];
911 if (!entry.mask) {
912 entry.mask = 1;
913 ioapic_write_entry(apic, pin, entry);
914 }
915 }
916 }
917}
918
901void restore_IO_APIC_setup(void) 919void restore_IO_APIC_setup(void)
902{ 920{
903 int apic, pin; 921 int apic, pin;
@@ -1411,9 +1429,7 @@ void __setup_vector_irq(int cpu)
1411} 1429}
1412 1430
1413static struct irq_chip ioapic_chip; 1431static struct irq_chip ioapic_chip;
1414#ifdef CONFIG_INTR_REMAP
1415static struct irq_chip ir_ioapic_chip; 1432static struct irq_chip ir_ioapic_chip;
1416#endif
1417 1433
1418#define IOAPIC_AUTO -1 1434#define IOAPIC_AUTO -1
1419#define IOAPIC_EDGE 0 1435#define IOAPIC_EDGE 0
@@ -1452,7 +1468,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1452 else 1468 else
1453 desc->status &= ~IRQ_LEVEL; 1469 desc->status &= ~IRQ_LEVEL;
1454 1470
1455#ifdef CONFIG_INTR_REMAP
1456 if (irq_remapped(irq)) { 1471 if (irq_remapped(irq)) {
1457 desc->status |= IRQ_MOVE_PCNTXT; 1472 desc->status |= IRQ_MOVE_PCNTXT;
1458 if (trigger) 1473 if (trigger)
@@ -1464,7 +1479,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1464 handle_edge_irq, "edge"); 1479 handle_edge_irq, "edge");
1465 return; 1480 return;
1466 } 1481 }
1467#endif 1482
1468 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1483 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1469 trigger == IOAPIC_LEVEL) 1484 trigger == IOAPIC_LEVEL)
1470 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1485 set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1478,14 +1493,13 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1478int setup_ioapic_entry(int apic_id, int irq, 1493int setup_ioapic_entry(int apic_id, int irq,
1479 struct IO_APIC_route_entry *entry, 1494 struct IO_APIC_route_entry *entry,
1480 unsigned int destination, int trigger, 1495 unsigned int destination, int trigger,
1481 int polarity, int vector) 1496 int polarity, int vector, int pin)
1482{ 1497{
1483 /* 1498 /*
1484 * add it to the IO-APIC irq-routing table: 1499 * add it to the IO-APIC irq-routing table:
1485 */ 1500 */
1486 memset(entry,0,sizeof(*entry)); 1501 memset(entry,0,sizeof(*entry));
1487 1502
1488#ifdef CONFIG_INTR_REMAP
1489 if (intr_remapping_enabled) { 1503 if (intr_remapping_enabled) {
1490 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); 1504 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1491 struct irte irte; 1505 struct irte irte;
@@ -1504,7 +1518,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1504 1518
1505 irte.present = 1; 1519 irte.present = 1;
1506 irte.dst_mode = apic->irq_dest_mode; 1520 irte.dst_mode = apic->irq_dest_mode;
1507 irte.trigger_mode = trigger; 1521 /*
1522 * Trigger mode in the IRTE will always be edge, and the
1523 * actual level or edge trigger will be set up in the IO-APIC
1524 * RTE. This will help simplify level triggered irq migration.
1525 * For more details, see the comments above explaining IO-APIC
1526 * irq migration in the presence of interrupt-remapping.
1527 */
1528 irte.trigger_mode = 0;
1508 irte.dlvry_mode = apic->irq_delivery_mode; 1529 irte.dlvry_mode = apic->irq_delivery_mode;
1509 irte.vector = vector; 1530 irte.vector = vector;
1510 irte.dest_id = IRTE_DEST(destination); 1531 irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1536,21 @@ int setup_ioapic_entry(int apic_id, int irq,
1515 ir_entry->zero = 0; 1536 ir_entry->zero = 0;
1516 ir_entry->format = 1; 1537 ir_entry->format = 1;
1517 ir_entry->index = (index & 0x7fff); 1538 ir_entry->index = (index & 0x7fff);
1518 } else 1539 /*
1519#endif 1540 * IO-APIC RTE will be configured with virtual vector.
1520 { 1541 * irq handler will do the explicit EOI to the io-apic.
1542 */
1543 ir_entry->vector = pin;
1544 } else {
1521 entry->delivery_mode = apic->irq_delivery_mode; 1545 entry->delivery_mode = apic->irq_delivery_mode;
1522 entry->dest_mode = apic->irq_dest_mode; 1546 entry->dest_mode = apic->irq_dest_mode;
1523 entry->dest = destination; 1547 entry->dest = destination;
1548 entry->vector = vector;
1524 } 1549 }
1525 1550
1526 entry->mask = 0; /* enable IRQ */ 1551 entry->mask = 0; /* enable IRQ */
1527 entry->trigger = trigger; 1552 entry->trigger = trigger;
1528 entry->polarity = polarity; 1553 entry->polarity = polarity;
1529 entry->vector = vector;
1530 1554
1531 /* Mask level triggered irqs. 1555 /* Mask level triggered irqs.
1532 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1556 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1585,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1561 1585
1562 1586
1563 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1587 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
1564 dest, trigger, polarity, cfg->vector)) { 1588 dest, trigger, polarity, cfg->vector, pin)) {
1565 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1589 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1566 mp_ioapics[apic_id].apicid, pin); 1590 mp_ioapics[apic_id].apicid, pin);
1567 __clear_irq_vector(irq, cfg); 1591 __clear_irq_vector(irq, cfg);
@@ -1642,10 +1666,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1642{ 1666{
1643 struct IO_APIC_route_entry entry; 1667 struct IO_APIC_route_entry entry;
1644 1668
1645#ifdef CONFIG_INTR_REMAP
1646 if (intr_remapping_enabled) 1669 if (intr_remapping_enabled)
1647 return; 1670 return;
1648#endif
1649 1671
1650 memset(&entry, 0, sizeof(entry)); 1672 memset(&entry, 0, sizeof(entry));
1651 1673
@@ -2040,8 +2062,13 @@ void disable_IO_APIC(void)
2040 * If the i8259 is routed through an IOAPIC 2062 * If the i8259 is routed through an IOAPIC
2041 * Put that IOAPIC in virtual wire mode 2063 * Put that IOAPIC in virtual wire mode
2042 * so legacy interrupts can be delivered. 2064 * so legacy interrupts can be delivered.
2065 *
2066 * With interrupt-remapping, for now we will use virtual wire A mode,
2067 * as virtual wire B is a little more complex (we need to configure both
2068 * the IOAPIC RTE as well as the interrupt-remapping table entry).
2069 * As this gets called during crash dump, keep this simple for now.
2043 */ 2070 */
2044 if (ioapic_i8259.pin != -1) { 2071 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
2045 struct IO_APIC_route_entry entry; 2072 struct IO_APIC_route_entry entry;
2046 2073
2047 memset(&entry, 0, sizeof(entry)); 2074 memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2088,10 @@ void disable_IO_APIC(void)
2061 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); 2088 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2062 } 2089 }
2063 2090
2064 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 2091 /*
2092 * Use virtual wire A mode when interrupt remapping is enabled.
2093 */
2094 disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
2065} 2095}
2066 2096
2067#ifdef CONFIG_X86_32 2097#ifdef CONFIG_X86_32
@@ -2303,37 +2333,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
2303#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2304 2334
2305#ifdef CONFIG_INTR_REMAP 2335#ifdef CONFIG_INTR_REMAP
2306static void ir_irq_migration(struct work_struct *work);
2307
2308static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2309 2336
2310/* 2337/*
2311 * Migrate the IO-APIC irq in the presence of intr-remapping. 2338 * Migrate the IO-APIC irq in the presence of intr-remapping.
2312 * 2339 *
2313 * For edge triggered, irq migration is a simple atomic update(of vector 2340 * For both level and edge triggered, irq migration is a simple atomic
2314 * and cpu destination) of IRTE and flush the hardware cache. 2341 * update(of vector and cpu destination) of IRTE and flush the hardware cache.
2315 *
2316 * For level triggered, we need to modify the io-apic RTE aswell with the update
2317 * vector information, along with modifying IRTE with vector and destination.
2318 * So irq migration for level triggered is little bit more complex compared to
2319 * edge triggered migration. But the good news is, we use the same algorithm
2320 * for level triggered migration as we have today, only difference being,
2321 * we now initiate the irq migration from process context instead of the
2322 * interrupt context.
2323 * 2342 *
2324 * In future, when we do a directed EOI (combined with cpu EOI broadcast 2343 * For level triggered, we eliminate the io-apic RTE modification (with the
2325 * suppression) to the IO-APIC, level triggered irq migration will also be 2344 * updated vector information), by using a virtual vector (io-apic pin number).
2326 * as simple as edge triggered migration and we can do the irq migration 2345 * The real vector used to interrupt the cpu will come from
2327 * with a simple atomic update to IO-APIC RTE. 2346 * the interrupt-remapping table entry.
2328 */ 2347 */
2329static void 2348static void
2330migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2349migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2331{ 2350{
2332 struct irq_cfg *cfg; 2351 struct irq_cfg *cfg;
2333 struct irte irte; 2352 struct irte irte;
2334 int modify_ioapic_rte;
2335 unsigned int dest; 2353 unsigned int dest;
2336 unsigned long flags;
2337 unsigned int irq; 2354 unsigned int irq;
2338 2355
2339 if (!cpumask_intersects(mask, cpu_online_mask)) 2356 if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2351,13 +2368,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2351 2368
2352 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2369 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2353 2370
2354 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2355 if (modify_ioapic_rte) {
2356 spin_lock_irqsave(&ioapic_lock, flags);
2357 __target_IO_APIC_irq(irq, dest, cfg);
2358 spin_unlock_irqrestore(&ioapic_lock, flags);
2359 }
2360
2361 irte.vector = cfg->vector; 2371 irte.vector = cfg->vector;
2362 irte.dest_id = IRTE_DEST(dest); 2372 irte.dest_id = IRTE_DEST(dest);
2363 2373
@@ -2372,73 +2382,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2372 cpumask_copy(desc->affinity, mask); 2382 cpumask_copy(desc->affinity, mask);
2373} 2383}
2374 2384
2375static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2376{
2377 int ret = -1;
2378 struct irq_cfg *cfg = desc->chip_data;
2379
2380 mask_IO_APIC_irq_desc(desc);
2381
2382 if (io_apic_level_ack_pending(cfg)) {
2383 /*
2384 * Interrupt in progress. Migrating irq now will change the
2385 * vector information in the IO-APIC RTE and that will confuse
2386 * the EOI broadcast performed by cpu.
2387 * So, delay the irq migration to the next instance.
2388 */
2389 schedule_delayed_work(&ir_migration_work, 1);
2390 goto unmask;
2391 }
2392
2393 /* everthing is clear. we have right of way */
2394 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2395
2396 ret = 0;
2397 desc->status &= ~IRQ_MOVE_PENDING;
2398 cpumask_clear(desc->pending_mask);
2399
2400unmask:
2401 unmask_IO_APIC_irq_desc(desc);
2402
2403 return ret;
2404}
2405
2406static void ir_irq_migration(struct work_struct *work)
2407{
2408 unsigned int irq;
2409 struct irq_desc *desc;
2410
2411 for_each_irq_desc(irq, desc) {
2412 if (desc->status & IRQ_MOVE_PENDING) {
2413 unsigned long flags;
2414
2415 spin_lock_irqsave(&desc->lock, flags);
2416 if (!desc->chip->set_affinity ||
2417 !(desc->status & IRQ_MOVE_PENDING)) {
2418 desc->status &= ~IRQ_MOVE_PENDING;
2419 spin_unlock_irqrestore(&desc->lock, flags);
2420 continue;
2421 }
2422
2423 desc->chip->set_affinity(irq, desc->pending_mask);
2424 spin_unlock_irqrestore(&desc->lock, flags);
2425 }
2426 }
2427}
2428
2429/* 2385/*
2430 * Migrates the IRQ destination in the process context. 2386 * Migrates the IRQ destination in the process context.
2431 */ 2387 */
2432static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2388static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2433 const struct cpumask *mask) 2389 const struct cpumask *mask)
2434{ 2390{
2435 if (desc->status & IRQ_LEVEL) {
2436 desc->status |= IRQ_MOVE_PENDING;
2437 cpumask_copy(desc->pending_mask, mask);
2438 migrate_irq_remapped_level_desc(desc);
2439 return;
2440 }
2441
2442 migrate_ioapic_irq_desc(desc, mask); 2391 migrate_ioapic_irq_desc(desc, mask);
2443} 2392}
2444static void set_ir_ioapic_affinity_irq(unsigned int irq, 2393static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2448,6 +2397,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
2448 2397
2449 set_ir_ioapic_affinity_irq_desc(desc, mask); 2398 set_ir_ioapic_affinity_irq_desc(desc, mask);
2450} 2399}
2400#else
2401static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2402 const struct cpumask *mask)
2403{
2404}
2451#endif 2405#endif
2452 2406
2453asmlinkage void smp_irq_move_cleanup_interrupt(void) 2407asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2461,6 +2415,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2461 me = smp_processor_id(); 2415 me = smp_processor_id();
2462 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2416 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
2463 unsigned int irq; 2417 unsigned int irq;
2418 unsigned int irr;
2464 struct irq_desc *desc; 2419 struct irq_desc *desc;
2465 struct irq_cfg *cfg; 2420 struct irq_cfg *cfg;
2466 irq = __get_cpu_var(vector_irq)[vector]; 2421 irq = __get_cpu_var(vector_irq)[vector];
@@ -2480,6 +2435,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2480 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2435 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2481 goto unlock; 2436 goto unlock;
2482 2437
2438 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
2439 /*
2440 * Check if the vector that needs to be cleaned up is
2441 * registered at the cpu's IRR. If so, then this is not
2442 * the best time to clean it up. Let's clean it up in the
2443 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
2444 * to myself.
2445 */
2446 if (irr & (1 << (vector % 32))) {
2447 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2448 goto unlock;
2449 }
2483 __get_cpu_var(vector_irq)[vector] = -1; 2450 __get_cpu_var(vector_irq)[vector] = -1;
2484 cfg->move_cleanup_count--; 2451 cfg->move_cleanup_count--;
2485unlock: 2452unlock:
@@ -2529,9 +2496,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
2529#endif 2496#endif
2530 2497
2531#ifdef CONFIG_INTR_REMAP 2498#ifdef CONFIG_INTR_REMAP
2499static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2500{
2501 int apic, pin;
2502 struct irq_pin_list *entry;
2503
2504 entry = cfg->irq_2_pin;
2505 for (;;) {
2506
2507 if (!entry)
2508 break;
2509
2510 apic = entry->apic;
2511 pin = entry->pin;
2512 io_apic_eoi(apic, pin);
2513 entry = entry->next;
2514 }
2515}
2516
2517static void
2518eoi_ioapic_irq(struct irq_desc *desc)
2519{
2520 struct irq_cfg *cfg;
2521 unsigned long flags;
2522 unsigned int irq;
2523
2524 irq = desc->irq;
2525 cfg = desc->chip_data;
2526
2527 spin_lock_irqsave(&ioapic_lock, flags);
2528 __eoi_ioapic_irq(irq, cfg);
2529 spin_unlock_irqrestore(&ioapic_lock, flags);
2530}
2531
2532static void ack_x2apic_level(unsigned int irq) 2532static void ack_x2apic_level(unsigned int irq)
2533{ 2533{
2534 struct irq_desc *desc = irq_to_desc(irq);
2534 ack_x2APIC_irq(); 2535 ack_x2APIC_irq();
2536 eoi_ioapic_irq(desc);
2535} 2537}
2536 2538
2537static void ack_x2apic_edge(unsigned int irq) 2539static void ack_x2apic_edge(unsigned int irq)
@@ -2662,20 +2664,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
2662 .retrigger = ioapic_retrigger_irq, 2664 .retrigger = ioapic_retrigger_irq,
2663}; 2665};
2664 2666
2665#ifdef CONFIG_INTR_REMAP
2666static struct irq_chip ir_ioapic_chip __read_mostly = { 2667static struct irq_chip ir_ioapic_chip __read_mostly = {
2667 .name = "IR-IO-APIC", 2668 .name = "IR-IO-APIC",
2668 .startup = startup_ioapic_irq, 2669 .startup = startup_ioapic_irq,
2669 .mask = mask_IO_APIC_irq, 2670 .mask = mask_IO_APIC_irq,
2670 .unmask = unmask_IO_APIC_irq, 2671 .unmask = unmask_IO_APIC_irq,
2672#ifdef CONFIG_INTR_REMAP
2671 .ack = ack_x2apic_edge, 2673 .ack = ack_x2apic_edge,
2672 .eoi = ack_x2apic_level, 2674 .eoi = ack_x2apic_level,
2673#ifdef CONFIG_SMP 2675#ifdef CONFIG_SMP
2674 .set_affinity = set_ir_ioapic_affinity_irq, 2676 .set_affinity = set_ir_ioapic_affinity_irq,
2675#endif 2677#endif
2678#endif
2676 .retrigger = ioapic_retrigger_irq, 2679 .retrigger = ioapic_retrigger_irq,
2677}; 2680};
2678#endif
2679 2681
2680static inline void init_IO_APIC_traps(void) 2682static inline void init_IO_APIC_traps(void)
2681{ 2683{
@@ -2901,10 +2903,8 @@ static inline void __init check_timer(void)
2901 * 8259A. 2903 * 8259A.
2902 */ 2904 */
2903 if (pin1 == -1) { 2905 if (pin1 == -1) {
2904#ifdef CONFIG_INTR_REMAP
2905 if (intr_remapping_enabled) 2906 if (intr_remapping_enabled)
2906 panic("BIOS bug: timer not connected to IO-APIC"); 2907 panic("BIOS bug: timer not connected to IO-APIC");
2907#endif
2908 pin1 = pin2; 2908 pin1 = pin2;
2909 apic1 = apic2; 2909 apic1 = apic2;
2910 no_pin1 = 1; 2910 no_pin1 = 1;
@@ -2940,10 +2940,8 @@ static inline void __init check_timer(void)
2940 clear_IO_APIC_pin(0, pin1); 2940 clear_IO_APIC_pin(0, pin1);
2941 goto out; 2941 goto out;
2942 } 2942 }
2943#ifdef CONFIG_INTR_REMAP
2944 if (intr_remapping_enabled) 2943 if (intr_remapping_enabled)
2945 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2944 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2946#endif
2947 local_irq_disable(); 2945 local_irq_disable();
2948 clear_IO_APIC_pin(apic1, pin1); 2946 clear_IO_APIC_pin(apic1, pin1);
2949 if (!no_pin1) 2947 if (!no_pin1)
@@ -3237,9 +3235,7 @@ void destroy_irq(unsigned int irq)
3237 if (desc) 3235 if (desc)
3238 desc->chip_data = cfg; 3236 desc->chip_data = cfg;
3239 3237
3240#ifdef CONFIG_INTR_REMAP
3241 free_irte(irq); 3238 free_irte(irq);
3242#endif
3243 spin_lock_irqsave(&vector_lock, flags); 3239 spin_lock_irqsave(&vector_lock, flags);
3244 __clear_irq_vector(irq, cfg); 3240 __clear_irq_vector(irq, cfg);
3245 spin_unlock_irqrestore(&vector_lock, flags); 3241 spin_unlock_irqrestore(&vector_lock, flags);
@@ -3265,7 +3261,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3265 3261
3266 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3262 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3267 3263
3268#ifdef CONFIG_INTR_REMAP
3269 if (irq_remapped(irq)) { 3264 if (irq_remapped(irq)) {
3270 struct irte irte; 3265 struct irte irte;
3271 int ir_index; 3266 int ir_index;
@@ -3291,10 +3286,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3291 MSI_ADDR_IR_SHV | 3286 MSI_ADDR_IR_SHV |
3292 MSI_ADDR_IR_INDEX1(ir_index) | 3287 MSI_ADDR_IR_INDEX1(ir_index) |
3293 MSI_ADDR_IR_INDEX2(ir_index); 3288 MSI_ADDR_IR_INDEX2(ir_index);
3294 } else 3289 } else {
3295#endif 3290 if (x2apic_enabled())
3296 { 3291 msg->address_hi = MSI_ADDR_BASE_HI |
3297 msg->address_hi = MSI_ADDR_BASE_HI; 3292 MSI_ADDR_EXT_DEST_ID(dest);
3293 else
3294 msg->address_hi = MSI_ADDR_BASE_HI;
3295
3298 msg->address_lo = 3296 msg->address_lo =
3299 MSI_ADDR_BASE_LO | 3297 MSI_ADDR_BASE_LO |
3300 ((apic->irq_dest_mode == 0) ? 3298 ((apic->irq_dest_mode == 0) ?
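With interrupt remapping compiled in but not active for an irq, the message is now built in the plain MSI format, and the x2apic_enabled() branch stashes the APIC ID bits that do not fit the classic 8-bit destination field into address_hi via MSI_ADDR_EXT_DEST_ID(). A rough sketch of that split; the DEST_ID_LO/EXT_DEST_ID_HI macros below are invented for illustration, the real bit positions come from asm/msidef.h.

    #include <stdio.h>
    #include <stdint.h>

    #define MSI_ADDR_BASE_HI   0x00000000u
    #define MSI_ADDR_BASE_LO   0xfee00000u

    /*
     * Invented layout for illustration only: the low 8 APIC ID bits sit in
     * the classic destination field of address_lo, the remaining bits ride
     * in address_hi. The real encoding is defined in asm/msidef.h.
     */
    #define DEST_ID_LO(d)      (((d) & 0xffu) << 12)
    #define EXT_DEST_ID_HI(d)  ((d) & 0xffffff00u)

    struct msi_msg {
            uint32_t address_hi;
            uint32_t address_lo;
    };

    static void compose(struct msi_msg *msg, uint32_t dest, int x2apic)
    {
            msg->address_lo = MSI_ADDR_BASE_LO | DEST_ID_LO(dest);
            if (x2apic)
                    msg->address_hi = MSI_ADDR_BASE_HI | EXT_DEST_ID_HI(dest);
            else
                    msg->address_hi = MSI_ADDR_BASE_HI;
    }

    int main(void)
    {
            struct msi_msg msg;

            compose(&msg, 0x321, 1);        /* an APIC ID wider than 8 bits */
            printf("address_hi=%08x address_lo=%08x\n",
                   msg.address_hi, msg.address_lo);
            return 0;
    }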
@@ -3394,15 +3392,16 @@ static struct irq_chip msi_chip = {
3394 .retrigger = ioapic_retrigger_irq, 3392 .retrigger = ioapic_retrigger_irq,
3395}; 3393};
3396 3394
3397#ifdef CONFIG_INTR_REMAP
3398static struct irq_chip msi_ir_chip = { 3395static struct irq_chip msi_ir_chip = {
3399 .name = "IR-PCI-MSI", 3396 .name = "IR-PCI-MSI",
3400 .unmask = unmask_msi_irq, 3397 .unmask = unmask_msi_irq,
3401 .mask = mask_msi_irq, 3398 .mask = mask_msi_irq,
3399#ifdef CONFIG_INTR_REMAP
3402 .ack = ack_x2apic_edge, 3400 .ack = ack_x2apic_edge,
3403#ifdef CONFIG_SMP 3401#ifdef CONFIG_SMP
3404 .set_affinity = ir_set_msi_irq_affinity, 3402 .set_affinity = ir_set_msi_irq_affinity,
3405#endif 3403#endif
3404#endif
3406 .retrigger = ioapic_retrigger_irq, 3405 .retrigger = ioapic_retrigger_irq,
3407}; 3406};
3408 3407
@@ -3432,7 +3431,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3432 } 3431 }
3433 return index; 3432 return index;
3434} 3433}
3435#endif
3436 3434
3437static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3435static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3438{ 3436{
@@ -3446,7 +3444,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3446 set_irq_msi(irq, msidesc); 3444 set_irq_msi(irq, msidesc);
3447 write_msi_msg(irq, &msg); 3445 write_msi_msg(irq, &msg);
3448 3446
3449#ifdef CONFIG_INTR_REMAP
3450 if (irq_remapped(irq)) { 3447 if (irq_remapped(irq)) {
3451 struct irq_desc *desc = irq_to_desc(irq); 3448 struct irq_desc *desc = irq_to_desc(irq);
3452 /* 3449 /*
@@ -3455,7 +3452,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3455 desc->status |= IRQ_MOVE_PCNTXT; 3452 desc->status |= IRQ_MOVE_PCNTXT;
3456 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3453 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3457 } else 3454 } else
3458#endif
3459 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3455 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3460 3456
3461 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3457 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3469,11 +3465,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3469 int ret, sub_handle; 3465 int ret, sub_handle;
3470 struct msi_desc *msidesc; 3466 struct msi_desc *msidesc;
3471 unsigned int irq_want; 3467 unsigned int irq_want;
3472 3468 struct intel_iommu *iommu = NULL;
3473#ifdef CONFIG_INTR_REMAP
3474 struct intel_iommu *iommu = 0;
3475 int index = 0; 3469 int index = 0;
3476#endif
3477 3470
3478 irq_want = nr_irqs_gsi; 3471 irq_want = nr_irqs_gsi;
3479 sub_handle = 0; 3472 sub_handle = 0;
@@ -3482,7 +3475,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3482 if (irq == 0) 3475 if (irq == 0)
3483 return -1; 3476 return -1;
3484 irq_want = irq + 1; 3477 irq_want = irq + 1;
3485#ifdef CONFIG_INTR_REMAP
3486 if (!intr_remapping_enabled) 3478 if (!intr_remapping_enabled)
3487 goto no_ir; 3479 goto no_ir;
3488 3480
@@ -3510,7 +3502,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3510 set_irte_irq(irq, iommu, index, sub_handle); 3502 set_irte_irq(irq, iommu, index, sub_handle);
3511 } 3503 }
3512no_ir: 3504no_ir:
3513#endif
3514 ret = setup_msi_irq(dev, msidesc, irq); 3505 ret = setup_msi_irq(dev, msidesc, irq);
3515 if (ret < 0) 3506 if (ret < 0)
3516 goto error; 3507 goto error;
@@ -3528,7 +3519,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3528 destroy_irq(irq); 3519 destroy_irq(irq);
3529} 3520}
3530 3521
3531#ifdef CONFIG_DMAR 3522#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3532#ifdef CONFIG_SMP 3523#ifdef CONFIG_SMP
3533static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3524static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3534{ 3525{
@@ -3609,7 +3600,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3609 3600
3610#endif /* CONFIG_SMP */ 3601#endif /* CONFIG_SMP */
3611 3602
3612struct irq_chip hpet_msi_type = { 3603static struct irq_chip hpet_msi_type = {
3613 .name = "HPET_MSI", 3604 .name = "HPET_MSI",
3614 .unmask = hpet_msi_unmask, 3605 .unmask = hpet_msi_unmask,
3615 .mask = hpet_msi_mask, 3606 .mask = hpet_msi_mask,
@@ -4045,11 +4036,9 @@ void __init setup_ioapic_dest(void)
4045 else 4036 else
4046 mask = apic->target_cpus(); 4037 mask = apic->target_cpus();
4047 4038
4048#ifdef CONFIG_INTR_REMAP
4049 if (intr_remapping_enabled) 4039 if (intr_remapping_enabled)
4050 set_ir_ioapic_affinity_irq_desc(desc, mask); 4040 set_ir_ioapic_affinity_irq_desc(desc, mask);
4051 else 4041 else
4052#endif
4053 set_ioapic_affinity_irq_desc(desc, mask); 4042 set_ioapic_affinity_irq_desc(desc, mask);
4054 } 4043 }
4055 4044
@@ -4142,9 +4131,12 @@ static int __init ioapic_insert_resources(void)
4142 struct resource *r = ioapic_resources; 4131 struct resource *r = ioapic_resources;
4143 4132
4144 if (!r) { 4133 if (!r) {
4145 printk(KERN_ERR 4134 if (nr_ioapics > 0) {
4146 "IO APIC resources could be not be allocated.\n"); 4135 printk(KERN_ERR
4147 return -1; 4136 "IO APIC resources couldn't be allocated.\n");
4137 return -1;
4138 }
4139 return 0;
4148 } 4140 }
4149 4141
4150 for (i = 0; i < nr_ioapics; i++) { 4142 for (i = 0; i < nr_ioapics; i++) {
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 8d7748efe6a8..1783652bb0e5 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -68,6 +68,13 @@ void __init default_setup_apic_routing(void)
68 apic = &apic_physflat; 68 apic = &apic_physflat;
69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
70 } 70 }
71
72 /*
73 * Now that apic routing model is selected, configure the
74 * fault handling for intr remapping.
75 */
76 if (intr_remapping_enabled)
77 enable_drhd_fault_handling();
71} 78}
72 79
73/* Same for both flat and physical. */ 80/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8fb87b6dd633..4a903e2f0d17 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
57 unsigned long query_cpu; 57 unsigned long query_cpu;
58 unsigned long flags; 58 unsigned long flags;
59 59
60 x2apic_wrmsr_fence();
61
60 local_irq_save(flags); 62 local_irq_save(flags);
61 for_each_cpu(query_cpu, mask) { 63 for_each_cpu(query_cpu, mask) {
62 __x2apic_send_IPI_dest( 64 __x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu == this_cpu) 82 if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
90 unsigned long query_cpu; 94 unsigned long query_cpu;
91 unsigned long flags; 95 unsigned long flags;
92 96
97 x2apic_wrmsr_fence();
98
93 local_irq_save(flags); 99 local_irq_save(flags);
94 for_each_online_cpu(query_cpu) { 100 for_each_online_cpu(query_cpu) {
95 if (query_cpu == this_cpu) 101 if (query_cpu == this_cpu)
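Each IPI path now starts with x2apic_wrmsr_fence(). In xAPIC mode the ICR is written through uncached MMIO, which keeps earlier stores ordered ahead of the IPI; in x2APIC mode the ICR write is a WRMSR, so an explicit fence is needed to make prior memory operations globally visible before the interrupt is sent. A sketch, assuming the helper is essentially an mfence (x86 inline asm, GCC):

    #include <stdio.h>

    /*
     * Assumed shape of x2apic_wrmsr_fence(): a full fence so earlier stores
     * are globally visible before the WRMSR that actually fires the IPI.
     */
    static inline void x2apic_wrmsr_fence(void)
    {
            asm volatile("mfence" : : : "memory");
    }

    static volatile int ipi_payload;

    static void send_ipi(int cpu, int vector)
    {
            /* stand-in for the WRMSR to the x2APIC ICR */
            printf("IPI -> cpu %d, vector 0x%x\n", cpu, vector);
    }

    int main(void)
    {
            ipi_payload = 42;        /* data the target CPU will look at */
            x2apic_wrmsr_fence();    /* make the store visible first */
            send_ipi(1, 0xf0);
            return 0;
    }

The same fence is added ahead of every send path in both the cluster and physical x2APIC drivers, as the hunks above and below show.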
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 23625b9f98b2..a284359627e7 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58 unsigned long query_cpu; 58 unsigned long query_cpu;
59 unsigned long flags; 59 unsigned long flags;
60 60
61 x2apic_wrmsr_fence();
62
61 local_irq_save(flags); 63 local_irq_save(flags);
62 for_each_cpu(query_cpu, mask) { 64 for_each_cpu(query_cpu, mask) {
63 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 65 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu != this_cpu) 82 if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
89 unsigned long query_cpu; 93 unsigned long query_cpu;
90 unsigned long flags; 94 unsigned long flags;
91 95
96 x2apic_wrmsr_fence();
97
92 local_irq_save(flags); 98 local_irq_save(flags);
93 for_each_online_cpu(query_cpu) { 99 for_each_online_cpu(query_cpu) {
94 if (query_cpu == this_cpu) 100 if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412a..fc999e6fc46a 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
83 u64 size; 83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85 85
86 if (addr == 0) 86 if (!(addr + 1))
87 break;
88
89 if (addr >= corruption_check_size)
87 break; 90 break;
88 91
89 if ((addr + size) > corruption_check_size) 92 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr; 93 size = corruption_check_size - addr;
91 94
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr; 96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 97 scan_areas[num_scan_areas].size = size;
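The loop's stop condition changes from addr == 0 to !(addr + 1), which implies find_e820_area_size() reports "nothing found" as ~0ULL rather than 0 (0 being a perfectly scannable physical address), and the empty-range test becomes an explicit addr >= corruption_check_size check before the size is clamped. A tiny demonstration of the sentinel test, assuming ~0ULL is indeed the failure value:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Assuming the area search reports "nothing found" as ~0ULL rather than
     * 0, since 0 is a valid physical address for the scan to start at.
     */
    #define NOT_FOUND (~0ULL)

    static int found(uint64_t addr)
    {
            return !!(addr + 1);    /* ~0ULL + 1 wraps to 0; anything else is a hit */
    }

    int main(void)
    {
            printf("addr = 0     -> found = %d\n", found(0));
            printf("addr = ~0ULL -> found = %d\n", found(NOT_FOUND));
            return 0;
    }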
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2de..4e242f9a06e4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,11 +14,12 @@ obj-y += vmware.o hypervisor.o
14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
15obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
16 16
17obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
18
17obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 19obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
18obj-$(CONFIG_CPU_SUP_AMD) += amd.o 20obj-$(CONFIG_CPU_SUP_AMD) += amd.o
19obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 21obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
20obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 22obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
21obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 25
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 6882a735d9c0..8220ae69849d 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
29 u32 regs[4]; 29 u32 regs[4];
30 const struct cpuid_bit *cb; 30 const struct cpuid_bit *cb;
31 31
32 static const struct cpuid_bit cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { 0, 0, 0, 0 } 34 { 0, 0, 0, 0 }
35 }; 35 };
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80ed..7e4a459daa64 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
10# include <asm/numa_64.h> 11# include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
141 } 142 }
142} 143}
143 144
145static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
146{
147#ifdef CONFIG_SMP
148 /* called from identify_secondary_cpu()? */
149 if (c->cpu_index == boot_cpu_id)
150 return;
151
152 /*
153 * Certain Athlons might work (for various values of 'work') in SMP
154 * but they are not certified as MP capable.
155 */
156 /* Athlon 660/661 is valid. */
157 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
158 (c->x86_mask == 1)))
159 goto valid_k7;
160
161 /* Duron 670 is valid */
162 if ((c->x86_model == 7) && (c->x86_mask == 0))
163 goto valid_k7;
164
165 /*
166 * Athlon 662, Duron 671, and Athlon >model 7 have the MP capability
167 * bit. It's worth noting that the A5 stepping (662) of some
168 * Athlon XPs have the MP bit set.
169 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
170 * more.
171 */
172 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
173 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
174 (c->x86_model > 7))
175 if (cpu_has_mp)
176 goto valid_k7;
177
178 /* If we get here, not a certified SMP capable AMD system. */
179
180 /*
181 * Don't taint if we are running SMP kernel on a single non-MP
182 * approved Athlon
183 */
184 WARN_ONCE(1, "WARNING: This combination of AMD"
185 "processors is not suitable for SMP.\n");
186 if (!test_taint(TAINT_UNSAFE_SMP))
187 add_taint(TAINT_UNSAFE_SMP);
188
189valid_k7:
190 ;
191#endif
192}
193
144static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 194static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
145{ 195{
146 u32 l, h; 196 u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
175 } 225 }
176 226
177 set_cpu_cap(c, X86_FEATURE_K7); 227 set_cpu_cap(c, X86_FEATURE_K7);
228
229 amd_k7_smp_check(c);
178} 230}
179#endif 231#endif
180 232
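amd_k7_smp_check() brings the old i386 MP-capability whitelist into the shared AMD code: only specific Athlon/Duron model/stepping combinations, or later parts that actually advertise the MP bit, are considered safe, and anything else taints the kernel with TAINT_UNSAFE_SMP. A standalone sketch of the same whitelist logic, taking the model, stepping and MP bit as plain parameters:

    #include <stdio.h>

    /*
     * Returns 1 if the (model, stepping, MP bit) combination is treated as
     * SMP-safe, mirroring the whitelist in amd_k7_smp_check().
     */
    static int k7_smp_ok(int model, int stepping, int has_mp)
    {
            /* Athlon 660/661 */
            if (model == 6 && (stepping == 0 || stepping == 1))
                    return 1;
            /* Duron 670 */
            if (model == 7 && stepping == 0)
                    return 1;
            /* Later parts must actually advertise the MP capability bit */
            if (((model == 6 && stepping >= 2) ||
                 (model == 7 && stepping >= 1) ||
                 (model > 7)) && has_mp)
                    return 1;
            return 0;
    }

    int main(void)
    {
            printf("Athlon 660:            %d\n", k7_smp_ok(6, 0, 0));
            printf("Athlon 662, no MP bit: %d\n", k7_smp_ok(6, 2, 0));
            printf("model 8 with MP bit:   %d\n", k7_smp_ok(8, 0, 1));
            return 0;
    }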
@@ -450,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
450} 502}
451#endif 503#endif
452 504
453static struct cpu_dev amd_cpu_dev __cpuinitdata = { 505static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
454 .c_vendor = "AMD", 506 .c_vendor = "AMD",
455 .c_ident = { "AuthenticAMD" }, 507 .c_ident = { "AuthenticAMD" },
456#ifdef CONFIG_X86_32 508#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc6..c95e831bb095 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
1#include <linux/bitops.h>
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/init.h> 3#include <linux/init.h>
3#include <linux/bitops.h>
4 4
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/msr.h>
7#include <asm/e820.h> 6#include <asm/e820.h>
8#include <asm/mtrr.h> 7#include <asm/mtrr.h>
8#include <asm/msr.h>
9 9
10#include "cpu.h" 10#include "cpu.h"
11 11
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
276 */ 276 */
277 c->x86_capability[5] = cpuid_edx(0xC0000001); 277 c->x86_capability[5] = cpuid_edx(0xC0000001);
278 } 278 }
279 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 9) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
288 /* Before Nehemiah, the C3's had 3dNOW! */ 288 /* Before Nehemiah, the C3's had 3dNOW! */
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291#endif
292 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
293 c->x86_cache_alignment = c->x86_clflush_size * 2;
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 }
291 296
292 display_cacheinfo(c); 297 display_cacheinfo(c);
293} 298}
@@ -316,16 +321,25 @@ enum {
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) 321static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{ 322{
318 switch (c->x86) { 323 switch (c->x86) {
324#ifdef CONFIG_X86_32
319 case 5: 325 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */ 326 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); 327 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break; 328 break;
329#endif
330 case 6:
331 if (c->x86_model >= 0xf)
332 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
333 break;
323 } 334 }
335#ifdef CONFIG_X86_64
336 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
337#endif
324} 338}
325 339
326static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 340static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
327{ 341{
328 342#ifdef CONFIG_X86_32
329 char *name; 343 char *name;
330 u32 fcr_set = 0; 344 u32 fcr_set = 0;
331 u32 fcr_clr = 0; 345 u32 fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
337 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 351 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
338 */ 352 */
339 clear_cpu_cap(c, 0*32+31); 353 clear_cpu_cap(c, 0*32+31);
340 354#endif
355 early_init_centaur(c);
341 switch (c->x86) { 356 switch (c->x86) {
357#ifdef CONFIG_X86_32
342 case 5: 358 case 5:
343 switch (c->x86_model) { 359 switch (c->x86_model) {
344 case 4: 360 case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
442 } 458 }
443 sprintf(c->x86_model_id, "WinChip %s", name); 459 sprintf(c->x86_model_id, "WinChip %s", name);
444 break; 460 break;
445 461#endif
446 case 6: 462 case 6:
447 init_c3(c); 463 init_c3(c);
448 break; 464 break;
449 } 465 }
466#ifdef CONFIG_X86_64
467 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
468#endif
450} 469}
451 470
452static unsigned int __cpuinit 471static unsigned int __cpuinit
453centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) 472centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
454{ 473{
474#ifdef CONFIG_X86_32
455 /* VIA C3 CPUs (670-68F) need further shifting. */ 475 /* VIA C3 CPUs (670-68F) need further shifting. */
456 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) 476 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
457 size >>= 8; 477 size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
464 if ((c->x86 == 6) && (c->x86_model == 9) && 484 if ((c->x86 == 6) && (c->x86_model == 9) &&
465 (c->x86_mask == 1) && (size == 65)) 485 (c->x86_mask == 1) && (size == 65))
466 size -= 1; 486 size -= 1;
467 487#endif
468 return size; 488 return size;
469} 489}
470 490
471static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 491static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
472 .c_vendor = "Centaur", 492 .c_vendor = "Centaur",
473 .c_ident = { "CentaurHauls" }, 493 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur, 494 .c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index a1625f5a1e78..000000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3
4#include <asm/cpufeature.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15}
16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{
19 early_init_centaur(c);
20
21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
22 c->x86_cache_alignment = c->x86_clflush_size * 2;
23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
24 }
25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
26}
27
28static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_vendor = "Centaur",
30 .c_ident = { "CentaurHauls" },
31 .c_early_init = early_init_centaur,
32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
34};
35
36cpu_dev_register(centaur_cpu_dev);
37
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 826d5c876278..e2962cc1e27b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,52 +1,52 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h> 1#include <linux/bootmem.h>
2#include <linux/linkage.h>
6#include <linux/bitops.h> 3#include <linux/bitops.h>
4#include <linux/kernel.h>
7#include <linux/module.h> 5#include <linux/module.h>
8#include <linux/kgdb.h> 6#include <linux/percpu.h>
9#include <linux/topology.h> 7#include <linux/string.h>
10#include <linux/delay.h> 8#include <linux/delay.h>
9#include <linux/sched.h>
10#include <linux/init.h>
11#include <linux/kgdb.h>
11#include <linux/smp.h> 12#include <linux/smp.h>
12#include <linux/percpu.h> 13#include <linux/io.h>
13#include <asm/i387.h> 14
14#include <asm/msr.h> 15#include <asm/stackprotector.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h> 16#include <asm/mmu_context.h>
17#include <asm/hypervisor.h>
18#include <asm/processor.h>
19#include <asm/sections.h>
20#include <asm/topology.h>
21#include <asm/cpumask.h>
22#include <asm/pgtable.h>
23#include <asm/atomic.h>
24#include <asm/proto.h>
25#include <asm/setup.h>
26#include <asm/apic.h>
27#include <asm/desc.h>
28#include <asm/i387.h>
18#include <asm/mtrr.h> 29#include <asm/mtrr.h>
30#include <asm/numa.h>
31#include <asm/asm.h>
32#include <asm/cpu.h>
19#include <asm/mce.h> 33#include <asm/mce.h>
34#include <asm/msr.h>
20#include <asm/pat.h> 35#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
23#include <asm/smp.h> 36#include <asm/smp.h>
24#include <asm/cpu.h>
25#include <asm/cpumask.h>
26#include <asm/apic.h>
27 37
28#ifdef CONFIG_X86_LOCAL_APIC 38#ifdef CONFIG_X86_LOCAL_APIC
29#include <asm/uv/uv.h> 39#include <asm/uv/uv.h>
30#endif 40#endif
31 41
32#include <asm/pgtable.h>
33#include <asm/processor.h>
34#include <asm/desc.h>
35#include <asm/atomic.h>
36#include <asm/proto.h>
37#include <asm/sections.h>
38#include <asm/setup.h>
39#include <asm/hypervisor.h>
40#include <asm/stackprotector.h>
41
42#include "cpu.h" 42#include "cpu.h"
43 43
44#ifdef CONFIG_X86_64 44#ifdef CONFIG_X86_64
45 45
46/* all of these masks are initialized in setup_cpu_local_masks() */ 46/* all of these masks are initialized in setup_cpu_local_masks() */
47cpumask_var_t cpu_callin_mask;
48cpumask_var_t cpu_callout_mask;
49cpumask_var_t cpu_initialized_mask; 47cpumask_var_t cpu_initialized_mask;
48cpumask_var_t cpu_callout_mask;
49cpumask_var_t cpu_callin_mask;
50 50
51/* representing cpus for which sibling maps can be computed */ 51/* representing cpus for which sibling maps can be computed */
52cpumask_var_t cpu_sibling_setup_mask; 52cpumask_var_t cpu_sibling_setup_mask;
@@ -62,15 +62,15 @@ void __init setup_cpu_local_masks(void)
62 62
63#else /* CONFIG_X86_32 */ 63#else /* CONFIG_X86_32 */
64 64
65cpumask_t cpu_callin_map; 65cpumask_t cpu_sibling_setup_map;
66cpumask_t cpu_callout_map; 66cpumask_t cpu_callout_map;
67cpumask_t cpu_initialized; 67cpumask_t cpu_initialized;
68cpumask_t cpu_sibling_setup_map; 68cpumask_t cpu_callin_map;
69 69
70#endif /* CONFIG_X86_32 */ 70#endif /* CONFIG_X86_32 */
71 71
72 72
73static struct cpu_dev *this_cpu __cpuinitdata; 73static const struct cpu_dev *this_cpu __cpuinitdata;
74 74
75DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 75DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
76#ifdef CONFIG_X86_64 76#ifdef CONFIG_X86_64
@@ -79,48 +79,48 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
79 * IRET will check the segment types kkeil 2000/10/28 79 * IRET will check the segment types kkeil 2000/10/28
80 * Also sysret mandates a special GDT layout 80 * Also sysret mandates a special GDT layout
81 * 81 *
82 * The TLS descriptors are currently at a different place compared to i386. 82 * TLS descriptors are currently at a different place compared to i386.
83 * Hopefully nobody expects them at a fixed place (Wine?) 83 * Hopefully nobody expects them at a fixed place (Wine?)
84 */ 84 */
85 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 85 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
86 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 86 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
87 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 87 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
88 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 88 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
89 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 89 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
90 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 90 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
91#else 91#else
92 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 92 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
93 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 93 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
94 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 94 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
95 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, 95 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
96 /* 96 /*
97 * Segments used for calling PnP BIOS have byte granularity. 97 * Segments used for calling PnP BIOS have byte granularity.
98 * The code segments and data segments have fixed 64k limits, 98 * The code segments and data segments have fixed 64k limits,
99 * the transfer segment sizes are set at run time. 99 * the transfer segment sizes are set at run time.
100 */ 100 */
101 /* 32-bit code */ 101 /* 32-bit code */
102 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, 102 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
103 /* 16-bit code */ 103 /* 16-bit code */
104 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, 104 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
105 /* 16-bit data */ 105 /* 16-bit data */
106 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, 106 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
107 /* 16-bit data */ 107 /* 16-bit data */
108 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, 108 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
109 /* 16-bit data */ 109 /* 16-bit data */
110 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, 110 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
111 /* 111 /*
112 * The APM segments have byte granularity and their bases 112 * The APM segments have byte granularity and their bases
113 * are set at run time. All have 64k limits. 113 * are set at run time. All have 64k limits.
114 */ 114 */
115 /* 32-bit code */ 115 /* 32-bit code */
116 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, 116 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
117 /* 16-bit code */ 117 /* 16-bit code */
118 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, 118 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
119 /* data */ 119 /* data */
120 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 120 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
121 121
122 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 122 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
123 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, 123 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
124 GDT_STACK_CANARY_INIT 124 GDT_STACK_CANARY_INIT
125#endif 125#endif
126} }; 126} };
@@ -164,16 +164,17 @@ static inline int flag_is_changeable_p(u32 flag)
164 * the CPUID. Add "volatile" to not allow gcc to 164 * the CPUID. Add "volatile" to not allow gcc to
165 * optimize the subsequent calls to this function. 165 * optimize the subsequent calls to this function.
166 */ 166 */
167 asm volatile ("pushfl\n\t" 167 asm volatile ("pushfl \n\t"
168 "pushfl\n\t" 168 "pushfl \n\t"
169 "popl %0\n\t" 169 "popl %0 \n\t"
170 "movl %0,%1\n\t" 170 "movl %0, %1 \n\t"
171 "xorl %2,%0\n\t" 171 "xorl %2, %0 \n\t"
172 "pushl %0\n\t" 172 "pushl %0 \n\t"
173 "popfl\n\t" 173 "popfl \n\t"
174 "pushfl\n\t" 174 "pushfl \n\t"
175 "popl %0\n\t" 175 "popl %0 \n\t"
176 "popfl\n\t" 176 "popfl \n\t"
177
177 : "=&r" (f1), "=&r" (f2) 178 : "=&r" (f1), "=&r" (f2)
178 : "ir" (flag)); 179 : "ir" (flag));
179 180
@@ -188,18 +189,22 @@ static int __cpuinit have_cpuid_p(void)
188 189
189static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 190static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
190{ 191{
191 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { 192 unsigned long lo, hi;
192 /* Disable processor serial number */ 193
193 unsigned long lo, hi; 194 if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
194 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); 195 return;
195 lo |= 0x200000; 196
196 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); 197 /* Disable processor serial number: */
197 printk(KERN_NOTICE "CPU serial number disabled.\n"); 198
198 clear_cpu_cap(c, X86_FEATURE_PN); 199 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
199 200 lo |= 0x200000;
200 /* Disabling the serial number may affect the cpuid level */ 201 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
201 c->cpuid_level = cpuid_eax(0); 202
202 } 203 printk(KERN_NOTICE "CPU serial number disabled.\n");
204 clear_cpu_cap(c, X86_FEATURE_PN);
205
206 /* Disabling the serial number may affect the cpuid level */
207 c->cpuid_level = cpuid_eax(0);
203} 208}
204 209
205static int __init x86_serial_nr_setup(char *s) 210static int __init x86_serial_nr_setup(char *s)
@@ -232,6 +237,7 @@ struct cpuid_dependent_feature {
232 u32 feature; 237 u32 feature;
233 u32 level; 238 u32 level;
234}; 239};
240
235static const struct cpuid_dependent_feature __cpuinitconst 241static const struct cpuid_dependent_feature __cpuinitconst
236cpuid_dependent_features[] = { 242cpuid_dependent_features[] = {
237 { X86_FEATURE_MWAIT, 0x00000005 }, 243 { X86_FEATURE_MWAIT, 0x00000005 },
@@ -243,7 +249,11 @@ cpuid_dependent_features[] = {
243static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) 249static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
244{ 250{
245 const struct cpuid_dependent_feature *df; 251 const struct cpuid_dependent_feature *df;
252
246 for (df = cpuid_dependent_features; df->feature; df++) { 253 for (df = cpuid_dependent_features; df->feature; df++) {
254
255 if (!cpu_has(c, df->feature))
256 continue;
247 /* 257 /*
248 * Note: cpuid_level is set to -1 if unavailable, but 258 * Note: cpuid_level is set to -1 if unavailable, but
249 * extended_cpuid_level is set to 0 if unavailable 259 * extended_cpuid_level is set to 0 if unavailable
@@ -251,32 +261,32 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
251 * when signed; hence the weird messing around with 261 * when signed; hence the weird messing around with
252 * signs here... 262 * signs here...
253 */ 263 */
254 if (cpu_has(c, df->feature) && 264 if (!((s32)df->level < 0 ?
255 ((s32)df->level < 0 ?
256 (u32)df->level > (u32)c->extended_cpuid_level : 265 (u32)df->level > (u32)c->extended_cpuid_level :
257 (s32)df->level > (s32)c->cpuid_level)) { 266 (s32)df->level > (s32)c->cpuid_level))
258 clear_cpu_cap(c, df->feature); 267 continue;
259 if (warn) 268
260 printk(KERN_WARNING 269 clear_cpu_cap(c, df->feature);
261 "CPU: CPU feature %s disabled " 270 if (!warn)
262 "due to lack of CPUID level 0x%x\n", 271 continue;
263 x86_cap_flags[df->feature], 272
264 df->level); 273 printk(KERN_WARNING
265 } 274 "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
275 x86_cap_flags[df->feature], df->level);
266 } 276 }
267} 277}
268 278
269/* 279/*
270 * Naming convention should be: <Name> [(<Codename>)] 280 * Naming convention should be: <Name> [(<Codename>)]
271 * This table only is used unless init_<vendor>() below doesn't set it; 281 * This table only is used unless init_<vendor>() below doesn't set it;
272 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used 282 * in particular, if CPUID levels 0x80000002..4 are supported, this
273 * 283 * isn't used
274 */ 284 */
275 285
276/* Look up CPU names by table lookup. */ 286/* Look up CPU names by table lookup. */
277static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) 287static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
278{ 288{
279 struct cpu_model_info *info; 289 const struct cpu_model_info *info;
280 290
281 if (c->x86_model >= 16) 291 if (c->x86_model >= 16)
282 return NULL; /* Range check */ 292 return NULL; /* Range check */
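The restructured filter_cpuid_features() keeps the sign trick described in the comment: a required level with the top bit set (an extended leaf such as 0x80000001) is negative as s32, so it must be compared unsigned against extended_cpuid_level, while basic leaves compare signed against cpuid_level, whose "unavailable" value is -1. A small demonstration of why the casts matter; the extended-leaf value used here is just an example.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int32_t  cpuid_level          = -1;          /* basic CPUID unavailable */
            uint32_t extended_cpuid_level = 0;           /* extended CPUID unavailable */
            uint32_t ext_level            = 0x80000001u; /* example extended-leaf requirement */
            int32_t  basic_level          = 0x00000005;  /* example basic-leaf requirement */

            /*
             * Extended levels are huge as u32 but negative as s32, so the
             * "does the CPU lack the required level?" test has to be done
             * unsigned against extended_cpuid_level ...
             */
            printf("extended leaf missing:        %d\n",
                   ext_level > extended_cpuid_level);

            /*
             * ... while basic levels compare signed against cpuid_level,
             * where -1 means "no CPUID at all" and must count as missing.
             */
            printf("basic leaf missing:           %d\n",
                   basic_level > cpuid_level);

            /* Doing the extended test signed would wrongly report "present". */
            printf("extended test signed (wrong): %d\n",
                   (int32_t)ext_level > cpuid_level);
            return 0;
    }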
@@ -307,8 +317,10 @@ void load_percpu_segment(int cpu)
307 load_stack_canary_segment(); 317 load_stack_canary_segment();
308} 318}
309 319
310/* Current gdt points %fs at the "master" per-cpu area: after this, 320/*
311 * it's on the real one. */ 321 * Current gdt points %fs at the "master" per-cpu area: after this,
322 * it's on the real one.
323 */
312void switch_to_new_gdt(int cpu) 324void switch_to_new_gdt(int cpu)
313{ 325{
314 struct desc_ptr gdt_descr; 326 struct desc_ptr gdt_descr;
@@ -321,7 +333,7 @@ void switch_to_new_gdt(int cpu)
321 load_percpu_segment(cpu); 333 load_percpu_segment(cpu);
322} 334}
323 335
324static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 336static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
325 337
326static void __cpuinit default_init(struct cpuinfo_x86 *c) 338static void __cpuinit default_init(struct cpuinfo_x86 *c)
327{ 339{
@@ -340,7 +352,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
340#endif 352#endif
341} 353}
342 354
343static struct cpu_dev __cpuinitdata default_cpu = { 355static const struct cpu_dev __cpuinitconst default_cpu = {
344 .c_init = default_init, 356 .c_init = default_init,
345 .c_vendor = "Unknown", 357 .c_vendor = "Unknown",
346 .c_x86_vendor = X86_VENDOR_UNKNOWN, 358 .c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -354,22 +366,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
354 if (c->extended_cpuid_level < 0x80000004) 366 if (c->extended_cpuid_level < 0x80000004)
355 return; 367 return;
356 368
357 v = (unsigned int *) c->x86_model_id; 369 v = (unsigned int *)c->x86_model_id;
358 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 370 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
359 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); 371 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
360 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); 372 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
361 c->x86_model_id[48] = 0; 373 c->x86_model_id[48] = 0;
362 374
363 /* Intel chips right-justify this string for some dumb reason; 375 /*
364 undo that brain damage */ 376 * Intel chips right-justify this string for some dumb reason;
377 * undo that brain damage:
378 */
365 p = q = &c->x86_model_id[0]; 379 p = q = &c->x86_model_id[0];
366 while (*p == ' ') 380 while (*p == ' ')
367 p++; 381 p++;
368 if (p != q) { 382 if (p != q) {
369 while (*p) 383 while (*p)
370 *q++ = *p++; 384 *q++ = *p++;
371 while (q <= &c->x86_model_id[48]) 385 while (q <= &c->x86_model_id[48])
372 *q++ = '\0'; /* Zero-pad the rest */ 386 *q++ = '\0'; /* Zero-pad the rest */
373 } 387 }
374} 388}
375 389
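get_model_name() still undoes the right-justification of the CPUID brand string by shifting the text left over the leading spaces and zero-padding the tail. The same dance as a self-contained helper; the 49-byte buffer is an example size, not the kernel's x86_model_id layout.

    #include <stdio.h>
    #include <string.h>

    /*
     * Left-justify a right-justified (space-padded) string in place and
     * zero-pad the tail, the way get_model_name() cleans up x86_model_id.
     */
    static void left_justify(char *id, size_t len)
    {
            char *p = id, *q = id;

            while (*p == ' ')
                    p++;
            if (p != q) {
                    while (*p)
                            *q++ = *p++;
                    while (q < id + len)
                            *q++ = '\0';
            }
    }

    int main(void)
    {
            char brand[49] = "      Intel(R) Xeon(R) CPU";

            left_justify(brand, sizeof(brand));
            printf("[%s]\n", brand);
            return 0;
    }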
@@ -438,27 +452,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
438 452
439 if (smp_num_siblings == 1) { 453 if (smp_num_siblings == 1) {
440 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 454 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
441 } else if (smp_num_siblings > 1) { 455 goto out;
456 }
442 457
443 if (smp_num_siblings > nr_cpu_ids) { 458 if (smp_num_siblings <= 1)
444 printk(KERN_WARNING "CPU: Unsupported number of siblings %d", 459 goto out;
445 smp_num_siblings); 460
446 smp_num_siblings = 1; 461 if (smp_num_siblings > nr_cpu_ids) {
447 return; 462 pr_warning("CPU: Unsupported number of siblings %d",
448 } 463 smp_num_siblings);
464 smp_num_siblings = 1;
465 return;
466 }
449 467
450 index_msb = get_count_order(smp_num_siblings); 468 index_msb = get_count_order(smp_num_siblings);
451 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); 469 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
452 470
453 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 471 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
454 472
455 index_msb = get_count_order(smp_num_siblings); 473 index_msb = get_count_order(smp_num_siblings);
456 474
457 core_bits = get_count_order(c->x86_max_cores); 475 core_bits = get_count_order(c->x86_max_cores);
458 476
459 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & 477 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
460 ((1 << core_bits) - 1); 478 ((1 << core_bits) - 1);
461 }
462 479
463out: 480out:
464 if ((c->x86_max_cores * smp_num_siblings) > 1) { 481 if ((c->x86_max_cores * smp_num_siblings) > 1) {
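The reworked detect_ht() still derives the package and core IDs by shifting the initial APIC ID right by get_count_order() of the sibling counts, i.e. the smallest power-of-two width that can hold them. A sketch of that decomposition for a power-of-two topology; the APIC ID and counts below are made-up examples.

    #include <stdio.h>

    /*
     * Smallest order such that (1 << order) >= count -- what the kernel's
     * get_count_order() computes for the sibling and core counts.
     */
    static int count_order(unsigned int count)
    {
            int order = 0;

            while ((1u << order) < count)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned int apicid   = 0x1b;   /* example initial APIC ID */
            unsigned int siblings = 2;      /* HT siblings per core    */
            unsigned int cores    = 4;      /* cores per package       */

            int sib_bits  = count_order(siblings);
            int core_bits = count_order(cores);

            unsigned int pkg    = apicid >> (sib_bits + core_bits);
            unsigned int core   = (apicid >> sib_bits) & ((1u << core_bits) - 1);
            unsigned int thread = apicid & ((1u << sib_bits) - 1);

            printf("package %u, core %u, thread %u\n", pkg, core, thread);
            return 0;
    }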
@@ -473,8 +490,8 @@ out:
473static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 490static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
474{ 491{
475 char *v = c->x86_vendor_id; 492 char *v = c->x86_vendor_id;
476 int i;
477 static int printed; 493 static int printed;
494 int i;
478 495
479 for (i = 0; i < X86_VENDOR_NUM; i++) { 496 for (i = 0; i < X86_VENDOR_NUM; i++) {
480 if (!cpu_devs[i]) 497 if (!cpu_devs[i])
@@ -483,6 +500,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
483 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 500 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
484 (cpu_devs[i]->c_ident[1] && 501 (cpu_devs[i]->c_ident[1] &&
485 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 502 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
503
486 this_cpu = cpu_devs[i]; 504 this_cpu = cpu_devs[i];
487 c->x86_vendor = this_cpu->c_x86_vendor; 505 c->x86_vendor = this_cpu->c_x86_vendor;
488 return; 506 return;
@@ -491,7 +509,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
491 509
492 if (!printed) { 510 if (!printed) {
493 printed++; 511 printed++;
494 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); 512 printk(KERN_ERR
513 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
514
495 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 515 printk(KERN_ERR "CPU: Your system may be unstable.\n");
496 } 516 }
497 517
@@ -511,14 +531,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
511 /* Intel-defined flags: level 0x00000001 */ 531 /* Intel-defined flags: level 0x00000001 */
512 if (c->cpuid_level >= 0x00000001) { 532 if (c->cpuid_level >= 0x00000001) {
513 u32 junk, tfms, cap0, misc; 533 u32 junk, tfms, cap0, misc;
534
514 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 535 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
515 c->x86 = (tfms >> 8) & 0xf; 536 c->x86 = (tfms >> 8) & 0xf;
516 c->x86_model = (tfms >> 4) & 0xf; 537 c->x86_model = (tfms >> 4) & 0xf;
517 c->x86_mask = tfms & 0xf; 538 c->x86_mask = tfms & 0xf;
539
518 if (c->x86 == 0xf) 540 if (c->x86 == 0xf)
519 c->x86 += (tfms >> 20) & 0xff; 541 c->x86 += (tfms >> 20) & 0xff;
520 if (c->x86 >= 0x6) 542 if (c->x86 >= 0x6)
521 c->x86_model += ((tfms >> 16) & 0xf) << 4; 543 c->x86_model += ((tfms >> 16) & 0xf) << 4;
544
522 if (cap0 & (1<<19)) { 545 if (cap0 & (1<<19)) {
523 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 546 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
524 c->x86_cache_alignment = c->x86_clflush_size; 547 c->x86_cache_alignment = c->x86_clflush_size;
@@ -534,6 +557,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
534 /* Intel-defined flags: level 0x00000001 */ 557 /* Intel-defined flags: level 0x00000001 */
535 if (c->cpuid_level >= 0x00000001) { 558 if (c->cpuid_level >= 0x00000001) {
536 u32 capability, excap; 559 u32 capability, excap;
560
537 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 561 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
538 c->x86_capability[0] = capability; 562 c->x86_capability[0] = capability;
539 c->x86_capability[4] = excap; 563 c->x86_capability[4] = excap;
@@ -542,6 +566,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
542 /* AMD-defined flags: level 0x80000001 */ 566 /* AMD-defined flags: level 0x80000001 */
543 xlvl = cpuid_eax(0x80000000); 567 xlvl = cpuid_eax(0x80000000);
544 c->extended_cpuid_level = xlvl; 568 c->extended_cpuid_level = xlvl;
569
545 if ((xlvl & 0xffff0000) == 0x80000000) { 570 if ((xlvl & 0xffff0000) == 0x80000000) {
546 if (xlvl >= 0x80000001) { 571 if (xlvl >= 0x80000001) {
547 c->x86_capability[1] = cpuid_edx(0x80000001); 572 c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -549,13 +574,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
549 } 574 }
550 } 575 }
551 576
552#ifdef CONFIG_X86_64
553 if (c->extended_cpuid_level >= 0x80000008) { 577 if (c->extended_cpuid_level >= 0x80000008) {
554 u32 eax = cpuid_eax(0x80000008); 578 u32 eax = cpuid_eax(0x80000008);
555 579
556 c->x86_virt_bits = (eax >> 8) & 0xff; 580 c->x86_virt_bits = (eax >> 8) & 0xff;
557 c->x86_phys_bits = eax & 0xff; 581 c->x86_phys_bits = eax & 0xff;
558 } 582 }
583#ifdef CONFIG_X86_32
584 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
585 c->x86_phys_bits = 36;
559#endif 586#endif
560 587
561 if (c->extended_cpuid_level >= 0x80000007) 588 if (c->extended_cpuid_level >= 0x80000007)
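The address-width probe is no longer 64-bit only: CPUID leaf 0x80000008 reports the physical address bits in EAX[7:0] and the virtual bits in EAX[15:8], and a 32-bit kernel without that leaf falls back to 36 physical bits when PAE or PSE36 is present. The same query from user space, using GCC's <cpuid.h> helper:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Leaf 0x80000008: EAX[7:0] = physical bits, EAX[15:8] = virtual bits */
            if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
                    printf("physical address bits: %u\n", eax & 0xff);
                    printf("virtual address bits:  %u\n", (eax >> 8) & 0xff);
            } else {
                    printf("CPUID leaf 0x80000008 not available\n");
            }
            return 0;
    }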
@@ -602,8 +629,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
602{ 629{
603#ifdef CONFIG_X86_64 630#ifdef CONFIG_X86_64
604 c->x86_clflush_size = 64; 631 c->x86_clflush_size = 64;
632 c->x86_phys_bits = 36;
633 c->x86_virt_bits = 48;
605#else 634#else
606 c->x86_clflush_size = 32; 635 c->x86_clflush_size = 32;
636 c->x86_phys_bits = 32;
637 c->x86_virt_bits = 32;
607#endif 638#endif
608 c->x86_cache_alignment = c->x86_clflush_size; 639 c->x86_cache_alignment = c->x86_clflush_size;
609 640
@@ -634,12 +665,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
634 665
635void __init early_cpu_init(void) 666void __init early_cpu_init(void)
636{ 667{
637 struct cpu_dev **cdev; 668 const struct cpu_dev *const *cdev;
638 int count = 0; 669 int count = 0;
639 670
640 printk("KERNEL supported cpus:\n"); 671 printk(KERN_INFO "KERNEL supported cpus:\n");
641 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 672 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
642 struct cpu_dev *cpudev = *cdev; 673 const struct cpu_dev *cpudev = *cdev;
643 unsigned int j; 674 unsigned int j;
644 675
645 if (count >= X86_VENDOR_NUM) 676 if (count >= X86_VENDOR_NUM)
@@ -650,7 +681,7 @@ void __init early_cpu_init(void)
650 for (j = 0; j < 2; j++) { 681 for (j = 0; j < 2; j++) {
651 if (!cpudev->c_ident[j]) 682 if (!cpudev->c_ident[j])
652 continue; 683 continue;
653 printk(" %s %s\n", cpudev->c_vendor, 684 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
654 cpudev->c_ident[j]); 685 cpudev->c_ident[j]);
655 } 686 }
656 } 687 }
@@ -726,9 +757,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
726 c->x86_coreid_bits = 0; 757 c->x86_coreid_bits = 0;
727#ifdef CONFIG_X86_64 758#ifdef CONFIG_X86_64
728 c->x86_clflush_size = 64; 759 c->x86_clflush_size = 64;
760 c->x86_phys_bits = 36;
761 c->x86_virt_bits = 48;
729#else 762#else
730 c->cpuid_level = -1; /* CPUID not detected */ 763 c->cpuid_level = -1; /* CPUID not detected */
731 c->x86_clflush_size = 32; 764 c->x86_clflush_size = 32;
765 c->x86_phys_bits = 32;
766 c->x86_virt_bits = 32;
732#endif 767#endif
733 c->x86_cache_alignment = c->x86_clflush_size; 768 c->x86_cache_alignment = c->x86_clflush_size;
734 memset(&c->x86_capability, 0, sizeof c->x86_capability); 769 memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -759,8 +794,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
759 squash_the_stupid_serial_number(c); 794 squash_the_stupid_serial_number(c);
760 795
761 /* 796 /*
762 * The vendor-specific functions might have changed features. Now 797 * The vendor-specific functions might have changed features.
763 * we do "generic changes." 798 * Now we do "generic changes."
764 */ 799 */
765 800
766 /* Filter out anything that depends on CPUID levels we don't have */ 801 /* Filter out anything that depends on CPUID levels we don't have */
@@ -768,7 +803,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
768 803
769 /* If the model name is still unset, do table lookup. */ 804 /* If the model name is still unset, do table lookup. */
770 if (!c->x86_model_id[0]) { 805 if (!c->x86_model_id[0]) {
771 char *p; 806 const char *p;
772 p = table_lookup_model(c); 807 p = table_lookup_model(c);
773 if (p) 808 if (p)
774 strcpy(c->x86_model_id, p); 809 strcpy(c->x86_model_id, p);
@@ -843,11 +878,11 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
843} 878}
844 879
845struct msr_range { 880struct msr_range {
846 unsigned min; 881 unsigned min;
847 unsigned max; 882 unsigned max;
848}; 883};
849 884
850static struct msr_range msr_range_array[] __cpuinitdata = { 885static const struct msr_range msr_range_array[] __cpuinitconst = {
851 { 0x00000000, 0x00000418}, 886 { 0x00000000, 0x00000418},
852 { 0xc0000000, 0xc000040b}, 887 { 0xc0000000, 0xc000040b},
853 { 0xc0010000, 0xc0010142}, 888 { 0xc0010000, 0xc0010142},
@@ -856,14 +891,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
856 891
857static void __cpuinit print_cpu_msr(void) 892static void __cpuinit print_cpu_msr(void)
858{ 893{
894 unsigned index_min, index_max;
859 unsigned index; 895 unsigned index;
860 u64 val; 896 u64 val;
861 int i; 897 int i;
862 unsigned index_min, index_max;
863 898
864 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { 899 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
865 index_min = msr_range_array[i].min; 900 index_min = msr_range_array[i].min;
866 index_max = msr_range_array[i].max; 901 index_max = msr_range_array[i].max;
902
867 for (index = index_min; index < index_max; index++) { 903 for (index = index_min; index < index_max; index++) {
868 if (rdmsrl_amd_safe(index, &val)) 904 if (rdmsrl_amd_safe(index, &val))
869 continue; 905 continue;
@@ -873,6 +909,7 @@ static void __cpuinit print_cpu_msr(void)
873} 909}
874 910
875static int show_msr __cpuinitdata; 911static int show_msr __cpuinitdata;
912
876static __init int setup_show_msr(char *arg) 913static __init int setup_show_msr(char *arg)
877{ 914{
878 int num; 915 int num;
@@ -894,12 +931,14 @@ __setup("noclflush", setup_noclflush);
894 931
895void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 932void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
896{ 933{
897 char *vendor = NULL; 934 const char *vendor = NULL;
898 935
899 if (c->x86_vendor < X86_VENDOR_NUM) 936 if (c->x86_vendor < X86_VENDOR_NUM) {
900 vendor = this_cpu->c_vendor; 937 vendor = this_cpu->c_vendor;
901 else if (c->cpuid_level >= 0) 938 } else {
902 vendor = c->x86_vendor_id; 939 if (c->cpuid_level >= 0)
940 vendor = c->x86_vendor_id;
941 }
903 942
904 if (vendor && !strstr(c->x86_model_id, vendor)) 943 if (vendor && !strstr(c->x86_model_id, vendor))
905 printk(KERN_CONT "%s ", vendor); 944 printk(KERN_CONT "%s ", vendor);
@@ -926,10 +965,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
926static __init int setup_disablecpuid(char *arg) 965static __init int setup_disablecpuid(char *arg)
927{ 966{
928 int bit; 967 int bit;
968
929 if (get_option(&arg, &bit) && bit < NCAPINTS*32) 969 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
930 setup_clear_cpu_cap(bit); 970 setup_clear_cpu_cap(bit);
931 else 971 else
932 return 0; 972 return 0;
973
933 return 1; 974 return 1;
934} 975}
935__setup("clearcpuid=", setup_disablecpuid); 976__setup("clearcpuid=", setup_disablecpuid);
@@ -939,6 +980,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
939 980
940DEFINE_PER_CPU_FIRST(union irq_stack_union, 981DEFINE_PER_CPU_FIRST(union irq_stack_union,
941 irq_stack_union) __aligned(PAGE_SIZE); 982 irq_stack_union) __aligned(PAGE_SIZE);
983
942DEFINE_PER_CPU(char *, irq_stack_ptr) = 984DEFINE_PER_CPU(char *, irq_stack_ptr) =
943 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; 985 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
944 986
@@ -948,12 +990,21 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
948 990
949DEFINE_PER_CPU(unsigned int, irq_count) = -1; 991DEFINE_PER_CPU(unsigned int, irq_count) = -1;
950 992
993/*
994 * Special IST stacks which the CPU switches to when it calls
995 * an IST-marked descriptor entry. Up to 7 stacks (hardware
996 * limit), all of them are 4K, except the debug stack which
997 * is 8K.
998 */
999static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1000 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
1001 [DEBUG_STACK - 1] = DEBUG_STKSZ
1002};
1003
951static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks 1004static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
952 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) 1005 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
953 __aligned(PAGE_SIZE); 1006 __aligned(PAGE_SIZE);
954 1007
955extern asmlinkage void ignore_sysret(void);
956
957/* May not be marked __init: used by software suspend */ 1008/* May not be marked __init: used by software suspend */
958void syscall_init(void) 1009void syscall_init(void)
959{ 1010{
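The per-IST stack size table moves out of cpu_init() into a file-scope exception_stack_sizes[] built with GNU C range designators: every slot defaults to EXCEPTION_STKSZ and the later [DEBUG_STACK - 1] entry overrides just that slot. A compile-and-run illustration of the initializer style (GCC extension); the sizes and the DEBUG_STACK index below are placeholders, not the kernel's constants.

    #include <stdio.h>

    #define N_STACKS    7           /* placeholder, not N_EXCEPTION_STACKS */
    #define DEBUG_STACK 4           /* placeholder index of the debug stack */
    #define STKSZ       4096
    #define DEBUG_STKSZ 8192

    /*
     * GNU range designator: every slot defaults to STKSZ, then the later
     * [DEBUG_STACK - 1] designator overrides that single slot.
     */
    static const unsigned int stack_sizes[N_STACKS] = {
            [0 ... N_STACKS - 1] = STKSZ,
            [DEBUG_STACK - 1]    = DEBUG_STKSZ,
    };

    int main(void)
    {
            unsigned long total = 0;
            int i;

            for (i = 0; i < N_STACKS; i++) {
                    printf("stack %d: %u bytes\n", i, stack_sizes[i]);
                    total += stack_sizes[i];
            }
            printf("total: %lu bytes\n", total);
            return 0;
    }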
@@ -983,7 +1034,7 @@ unsigned long kernel_eflags;
983 */ 1034 */
984DEFINE_PER_CPU(struct orig_ist, orig_ist); 1035DEFINE_PER_CPU(struct orig_ist, orig_ist);
985 1036
986#else /* x86_64 */ 1037#else /* CONFIG_X86_64 */
987 1038
988#ifdef CONFIG_CC_STACKPROTECTOR 1039#ifdef CONFIG_CC_STACKPROTECTOR
989DEFINE_PER_CPU(unsigned long, stack_canary); 1040DEFINE_PER_CPU(unsigned long, stack_canary);
@@ -995,9 +1046,26 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
995 memset(regs, 0, sizeof(struct pt_regs)); 1046 memset(regs, 0, sizeof(struct pt_regs));
996 regs->fs = __KERNEL_PERCPU; 1047 regs->fs = __KERNEL_PERCPU;
997 regs->gs = __KERNEL_STACK_CANARY; 1048 regs->gs = __KERNEL_STACK_CANARY;
1049
998 return regs; 1050 return regs;
999} 1051}
1000#endif /* x86_64 */ 1052#endif /* CONFIG_X86_64 */
1053
1054/*
1055 * Clear all 6 debug registers:
1056 */
1057static void clear_all_debug_regs(void)
1058{
1059 int i;
1060
1061 for (i = 0; i < 8; i++) {
1062 /* Ignore db4, db5 */
1063 if ((i == 4) || (i == 5))
1064 continue;
1065
1066 set_debugreg(0, i);
1067 }
1068}
1001 1069
1002/* 1070/*
1003 * cpu_init() initializes state that is per-CPU. Some data is already 1071 * cpu_init() initializes state that is per-CPU. Some data is already
@@ -1007,15 +1075,20 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1007 * A lot of state is already set up in PDA init for 64 bit 1075 * A lot of state is already set up in PDA init for 64 bit
1008 */ 1076 */
1009#ifdef CONFIG_X86_64 1077#ifdef CONFIG_X86_64
1078
1010void __cpuinit cpu_init(void) 1079void __cpuinit cpu_init(void)
1011{ 1080{
1012 int cpu = stack_smp_processor_id(); 1081 struct orig_ist *orig_ist;
1013 struct tss_struct *t = &per_cpu(init_tss, cpu);
1014 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1015 unsigned long v;
1016 struct task_struct *me; 1082 struct task_struct *me;
1083 struct tss_struct *t;
1084 unsigned long v;
1085 int cpu;
1017 int i; 1086 int i;
1018 1087
1088 cpu = stack_smp_processor_id();
1089 t = &per_cpu(init_tss, cpu);
1090 orig_ist = &per_cpu(orig_ist, cpu);
1091
1019#ifdef CONFIG_NUMA 1092#ifdef CONFIG_NUMA
1020 if (cpu != 0 && percpu_read(node_number) == 0 && 1093 if (cpu != 0 && percpu_read(node_number) == 0 &&
1021 cpu_to_node(cpu) != NUMA_NO_NODE) 1094 cpu_to_node(cpu) != NUMA_NO_NODE)
@@ -1056,19 +1129,17 @@ void __cpuinit cpu_init(void)
1056 * set up and load the per-CPU TSS 1129 * set up and load the per-CPU TSS
1057 */ 1130 */
1058 if (!orig_ist->ist[0]) { 1131 if (!orig_ist->ist[0]) {
1059 static const unsigned int sizes[N_EXCEPTION_STACKS] = {
1060 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
1061 [DEBUG_STACK - 1] = DEBUG_STKSZ
1062 };
1063 char *estacks = per_cpu(exception_stacks, cpu); 1132 char *estacks = per_cpu(exception_stacks, cpu);
1133
1064 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1134 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1065 estacks += sizes[v]; 1135 estacks += exception_stack_sizes[v];
1066 orig_ist->ist[v] = t->x86_tss.ist[v] = 1136 orig_ist->ist[v] = t->x86_tss.ist[v] =
1067 (unsigned long)estacks; 1137 (unsigned long)estacks;
1068 } 1138 }
1069 } 1139 }
1070 1140
1071 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 1141 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1142
1072 /* 1143 /*
1073 * <= is required because the CPU will access up to 1144 * <= is required because the CPU will access up to
1074 * 8 bits beyond the end of the IO permission bitmap. 1145 * 8 bits beyond the end of the IO permission bitmap.
@@ -1078,8 +1149,7 @@ void __cpuinit cpu_init(void)
1078 1149
1079 atomic_inc(&init_mm.mm_count); 1150 atomic_inc(&init_mm.mm_count);
1080 me->active_mm = &init_mm; 1151 me->active_mm = &init_mm;
1081 if (me->mm) 1152 BUG_ON(me->mm);
1082 BUG();
1083 enter_lazy_tlb(&init_mm, me); 1153 enter_lazy_tlb(&init_mm, me);
1084 1154
1085 load_sp0(t, &current->thread); 1155 load_sp0(t, &current->thread);
@@ -1098,17 +1168,7 @@ void __cpuinit cpu_init(void)
1098 arch_kgdb_ops.correct_hw_break(); 1168 arch_kgdb_ops.correct_hw_break();
1099 else 1169 else
1100#endif 1170#endif
1101 { 1171 clear_all_debug_regs();
1102 /*
1103 * Clear all 6 debug registers:
1104 */
1105 set_debugreg(0UL, 0);
1106 set_debugreg(0UL, 1);
1107 set_debugreg(0UL, 2);
1108 set_debugreg(0UL, 3);
1109 set_debugreg(0UL, 6);
1110 set_debugreg(0UL, 7);
1111 }
1112 1172
1113 fpu_init(); 1173 fpu_init();
1114 1174
@@ -1129,7 +1189,8 @@ void __cpuinit cpu_init(void)
1129 1189
1130 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { 1190 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
1131 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 1191 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1132 for (;;) local_irq_enable(); 1192 for (;;)
1193 local_irq_enable();
1133 } 1194 }
1134 1195
1135 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1196 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1145,8 +1206,7 @@ void __cpuinit cpu_init(void)
1145 */ 1206 */
1146 atomic_inc(&init_mm.mm_count); 1207 atomic_inc(&init_mm.mm_count);
1147 curr->active_mm = &init_mm; 1208 curr->active_mm = &init_mm;
1148 if (curr->mm) 1209 BUG_ON(curr->mm);
1149 BUG();
1150 enter_lazy_tlb(&init_mm, curr); 1210 enter_lazy_tlb(&init_mm, curr);
1151 1211
1152 load_sp0(t, thread); 1212 load_sp0(t, thread);
@@ -1159,13 +1219,7 @@ void __cpuinit cpu_init(void)
1159 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 1219 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1160#endif 1220#endif
1161 1221
1162 /* Clear all 6 debug registers: */ 1222 clear_all_debug_regs();
1163 set_debugreg(0, 0);
1164 set_debugreg(0, 1);
1165 set_debugreg(0, 2);
1166 set_debugreg(0, 3);
1167 set_debugreg(0, 6);
1168 set_debugreg(0, 7);
1169 1223
1170 /* 1224 /*
1171 * Force FPU initialization: 1225 * Force FPU initialization:
@@ -1185,6 +1239,4 @@ void __cpuinit cpu_init(void)
1185 1239
1186 xsave_init(); 1240 xsave_init();
1187} 1241}
1188
1189
1190#endif 1242#endif
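The hunks above fold the repeated set_debugreg() sequences into a single clear_all_debug_regs() call; the helper's definition is not part of this hunk. A minimal sketch of what it presumably looks like, mirroring the six deleted calls (DR4/DR5 are skipped, as in the print_dr() code further down):

	void __cpuinit clear_all_debug_regs(void)
	{
		int i;

		for (i = 0; i < 8; i++) {
			/* Ignore db4, db5 */
			if ((i == 4) || (i == 5))
				continue;

			/* zero DR0-DR3, DR6, DR7: same effect as the removed calls */
			set_debugreg(0, i);
		}
	}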
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a39210..6de9a908e400 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -3,33 +3,34 @@
3#define ARCH_X86_CPU_H 3#define ARCH_X86_CPU_H
4 4
5struct cpu_model_info { 5struct cpu_model_info {
6 int vendor; 6 int vendor;
7 int family; 7 int family;
8 char *model_names[16]; 8 const char *model_names[16];
9}; 9};
10 10
11/* attempt to consolidate cpu attributes */ 11/* attempt to consolidate cpu attributes */
12struct cpu_dev { 12struct cpu_dev {
13 char * c_vendor; 13 const char *c_vendor;
14 14
15 /* some have two possibilities for cpuid string */ 15 /* some have two possibilities for cpuid string */
16 char * c_ident[2]; 16 const char *c_ident[2];
17 17
18 struct cpu_model_info c_models[4]; 18 struct cpu_model_info c_models[4];
19 19
20 void (*c_early_init)(struct cpuinfo_x86 *c); 20 void (*c_early_init)(struct cpuinfo_x86 *);
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 *);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 *);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
24 int c_x86_vendor; 24 int c_x86_vendor;
25}; 25};
26 26
27#define cpu_dev_register(cpu_devX) \ 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \ 28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX; 30 &cpu_devX;
31 31
32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[];
33 34
34extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void display_cacheinfo(struct cpuinfo_x86 *c);
35 36
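A hedged illustration (not part of the patch) of how a vendor driver registers itself against the const-ified cpu_dev above; the cyrix.c and intel.c hunks below make exactly this change to their real structures, and the "Example" names here are hypothetical:

	static const struct cpu_dev __cpuinitconst example_cpu_dev = {
		.c_vendor	= "Example",
		.c_ident	= { "GenuineExample" },
	};

	cpu_dev_register(example_cpu_dev);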
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 000000000000..46e29ab96c6a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,901 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
  6 * For licensing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38
39static DEFINE_MUTEX(cpu_debug_lock);
40
41static struct dentry *cpu_debugfs_dir;
42
43static struct cpu_debug_base cpu_base[] = {
44 { "mc", CPU_MC, 0 },
45 { "monitor", CPU_MONITOR, 0 },
46 { "time", CPU_TIME, 0 },
47 { "pmc", CPU_PMC, 1 },
48 { "platform", CPU_PLATFORM, 0 },
49 { "apic", CPU_APIC, 0 },
50 { "poweron", CPU_POWERON, 0 },
51 { "control", CPU_CONTROL, 0 },
52 { "features", CPU_FEATURES, 0 },
53 { "lastbranch", CPU_LBRANCH, 0 },
54 { "bios", CPU_BIOS, 0 },
55 { "freq", CPU_FREQ, 0 },
56 { "mtrr", CPU_MTRR, 0 },
57 { "perf", CPU_PERF, 0 },
58 { "cache", CPU_CACHE, 0 },
59 { "sysenter", CPU_SYSENTER, 0 },
60 { "therm", CPU_THERM, 0 },
61 { "misc", CPU_MISC, 0 },
62 { "debug", CPU_DEBUG, 0 },
63 { "pat", CPU_PAT, 0 },
64 { "vmx", CPU_VMX, 0 },
65 { "call", CPU_CALL, 0 },
66 { "base", CPU_BASE, 0 },
67 { "ver", CPU_VER, 0 },
68 { "conf", CPU_CONF, 0 },
69 { "smm", CPU_SMM, 0 },
70 { "svm", CPU_SVM, 0 },
71 { "osvm", CPU_OSVM, 0 },
72 { "tss", CPU_TSS, 0 },
73 { "cr", CPU_CR, 0 },
74 { "dt", CPU_DT, 0 },
75 { "registers", CPU_REG_ALL, 0 },
76};
77
78static struct cpu_file_base cpu_file[] = {
79 { "index", CPU_REG_ALL, 0 },
80 { "value", CPU_REG_ALL, 1 },
81};
82
83/* Intel Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
91
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
96
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
101
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
106
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
111
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
117
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
126
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
135
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
142
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
155
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178};
179
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{
355 unsigned vendor, modelflag;
356 int i, index;
357
 358	/* Standard Registers should always be valid */
359 if (flag >= CPU_TSS)
360 return 1;
361
362 modelflag = per_cpu(cpu_modelflag, cpu);
363 vendor = per_cpu(cpu_model, cpu) >> 16;
364 index = get_cpu_range_count(cpu);
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 }
380
381 /* Invalid */
382 return 0;
383}
384
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag)
387{
388 unsigned modelflag;
389
390 modelflag = per_cpu(cpu_modelflag, cpu);
391 *max = 0;
392 switch (per_cpu(cpu_model, cpu) >> 16) {
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408
409 return *max;
410}
411
412/* This function can also be called with seq = NULL for printk */
413static void print_cpu_data(struct seq_file *seq, unsigned type,
414 u32 low, u32 high)
415{
416 struct cpu_private *priv;
417 u64 val = high;
418
419 if (seq) {
420 priv = seq->private;
421 if (priv->file) {
422 val = (val << 32) | low;
423 seq_printf(seq, "0x%llx\n", val);
424 } else
425 seq_printf(seq, " %08x: %08x_%08x\n",
426 type, high, low);
427 } else
428 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
429}
430
431/* This function can also be called with seq = NULL for printk */
432static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
433{
434 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv;
436 u32 low, high;
437 int i, range;
438
439 if (seq) {
440 priv = seq->private;
441 if (priv->file) {
442 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
443 &low, &high))
444 print_cpu_data(seq, priv->reg, low, high);
445 return;
446 }
447 }
448
449 range = get_cpu_range_count(cpu);
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue;
454
455 for (msr = msr_min; msr <= msr_max; msr++) {
456 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
457 continue;
458 print_cpu_data(seq, msr, low, high);
459 }
460 }
461}
462
463static void print_tss(void *arg)
464{
465 struct pt_regs *regs = task_pt_regs(current);
466 struct seq_file *seq = arg;
467 unsigned int seg;
468
469 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
470 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
471 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
472 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
473
474 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
475 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
476 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
477 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
478
479#ifdef CONFIG_X86_64
480 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
481 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
482 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
483 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
484 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
485 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
486 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
487 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
488#endif
489
490 asm("movl %%cs,%0" : "=r" (seg));
491 seq_printf(seq, " CS\t: %04x\n", seg);
492 asm("movl %%ds,%0" : "=r" (seg));
493 seq_printf(seq, " DS\t: %04x\n", seg);
494 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
495 asm("movl %%es,%0" : "=r" (seg));
496 seq_printf(seq, " ES\t: %04x\n", seg);
497 asm("movl %%fs,%0" : "=r" (seg));
498 seq_printf(seq, " FS\t: %04x\n", seg);
499 asm("movl %%gs,%0" : "=r" (seg));
500 seq_printf(seq, " GS\t: %04x\n", seg);
501
502 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
503
504 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
505}
506
507static void print_cr(void *arg)
508{
509 struct seq_file *seq = arg;
510
511 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
512 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
513 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
514 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
515#ifdef CONFIG_X86_64
516 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
517#endif
518}
519
520static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
521{
522 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
523}
524
525static void print_dt(void *seq)
526{
527 struct desc_ptr dt;
528 unsigned long ldt;
529
530 /* IDT */
531 store_idt((struct desc_ptr *)&dt);
532 print_desc_ptr("IDT", seq, dt);
533
534 /* GDT */
535 store_gdt((struct desc_ptr *)&dt);
536 print_desc_ptr("GDT", seq, dt);
537
538 /* LDT */
539 store_ldt(ldt);
540 seq_printf(seq, " LDT\t: %016lx\n", ldt);
541
542 /* TR */
543 store_tr(ldt);
544 seq_printf(seq, " TR\t: %016lx\n", ldt);
545}
546
547static void print_dr(void *arg)
548{
549 struct seq_file *seq = arg;
550 unsigned long dr;
551 int i;
552
553 for (i = 0; i < 8; i++) {
554 /* Ignore db4, db5 */
555 if ((i == 4) || (i == 5))
556 continue;
557 get_debugreg(dr, i);
558 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
559 }
560
561 seq_printf(seq, "\n MSR\t:\n");
562}
563
564static void print_apic(void *arg)
565{
566 struct seq_file *seq = arg;
567
568#ifdef CONFIG_X86_LOCAL_APIC
569 seq_printf(seq, " LAPIC\t:\n");
570 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
571 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
572 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
573 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
574 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
575 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
576 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
577 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
578 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
579 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
580 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
581 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
582 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
583 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
584 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
585 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
586 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
587 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */
592
593 seq_printf(seq, "\n MSR\t:\n");
594}
595
596static int cpu_seq_show(struct seq_file *seq, void *v)
597{
598 struct cpu_private *priv = seq->private;
599
600 if (priv == NULL)
601 return -EINVAL;
602
603 switch (cpu_base[priv->type].flag) {
604 case CPU_TSS:
605 smp_call_function_single(priv->cpu, print_tss, seq, 1);
606 break;
607 case CPU_CR:
608 smp_call_function_single(priv->cpu, print_cr, seq, 1);
609 break;
610 case CPU_DT:
611 smp_call_function_single(priv->cpu, print_dt, seq, 1);
612 break;
613 case CPU_DEBUG:
614 if (priv->file == CPU_INDEX_BIT)
615 smp_call_function_single(priv->cpu, print_dr, seq, 1);
616 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
617 break;
618 case CPU_APIC:
619 if (priv->file == CPU_INDEX_BIT)
620 smp_call_function_single(priv->cpu, print_apic, seq, 1);
621 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
622 break;
623
624 default:
625 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
626 break;
627 }
628 seq_printf(seq, "\n");
629
630 return 0;
631}
632
633static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
634{
635 if (*pos == 0) /* One time is enough ;-) */
636 return seq;
637
638 return NULL;
639}
640
641static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
642{
643 (*pos)++;
644
645 return cpu_seq_start(seq, pos);
646}
647
648static void cpu_seq_stop(struct seq_file *seq, void *v)
649{
650}
651
652static const struct seq_operations cpu_seq_ops = {
653 .start = cpu_seq_start,
654 .next = cpu_seq_next,
655 .stop = cpu_seq_stop,
656 .show = cpu_seq_show,
657};
658
659static int cpu_seq_open(struct inode *inode, struct file *file)
660{
661 struct cpu_private *priv = inode->i_private;
662 struct seq_file *seq;
663 int err;
664
665 err = seq_open(file, &cpu_seq_ops);
666 if (!err) {
667 seq = file->private_data;
668 seq->private = priv;
669 }
670
671 return err;
672}
673
674static int write_msr(struct cpu_private *priv, u64 val)
675{
676 u32 low, high;
677
678 high = (val >> 32) & 0xffffffff;
679 low = val & 0xffffffff;
680
681 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
682 return 0;
683
684 return -EPERM;
685}
686
687static int write_cpu_register(struct cpu_private *priv, const char *buf)
688{
689 int ret = -EPERM;
690 u64 val;
691
692 ret = strict_strtoull(buf, 0, &val);
693 if (ret < 0)
694 return ret;
695
696 /* Supporting only MSRs */
697 if (priv->type < CPU_TSS_BIT)
698 return write_msr(priv, val);
699
700 return ret;
701}
702
703static ssize_t cpu_write(struct file *file, const char __user *ubuf,
704 size_t count, loff_t *off)
705{
706 struct seq_file *seq = file->private_data;
707 struct cpu_private *priv = seq->private;
708 char buf[19];
709
710 if ((priv == NULL) || (count >= sizeof(buf)))
711 return -EINVAL;
712
713 if (copy_from_user(&buf, ubuf, count))
714 return -EFAULT;
715
716 buf[count] = 0;
717
718 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
719 if (!write_cpu_register(priv, buf))
720 return count;
721
722 return -EACCES;
723}
724
725static const struct file_operations cpu_fops = {
726 .owner = THIS_MODULE,
727 .open = cpu_seq_open,
728 .read = seq_read,
729 .write = cpu_write,
730 .llseek = seq_lseek,
731 .release = seq_release,
732};
733
734static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
735 unsigned file, struct dentry *dentry)
736{
737 struct cpu_private *priv = NULL;
738
 739	/* Already initialized */
740 if (file == CPU_INDEX_BIT)
741 if (per_cpu(cpu_arr[type].init, cpu))
742 return 0;
743
744 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
745 if (priv == NULL)
746 return -ENOMEM;
747
748 priv->cpu = cpu;
749 priv->type = type;
750 priv->reg = reg;
751 priv->file = file;
752 mutex_lock(&cpu_debug_lock);
753 per_cpu(priv_arr[type], cpu) = priv;
754 per_cpu(cpu_priv_count, cpu)++;
755 mutex_unlock(&cpu_debug_lock);
756
757 if (file)
758 debugfs_create_file(cpu_file[file].name, S_IRUGO,
759 dentry, (void *)priv, &cpu_fops);
760 else {
761 debugfs_create_file(cpu_base[type].name, S_IRUGO,
762 per_cpu(cpu_arr[type].dentry, cpu),
763 (void *)priv, &cpu_fops);
764 mutex_lock(&cpu_debug_lock);
765 per_cpu(cpu_arr[type].init, cpu) = 1;
766 mutex_unlock(&cpu_debug_lock);
767 }
768
769 return 0;
770}
771
772static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
773 struct dentry *dentry)
774{
775 unsigned file;
776 int err = 0;
777
778 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
779 err = cpu_create_file(cpu, type, reg, file, dentry);
780 if (err)
781 return err;
782 }
783
784 return err;
785}
786
787static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{
789 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0;
792 char reg_dir[12];
793 u32 low, high;
794
795 range = get_cpu_range_count(cpu);
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag))
800 continue;
801
802 for (reg = reg_min; reg <= reg_max; reg++) {
803 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
804 continue;
805
806 sprintf(reg_dir, "0x%x", reg);
807 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
808 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
809 if (err)
810 return err;
811 }
812 }
813
814 return err;
815}
816
817static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
818{
819 struct dentry *cpu_dentry = NULL;
820 unsigned type;
821 int err = 0;
822
823 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
824 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
825 continue;
826 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
827 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
828
829 if (type < CPU_TSS_BIT)
830 err = cpu_init_msr(cpu, type, cpu_dentry);
831 else
832 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
833 cpu_dentry);
834 if (err)
835 return err;
836 }
837
838 return err;
839}
840
841static int cpu_init_cpu(void)
842{
843 struct dentry *cpu_dentry = NULL;
844 struct cpuinfo_x86 *cpui;
845 char cpu_dir[12];
846 unsigned cpu;
847 int err = 0;
848
849 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
850 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857
858 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
860 err = cpu_init_allreg(cpu, cpu_dentry);
861
862 pr_info("cpu%d(%d) debug files %d\n",
863 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
864 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
865 pr_err("Register files count %d exceeds limit %d\n",
866 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
867 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
868 err = -ENFILE;
869 }
870 if (err)
871 return err;
872 }
873
874 return err;
875}
876
877static int __init cpu_debug_init(void)
878{
879 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
880
881 return cpu_init_cpu();
882}
883
884static void __exit cpu_debug_exit(void)
885{
886 int i, cpu;
887
888 if (cpu_debugfs_dir)
889 debugfs_remove_recursive(cpu_debugfs_dir);
890
891 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
892 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
893 kfree(per_cpu(priv_arr[i], cpu));
894}
895
896module_init(cpu_debug_init);
897module_exit(cpu_debug_exit);
898
899MODULE_AUTHOR("Jaswinder Singh Rajput");
900MODULE_DESCRIPTION("CPU Debug module");
901MODULE_LICENSE("GPL");
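The new module builds a debugfs tree: a "cpu" directory under arch_debugfs_dir, one "cpuN" directory per CPU, one directory per register group, and for MSR groups a per-register "0x<msr>" subdirectory holding "index" and "value" files (cpu_write() is wired up for writes to the MSR files). A hedged userspace sketch that reads one of these files; the path assumes debugfs is mounted at /sys/kernel/debug, that arch_debugfs_dir is the "x86" directory, and that the IA32_APIC_BASE MSR (0x1b) directory exists for cpu0:

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		/* hypothetical path; adjust CPU, group and MSR as needed */
		FILE *f = fopen("/sys/kernel/debug/x86/cpu/cpu0/apic/0x1b/value", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("IA32_APIC_BASE: %s", buf);
		fclose(f);
		return 0;
	}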
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071a..593171e967ef 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
61 */ 61 */
62static unsigned char Cx86_dir0_msb __cpuinitdata = 0; 62static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
63 63
64static char Cx86_model[][9] __cpuinitdata = { 64static const char __cpuinitconst Cx86_model[][9] = {
65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", 65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
66 "M II ", "Unknown" 66 "M II ", "Unknown"
67}; 67};
68static char Cx486_name[][5] __cpuinitdata = { 68static const char __cpuinitconst Cx486_name[][5] = {
69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", 69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
70 "SRx2", "DRx2" 70 "SRx2", "DRx2"
71}; 71};
72static char Cx486S_name[][4] __cpuinitdata = { 72static const char __cpuinitconst Cx486S_name[][4] = {
73 "S", "S2", "Se", "S2e" 73 "S", "S2", "Se", "S2e"
74}; 74};
75static char Cx486D_name[][4] __cpuinitdata = { 75static const char __cpuinitconst Cx486D_name[][4] = {
76 "DX", "DX2", "?", "?", "?", "DX4" 76 "DX", "DX2", "?", "?", "?", "DX4"
77}; 77};
78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; 78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
79static char cyrix_model_mult1[] __cpuinitdata = "12??43"; 79static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
80static char cyrix_model_mult2[] __cpuinitdata = "12233445"; 80static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
81 81
82/* 82/*
83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old 83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
435 } 435 }
436} 436}
437 437
438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
439 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
440 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix, 441 .c_early_init = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
446 446
447cpu_dev_register(cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
448 448
449static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
450 .c_vendor = "NSC", 450 .c_vendor = "NSC",
451 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
452 .c_init = init_nsc, 452 .c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1a89a2b68d15..7437fa133c02 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -14,6 +14,7 @@
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17#include <asm/cpu.h>
17 18
18#ifdef CONFIG_X86_64 19#ifdef CONFIG_X86_64
19#include <asm/topology.h> 20#include <asm/topology.h>
@@ -54,6 +55,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
54 c->x86_cache_alignment = 128; 55 c->x86_cache_alignment = 128;
55#endif 56#endif
56 57
58 /* CPUID workaround for 0F33/0F34 CPU */
59 if (c->x86 == 0xF && c->x86_model == 0x3
60 && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
61 c->x86_phys_bits = 36;
62
57 /* 63 /*
58 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate 64 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
59 * with P/T states and does not stop in deep C-states. 65 * with P/T states and does not stop in deep C-states.
@@ -116,6 +122,28 @@ static void __cpuinit trap_init_f00f_bug(void)
116} 122}
117#endif 123#endif
118 124
125static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
126{
127#ifdef CONFIG_SMP
 128	/* called from identify_secondary_cpu()? */
129 if (c->cpu_index == boot_cpu_id)
130 return;
131
132 /*
133 * Mask B, Pentium, but not Pentium MMX
134 */
135 if (c->x86 == 5 &&
136 c->x86_mask >= 1 && c->x86_mask <= 4 &&
137 c->x86_model <= 3) {
138 /*
139 * Remember we have B step Pentia with bugs
140 */
 141		WARN_ONCE(1, "WARNING: SMP operation may be unreliable "
 142			"with B stepping processors.\n");
143 }
144#endif
145}
146
119static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 147static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
120{ 148{
121 unsigned long lo, hi; 149 unsigned long lo, hi;
@@ -192,6 +220,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
192#ifdef CONFIG_X86_NUMAQ 220#ifdef CONFIG_X86_NUMAQ
193 numaq_tsc_disable(); 221 numaq_tsc_disable();
194#endif 222#endif
223
224 intel_smp_check(c);
195} 225}
196#else 226#else
197static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 227static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -391,7 +421,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
391} 421}
392#endif 422#endif
393 423
394static struct cpu_dev intel_cpu_dev __cpuinitdata = { 424static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
395 .c_vendor = "Intel", 425 .c_vendor = "Intel",
396 .c_ident = { "GenuineIntel" }, 426 .c_ident = { "GenuineIntel" },
397#ifdef CONFIG_X86_32 427#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7293508d8f5c..c471eb1a389c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
32}; 32};
33 33
34/* all the cache descriptor types we care about (no TLB or trace cache entries) */ 34/* all the cache descriptor types we care about (no TLB or trace cache entries) */
35static struct _cache_table cache_table[] __cpuinitdata = 35static const struct _cache_table __cpuinitconst cache_table[] =
36{ 36{
37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ 37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ 38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -206,15 +206,15 @@ union l3_cache {
206 unsigned val; 206 unsigned val;
207}; 207};
208 208
209static unsigned short assocs[] __cpuinitdata = { 209static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 210 [1] = 1, [2] = 2, [4] = 4, [6] = 8,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 211 [8] = 16, [0xa] = 32, [0xb] = 48,
212 [0xc] = 64, 212 [0xc] = 64,
213 [0xf] = 0xffff // ?? 213 [0xf] = 0xffff // ??
214}; 214};
215 215
216static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
217static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 217static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
218 218
219static void __cpuinit 219static void __cpuinit
220amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 220amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
60 } 60 }
61} 61}
62 62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str) 63static int __init mcheck_disable(char *str)
78{ 64{
79 mce_disabled = 1; 65 mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..ca14604611ec 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -24,6 +26,9 @@
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/kmod.h> 27#include <linux/kmod.h>
26#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
27#include <asm/processor.h> 32#include <asm/processor.h>
28#include <asm/msr.h> 33#include <asm/msr.h>
29#include <asm/mce.h> 34#include <asm/mce.h>
@@ -32,7 +37,6 @@
32#include <asm/idle.h> 37#include <asm/idle.h>
33 38
34#define MISC_MCELOG_MINOR 227 39#define MISC_MCELOG_MINOR 227
35#define NR_SYSFS_BANKS 6
36 40
37atomic_t mce_entry; 41atomic_t mce_entry;
38 42
@@ -47,7 +51,7 @@ static int mce_dont_init;
47 */ 51 */
48static int tolerant = 1; 52static int tolerant = 1;
49static int banks; 53static int banks;
50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; 54static u64 *bank;
51static unsigned long notify_user; 55static unsigned long notify_user;
52static int rip_msr; 56static int rip_msr;
53static int mce_bootlog = -1; 57static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
58 62
59static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
60 64
  65/* MCA banks polled by the periodic polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
61/* 78/*
62 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
63 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
119 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
120 printk("\n"); 137 printk("\n");
121 } 138 }
122 printk(KERN_EMERG "TSC %Lx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
123 if (m->addr) 140 if (m->addr)
124 printk("ADDR %Lx ", m->addr); 141 printk("ADDR %llx ", m->addr);
125 if (m->misc) 142 if (m->misc)
126 printk("MISC %Lx ", m->misc); 143 printk("MISC %llx ", m->misc);
127 printk("\n"); 144 printk("\n");
128 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
129 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
149 panic(msg); 166 panic(msg);
150} 167}
151 168
152static int mce_available(struct cpuinfo_x86 *c) 169int mce_available(struct cpuinfo_x86 *c)
153{ 170{
171 if (mce_dont_init)
172 return 0;
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
155} 174}
156 175
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
172} 191}
173 192
174/* 193/*
175 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank.
248 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
176 */ 265 */
177void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
178{ 267{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
190 * error. 279 * error.
191 */ 280 */
192 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
193 283
194 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
195 285
196 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
197 && notify_die(DIE_NMI, "machine check", regs, error_code,
198 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
199 || !banks) 288 goto out2;
289 if (!banks)
200 goto out2; 290 goto out2;
201 291
202 memset(&m, 0, sizeof(struct mce)); 292 mce_setup(&m);
203 m.cpu = smp_processor_id(); 293
204 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 294 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
205 /* if the restart IP is not valid, we're done for */ 295 /* if the restart IP is not valid, we're done for */
206 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 296 if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
210 barrier(); 300 barrier();
211 301
212 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
213 if (i < NR_SYSFS_BANKS && !bank[i]) 303 __clear_bit(i, toclear);
304 if (!bank[i])
214 continue; 305 continue;
215 306
216 m.misc = 0; 307 m.misc = 0;
217 m.addr = 0; 308 m.addr = 0;
218 m.bank = i; 309 m.bank = i;
219 m.tsc = 0;
220 310
221 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
222 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
223 continue; 313 continue;
224 314
315 /*
 316		 * Non-uncorrected errors are handled by machine_check_poll().
317 * Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
225 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
226 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
227 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
235 no_way_out = 1; 339 no_way_out = 1;
236 kill_it = 1; 340 kill_it = 1;
237 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
238 } 348 }
239 349
240 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
243 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244 354
245 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
246 if (error_code >= 0) 356 mce_log(&m);
247 rdtscll(m.tsc);
248 if (error_code != -2)
249 mce_log(&m);
250 357
251 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
252 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
255 panicm = m; 362 panicm = m;
256 panicm_found = 1; 363 panicm_found = 1;
257 } 364 }
258
259 add_taint(TAINT_MACHINE_CHECK);
260 } 365 }
261 366
262 /* Never do anything final in the polling timer */
263 if (!regs)
264 goto out;
265
266 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
267 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
268 if (!panicm_found) 369 if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
309 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
310 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
311 412
312 out:
313 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
314 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
315 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
316 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
317 out2: 419 out2:
318 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
332 * and historically has been the register value of the 434 * and historically has been the register value of the
333 * MSR_IA32_THERMAL_STATUS (Intel) msr. 435 * MSR_IA32_THERMAL_STATUS (Intel) msr.
334 */ 436 */
335void mce_log_therm_throt_event(unsigned int cpu, __u64 status) 437void mce_log_therm_throt_event(__u64 status)
336{ 438{
337 struct mce m; 439 struct mce m;
338 440
339 memset(&m, 0, sizeof(m)); 441 mce_setup(&m);
340 m.cpu = cpu;
341 m.bank = MCE_THERMAL_BANK; 442 m.bank = MCE_THERMAL_BANK;
342 m.status = status; 443 m.status = status;
343 rdtscll(m.tsc);
344 mce_log(&m); 444 mce_log(&m);
345} 445}
346#endif /* CONFIG_X86_MCE_INTEL */ 446#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
353 453
354static int check_interval = 5 * 60; /* 5 minutes */ 454static int check_interval = 5 * 60; /* 5 minutes */
355static int next_interval; /* in jiffies */ 455static int next_interval; /* in jiffies */
356static void mcheck_timer(struct work_struct *work); 456static void mcheck_timer(unsigned long);
357static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 457static DEFINE_PER_CPU(struct timer_list, mce_timer);
358 458
359static void mcheck_check_cpu(void *info) 459static void mcheck_timer(unsigned long data)
360{ 460{
361 if (mce_available(&current_cpu_data)) 461 struct timer_list *t = &per_cpu(mce_timer, data);
362 do_machine_check(NULL, 0);
363}
364 462
365static void mcheck_timer(struct work_struct *work) 463 WARN_ON(smp_processor_id() != data);
366{ 464
367 on_each_cpu(mcheck_check_cpu, NULL, 1); 465 if (mce_available(&current_cpu_data))
466 machine_check_poll(MCP_TIMESTAMP,
467 &__get_cpu_var(mce_poll_banks));
368 468
369 /* 469 /*
370 * Alert userspace if needed. If we logged an MCE, reduce the 470 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
377 (int)round_jiffies_relative(check_interval*HZ)); 477 (int)round_jiffies_relative(check_interval*HZ));
378 } 478 }
379 479
380 schedule_delayed_work(&mcheck_work, next_interval); 480 t->expires = jiffies + next_interval;
481 add_timer(t);
482}
483
484static void mce_do_trigger(struct work_struct *work)
485{
486 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
381} 487}
382 488
489static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
490
383/* 491/*
384 * This is only called from process context. This is where we do 492 * Notify the user(s) about new machine check events.
385 * anything we need to alert userspace about new MCEs. This is called 493 * Can be called from interrupt context, but not from machine check/NMI
386 * directly from the poller and also from entry.S and idle, thanks to 494 * context.
387 * TIF_MCE_NOTIFY.
388 */ 495 */
389int mce_notify_user(void) 496int mce_notify_user(void)
390{ 497{
498 /* Not more than two messages every minute */
499 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
500
391 clear_thread_flag(TIF_MCE_NOTIFY); 501 clear_thread_flag(TIF_MCE_NOTIFY);
392 if (test_and_clear_bit(0, &notify_user)) { 502 if (test_and_clear_bit(0, &notify_user)) {
393 static unsigned long last_print;
394 unsigned long now = jiffies;
395
396 wake_up_interruptible(&mce_wait); 503 wake_up_interruptible(&mce_wait);
397 if (trigger[0])
398 call_usermodehelper(trigger, trigger_argv, NULL,
399 UMH_NO_WAIT);
400 504
401 if (time_after_eq(now, last_print + (check_interval*HZ))) { 505 /*
402 last_print = now; 506 * There is no risk of missing notifications because
507 * work_pending is always cleared before the function is
508 * executed.
509 */
510 if (trigger[0] && !work_pending(&mce_trigger_work))
511 schedule_work(&mce_trigger_work);
512
513 if (__ratelimit(&ratelimit))
403 printk(KERN_INFO "Machine check events logged\n"); 514 printk(KERN_INFO "Machine check events logged\n");
404 }
405 515
406 return 1; 516 return 1;
407 } 517 }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
425 535
426static __init int periodic_mcheck_init(void) 536static __init int periodic_mcheck_init(void)
427{ 537{
428 next_interval = check_interval * HZ; 538 idle_notifier_register(&mce_idle_notifier);
429 if (next_interval) 539 return 0;
430 schedule_delayed_work(&mcheck_work,
431 round_jiffies_relative(next_interval));
432 idle_notifier_register(&mce_idle_notifier);
433 return 0;
434} 540}
435__initcall(periodic_mcheck_init); 541__initcall(periodic_mcheck_init);
436 542
437
438/* 543/*
439 * Initialize Machine Checks for a CPU. 544 * Initialize Machine Checks for a CPU.
440 */ 545 */
441static void mce_init(void *dummy) 546static int mce_cap_init(void)
442{ 547{
443 u64 cap; 548 u64 cap;
444 int i; 549 unsigned b;
445 550
446 rdmsrl(MSR_IA32_MCG_CAP, cap); 551 rdmsrl(MSR_IA32_MCG_CAP, cap);
447 banks = cap & 0xff; 552 b = cap & 0xff;
448 if (banks > MCE_EXTENDED_BANK) { 553 if (b > MAX_NR_BANKS) {
449 banks = MCE_EXTENDED_BANK; 554 printk(KERN_WARNING
450 printk(KERN_INFO "MCE: warning: using only %d banks\n", 555 "MCE: Using only %u machine check banks out of %u\n",
451 MCE_EXTENDED_BANK); 556 MAX_NR_BANKS, b);
557 b = MAX_NR_BANKS;
452 } 558 }
559
560 /* Don't support asymmetric configurations today */
561 WARN_ON(banks != 0 && b != banks);
562 banks = b;
563 if (!bank) {
564 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
565 if (!bank)
566 return -ENOMEM;
567 memset(bank, 0xff, banks * sizeof(u64));
568 }
569
453 /* Use accurate RIP reporting if available. */ 570 /* Use accurate RIP reporting if available. */
454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 571 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
455 rip_msr = MSR_IA32_MCG_EIP; 572 rip_msr = MSR_IA32_MCG_EIP;
456 573
457 /* Log the machine checks left over from the previous reset. 574 return 0;
458 This also clears all registers */ 575}
459 do_machine_check(NULL, mce_bootlog ? -1 : -2); 576
577static void mce_init(void *dummy)
578{
579 u64 cap;
580 int i;
581 mce_banks_t all_banks;
582
583 /*
584 * Log the machine checks left over from the previous reset.
585 */
586 bitmap_fill(all_banks, MAX_NR_BANKS);
587 machine_check_poll(MCP_UC, &all_banks);
460 588
461 set_in_cr4(X86_CR4_MCE); 589 set_in_cr4(X86_CR4_MCE);
462 590
591 rdmsrl(MSR_IA32_MCG_CAP, cap);
463 if (cap & MCG_CTL_P) 592 if (cap & MCG_CTL_P)
464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 593 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
465 594
466 for (i = 0; i < banks; i++) { 595 for (i = 0; i < banks; i++) {
467 if (i < NR_SYSFS_BANKS) 596 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 597 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
473 } 598 }
474} 599}
475 600
476/* Add per CPU specific workarounds here */ 601/* Add per CPU specific workarounds here */
477static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 602static void mce_cpu_quirks(struct cpuinfo_x86 *c)
478{ 603{
479 /* This should be disabled by the BIOS, but isn't always */ 604 /* This should be disabled by the BIOS, but isn't always */
480 if (c->x86_vendor == X86_VENDOR_AMD) { 605 if (c->x86_vendor == X86_VENDOR_AMD) {
481 if(c->x86 == 15) 606 if (c->x86 == 15 && banks > 4)
482 /* disable GART TBL walk error reporting, which trips off 607 /* disable GART TBL walk error reporting, which trips off
483 incorrectly with the IOMMU & 3ware & Cerberus. */ 608 incorrectly with the IOMMU & 3ware & Cerberus. */
484 clear_bit(10, &bank[4]); 609 clear_bit(10, (unsigned long *)&bank[4]);
485 if(c->x86 <= 17 && mce_bootlog < 0) 610 if(c->x86 <= 17 && mce_bootlog < 0)
486 /* Lots of broken BIOS around that don't clear them 611 /* Lots of broken BIOS around that don't clear them
487 by default and leave crap in there. Don't log. */ 612 by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
504 } 629 }
505} 630}
506 631
632static void mce_init_timer(void)
633{
634 struct timer_list *t = &__get_cpu_var(mce_timer);
635
636 /* data race harmless because everyone sets to the same value */
637 if (!next_interval)
638 next_interval = check_interval * HZ;
639 if (!next_interval)
640 return;
641 setup_timer(t, mcheck_timer, smp_processor_id());
642 t->expires = round_jiffies(jiffies + next_interval);
643 add_timer(t);
644}
645
507/* 646/*
508 * Called for each booted CPU to set up machine checks. 647 * Called for each booted CPU to set up machine checks.
509 * Must be called with preempt off. 648 * Must be called with preempt off.
510 */ 649 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 650void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 651{
513 mce_cpu_quirks(c); 652 if (!mce_available(c))
653 return;
514 654
515 if (mce_dont_init || 655 if (mce_cap_init() < 0) {
516 !mce_available(c)) 656 mce_dont_init = 1;
517 return; 657 return;
658 }
659 mce_cpu_quirks(c);
518 660
519 mce_init(NULL); 661 mce_init(NULL);
520 mce_cpu_features(c); 662 mce_cpu_features(c);
663 mce_init_timer();
521} 664}
522 665
523/* 666/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573{ 716{
574 unsigned long *cpu_tsc; 717 unsigned long *cpu_tsc;
575 static DEFINE_MUTEX(mce_read_mutex); 718 static DEFINE_MUTEX(mce_read_mutex);
576 unsigned next; 719 unsigned prev, next;
577 char __user *buf = ubuf; 720 char __user *buf = ubuf;
578 int i, err; 721 int i, err;
579 722
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
592 } 735 }
593 736
594 err = 0; 737 err = 0;
595 for (i = 0; i < next; i++) { 738 prev = 0;
596 unsigned long start = jiffies; 739 do {
597 740 for (i = prev; i < next; i++) {
598 while (!mcelog.entry[i].finished) { 741 unsigned long start = jiffies;
599 if (time_after_eq(jiffies, start + 2)) { 742
600 memset(mcelog.entry + i,0, sizeof(struct mce)); 743 while (!mcelog.entry[i].finished) {
601 goto timeout; 744 if (time_after_eq(jiffies, start + 2)) {
745 memset(mcelog.entry + i, 0,
746 sizeof(struct mce));
747 goto timeout;
748 }
749 cpu_relax();
602 } 750 }
603 cpu_relax(); 751 smp_rmb();
752 err |= copy_to_user(buf, mcelog.entry + i,
753 sizeof(struct mce));
754 buf += sizeof(struct mce);
755timeout:
756 ;
604 } 757 }
605 smp_rmb();
606 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
607 buf += sizeof(struct mce);
608 timeout:
609 ;
610 }
611 758
612 memset(mcelog.entry, 0, next * sizeof(struct mce)); 759 memset(mcelog.entry + prev, 0,
613 mcelog.next = 0; 760 (next - prev) * sizeof(struct mce));
761 prev = next;
762 next = cmpxchg(&mcelog.next, prev, 0);
763 } while (next != prev);
614 764
615 synchronize_sched(); 765 synchronize_sched();
616 766
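[Editorial note] The rewritten read loop above is the consumer half of a lock-free log: writers reserve a slot by bumping mcelog.next and set entry[i].finished once the record is complete, while the reader drains [prev, next) and then tries to swing mcelog.next back to 0 with cmpxchg(); if new records arrived in the meantime the exchange fails and the loop goes around again for the new tail. The standalone C11 program below is only an analog of that drain loop, not kernel code; the real mce_read() additionally bounds the finished-flag wait by two jiffies and copies each record to user space.

#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

#define LOG_LEN 32

struct record {
	_Atomic int	finished;	/* set by the producer last */
	long		payload;
};

static struct record ring[LOG_LEN];
static atomic_uint next_idx;		/* analogous to mcelog.next */

/* Producer side: reserve a slot, fill it, then mark it finished. */
static void produce(long payload)
{
	unsigned i = atomic_fetch_add(&next_idx, 1);

	ring[i].payload = payload;
	atomic_store(&ring[i].finished, 1);
}

/* Consumer side: same shape as the do/while drain loop in mce_read(). */
static void drain(void)
{
	unsigned prev = 0, next = atomic_load(&next_idx);

	do {
		unsigned i, expected;

		for (i = prev; i < next; i++) {
			while (!atomic_load(&ring[i].finished))
				;	/* the kernel version times out here */
			printf("entry %u: %ld\n", i, ring[i].payload);
		}
		memset(&ring[prev], 0, (next - prev) * sizeof(ring[0]));
		prev = next;

		/*
		 * Reset the index to 0 only if nothing was appended while
		 * we were draining; otherwise pick up the new tail.
		 */
		expected = prev;
		if (atomic_compare_exchange_strong(&next_idx, &expected, 0))
			break;
		next = expected;
	} while (next != prev);
}

int main(void)
{
	produce(41);
	produce(42);
	drain();
	return 0;
}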
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
680 &mce_chrdev_ops, 830 &mce_chrdev_ops,
681}; 831};
682 832
683static unsigned long old_cr4 __initdata;
684
685void __init stop_mce(void)
686{
687 old_cr4 = read_cr4();
688 clear_in_cr4(X86_CR4_MCE);
689}
690
691void __init restart_mce(void)
692{
693 if (old_cr4 & X86_CR4_MCE)
694 set_in_cr4(X86_CR4_MCE);
695}
696
697/* 833/*
698 * Old style boot options parsing. Only for compatibility. 834 * Old style boot options parsing. Only for compatibility.
699 */ 835 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
703 return 1; 839 return 1;
704} 840}
705 841
706/* mce=off disables machine check. Note you can re-enable it later 842/* mce=off disables machine check.
707 using sysfs.
708 mce=TOLERANCELEVEL (number, see above) 843 mce=TOLERANCELEVEL (number, see above)
709 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 844 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
710 mce=nobootlog Don't log MCEs from before booting. */ 845 mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
728 * Sysfs support 863 * Sysfs support
729 */ 864 */
730 865
866/*
867 * Disable machine checks on suspend and shutdown. We can't really handle
868 * them later.
869 */
870static int mce_disable(void)
871{
872 int i;
873
874 for (i = 0; i < banks; i++)
875 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
876 return 0;
877}
878
879static int mce_suspend(struct sys_device *dev, pm_message_t state)
880{
881 return mce_disable();
882}
883
884static int mce_shutdown(struct sys_device *dev)
885{
886 return mce_disable();
887}
888
731/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 889/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
732 Only one CPU is active at this time, the others get readded later using 890 Only one CPU is active at this time, the others get readded later using
733 CPU hotplug. */ 891 CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
738 return 0; 896 return 0;
739} 897}
740 898
899static void mce_cpu_restart(void *data)
900{
901 del_timer_sync(&__get_cpu_var(mce_timer));
902 if (mce_available(&current_cpu_data))
903 mce_init(NULL);
904 mce_init_timer();
905}
906
741/* Reinit MCEs after user configuration changes */ 907/* Reinit MCEs after user configuration changes */
742static void mce_restart(void) 908static void mce_restart(void)
743{ 909{
744 if (next_interval)
745 cancel_delayed_work(&mcheck_work);
746 /* Timer race is harmless here */
747 on_each_cpu(mce_init, NULL, 1);
748 next_interval = check_interval * HZ; 910 next_interval = check_interval * HZ;
749 if (next_interval) 911 on_each_cpu(mce_cpu_restart, NULL, 1);
750 schedule_delayed_work(&mcheck_work,
751 round_jiffies_relative(next_interval));
752} 912}
753 913
754static struct sysdev_class mce_sysclass = { 914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
755 .resume = mce_resume, 917 .resume = mce_resume,
756 .name = "machinecheck", 918 .name = "machinecheck",
757}; 919};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
778 } \ 940 } \
779 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
780 942
781/* 943static struct sysdev_attribute *bank_attrs;
782 * TBD should generate these dynamically based on number of available banks. 944
783 * Have only 6 contol banks in /sysfs until then. 945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
784 */ 946 char *buf)
785ACCESSOR(bank0ctl,bank[0],mce_restart()) 947{
786ACCESSOR(bank1ctl,bank[1],mce_restart()) 948 u64 b = bank[attr - bank_attrs];
787ACCESSOR(bank2ctl,bank[2],mce_restart()) 949 return sprintf(buf, "%llx\n", b);
788ACCESSOR(bank3ctl,bank[3],mce_restart()) 950}
789ACCESSOR(bank4ctl,bank[4],mce_restart()) 951
790ACCESSOR(bank5ctl,bank[5],mce_restart()) 952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
791 963
792static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
793 char *buf) 965 char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
814static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
815ACCESSOR(check_interval,check_interval,mce_restart()) 987ACCESSOR(check_interval,check_interval,mce_restart())
816static struct sysdev_attribute *mce_attributes[] = { 988static struct sysdev_attribute *mce_attributes[] = {
817 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
818 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
819 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
820 NULL 990 NULL
821}; 991};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
845 if (err) 1015 if (err)
846 goto error; 1016 goto error;
847 } 1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
848 cpu_set(cpu, mce_device_initialized); 1024 cpu_set(cpu, mce_device_initialized);
849 1025
850 return 0; 1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
851error: 1032error:
852 while (i--) { 1033 while (--i >= 0) {
853 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
854 mce_attributes[i]); 1035 mce_attributes[i]);
855 } 1036 }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
868 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
869 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
870 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
871 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
872 cpu_clear(cpu, mce_device_initialized); 1056 cpu_clear(cpu, mce_device_initialized);
873} 1057}
874 1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
875/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
876static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, 1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
878{ 1089{
879 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
880 1092
881 switch (action) { 1093 switch (action) {
882 case CPU_ONLINE: 1094 case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
891 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
892 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
893 break; 1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies + next_interval);
1114 add_timer_on(t, cpu);
1115 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116 break;
1117 case CPU_POST_DEAD:
1118 /* intentionally ignoring frozen here */
1119 cmci_rediscover(cpu);
1120 break;
894 } 1121 }
895 return NOTIFY_OK; 1122 return NOTIFY_OK;
896} 1123}
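[Editorial note] The new hotplug cases above pass &action through to the IPI helpers so they can tell a real offline from a suspend transition: during suspend the notifier runs with CPU_DOWN_PREPARE_FROZEN, CPU_TASKS_FROZEN is set, and mce_disable_cpu() therefore skips cmci_clear() while still zeroing the per-bank MCi_CTL registers; on a genuine offline the CMCI ownership is dropped, and CPU_POST_DEAD then calls cmci_rediscover() so a surviving CPU can claim any shared banks. CPU_DOWN_FAILED undoes the preparation by re-arming the timer and re-enabling the banks.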
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
899 .notifier_call = mce_cpu_callback, 1126 .notifier_call = mce_cpu_callback,
900}; 1127};
901 1128
1129static __init int mce_init_banks(void)
1130{
1131 int i;
1132
1133 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134 GFP_KERNEL);
1135 if (!bank_attrs)
1136 return -ENOMEM;
1137
1138 for (i = 0; i < banks; i++) {
1139 struct sysdev_attribute *a = &bank_attrs[i];
1140 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141 if (!a->attr.name)
1142 goto nomem;
1143 a->attr.mode = 0644;
1144 a->show = show_bank;
1145 a->store = set_bank;
1146 }
1147 return 0;
1148
1149nomem:
1150 while (--i >= 0)
1151 kfree(bank_attrs[i].attr.name);
1152 kfree(bank_attrs);
1153 bank_attrs = NULL;
1154 return -ENOMEM;
1155}
1156
902static __init int mce_init_device(void) 1157static __init int mce_init_device(void)
903{ 1158{
904 int err; 1159 int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
906 1161
907 if (!mce_available(&boot_cpu_data)) 1162 if (!mce_available(&boot_cpu_data))
908 return -EIO; 1163 return -EIO;
1164
1165 err = mce_init_banks();
1166 if (err)
1167 return err;
1168
909 err = sysdev_class_register(&mce_sysclass); 1169 err = sysdev_class_register(&mce_sysclass);
910 if (err) 1170 if (err)
911 return err; 1171 return err;
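[Editorial note] With the fixed bank0ctl..bank5ctl accessors gone, mce_init_banks() allocates one sysdev_attribute per detected bank, and a single show/store pair serves all of them: show_bank() and set_bank() recover the bank index by pointer arithmetic (attr - bank_attrs), and set_bank() then calls mce_restart() to apply the new control word. The plain-C toy below is only an illustration of that indexing trick in isolation, not kernel code.

#include <stdio.h>

#define NBANKS 6

struct attr { const char *name; };

static struct attr bank_attrs[NBANKS];		/* one attribute per bank */
static unsigned long long bank[NBANKS];		/* the controls they expose */

/* One callback for every "bankN" file: its position in the array is N. */
static void show_bank_demo(const struct attr *a, char *buf, size_t len)
{
	int idx = a - bank_attrs;

	snprintf(buf, len, "%llx\n", bank[idx]);
}

int main(void)
{
	char line[32];

	bank[3] = ~0ULL;
	bank_attrs[3].name = "bank3";
	show_bank_demo(&bank_attrs[3], line, sizeof(line));
	printf("%s -> %s", bank_attrs[3].name, line);
	return 0;
}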
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd469..7d01be868870 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
79 79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81 81
82static void amd_threshold_interrupt(void);
83
82/* 84/*
83 * CPU Initialization 85 * CPU Initialization
84 */ 86 */
@@ -90,7 +92,8 @@ struct thresh_restart {
90}; 92};
91 93
92/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
93static long threshold_restart_bank(void *_tr) 95/* Called via smp_call_function_single() */
96static void threshold_restart_bank(void *_tr)
94{ 97{
95 struct thresh_restart *tr = _tr; 98 struct thresh_restart *tr = _tr;
96 u32 mci_misc_hi, mci_misc_lo; 99 u32 mci_misc_hi, mci_misc_lo;
@@ -117,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
117 120
118 mci_misc_hi |= MASK_COUNT_EN_HI; 121 mci_misc_hi |= MASK_COUNT_EN_HI;
119 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 122 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
120 return 0;
121} 123}
122 124
123/* cpu init entry point, called from mce.c with preempt off */ 125/* cpu init entry point, called from mce.c with preempt off */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
174 tr.reset = 0; 176 tr.reset = 0;
175 tr.old_limit = 0; 177 tr.old_limit = 0;
176 threshold_restart_bank(&tr); 178 threshold_restart_bank(&tr);
179
180 mce_threshold_vector = amd_threshold_interrupt;
177 } 181 }
178 } 182 }
179} 183}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
187 * the interrupt goes off when error_count reaches threshold_limit. 191 * the interrupt goes off when error_count reaches threshold_limit.
188 * the handler will simply log mcelog w/ software defined bank number. 192 * the handler will simply log mcelog w/ software defined bank number.
189 */ 193 */
190asmlinkage void mce_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
191{ 195{
192 unsigned int bank, block; 196 unsigned int bank, block;
193 struct mce m; 197 struct mce m;
194 u32 low = 0, high = 0, address = 0; 198 u32 low = 0, high = 0, address = 0;
195 199
196 ack_APIC_irq(); 200 mce_setup(&m);
197 exit_idle();
198 irq_enter();
199
200 memset(&m, 0, sizeof(m));
201 rdtscll(m.tsc);
202 m.cpu = smp_processor_id();
203 201
204 /* assume first bank caused it */ 202 /* assume first bank caused it */
205 for (bank = 0; bank < NR_BANKS; ++bank) { 203 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
233 231
234 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
235 event. */ 233 event. */
236 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks));
237 236
238 if (high & MASK_OVERFLOW_HI) { 237 if (high & MASK_OVERFLOW_HI) {
239 rdmsrl(address, m.misc); 238 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
243 + bank * NR_BLOCKS 242 + bank * NR_BLOCKS
244 + block; 243 + block;
245 mce_log(&m); 244 mce_log(&m);
246 goto out; 245 return;
247 } 246 }
248 } 247 }
249 } 248 }
250out:
251 inc_irq_stat(irq_threshold_count);
252 irq_exit();
253} 249}
254 250
255/* 251/*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
283 tr.b = b; 279 tr.b = b;
284 tr.reset = 0; 280 tr.reset = 0;
285 tr.old_limit = 0; 281 tr.old_limit = 0;
286 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
287 283
288 return end - buf; 284 return end - buf;
289} 285}
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
305 tr.b = b; 301 tr.b = b;
306 tr.reset = 0; 302 tr.reset = 0;
307 303
308 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
309 305
310 return end - buf; 306 return end - buf;
311} 307}
312 308
313static long local_error_count(void *_b) 309struct threshold_block_cross_cpu {
310 struct threshold_block *tb;
311 long retval;
312};
313
314static void local_error_count_handler(void *_tbcc)
314{ 315{
315 struct threshold_block *b = _b; 316 struct threshold_block_cross_cpu *tbcc = _tbcc;
317 struct threshold_block *b = tbcc->tb;
316 u32 low, high; 318 u32 low, high;
317 319
318 rdmsr(b->address, low, high); 320 rdmsr(b->address, low, high);
319 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); 321 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
320} 322}
321 323
322static ssize_t show_error_count(struct threshold_block *b, char *buf) 324static ssize_t show_error_count(struct threshold_block *b, char *buf)
323{ 325{
324 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); 326 struct threshold_block_cross_cpu tbcc = { .tb = b, };
327
328 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
329 return sprintf(buf, "%lx\n", tbcc.retval);
325} 330}
326 331
327static ssize_t store_error_count(struct threshold_block *b, 332static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
329{ 334{
330 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; 335 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
331 336
332 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 337 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
333 return 1; 338 return 1;
334} 339}
335 340
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
398 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
399 return 0; 404 return 0;
400 405
401 if (rdmsr_safe(address, &low, &high)) 406 if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
402 return 0; 407 return 0;
403 408
404 if (!(high & MASK_VALID_HI)) { 409 if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
462 return err; 467 return err;
463} 468}
464 469
465static __cpuinit long local_allocate_threshold_blocks(void *_bank) 470static __cpuinit long
471local_allocate_threshold_blocks(int cpu, unsigned int bank)
466{ 472{
467 unsigned int *bank = _bank; 473 return allocate_threshold_blocks(cpu, bank, 0,
468 474 MSR_IA32_MC0_MISC + bank * 4);
469 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
470 MSR_IA32_MC0_MISC + *bank * 4);
471} 475}
472 476
473/* symlinks sibling shared banks to first core. first core owns dir/files. */ 477/* symlinks sibling shared banks to first core. first core owns dir/files. */
@@ -530,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
530 534
531 per_cpu(threshold_banks, cpu)[bank] = b; 535 per_cpu(threshold_banks, cpu)[bank] = b;
532 536
533 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); 537 err = local_allocate_threshold_blocks(cpu, bank);
534 if (err) 538 if (err)
535 goto out_free; 539 goto out_free;
536 540
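[Editorial note] The conversion above from work_on_cpu() to smp_call_function_single() changes the calling convention: the remote helper now returns void, so show_error_count() has to carry its result back through struct threshold_block_cross_cpu. The fragment below is a hedged, kernel-context sketch of that general pattern; the names example_result and read_count_on_cpu are made up for illustration and are not part of the patch.

/*
 * Illustrative sketch only: reading a value on another CPU when the IPI
 * callback cannot return one.  Mirrors the threshold_block_cross_cpu idea.
 */
struct example_result {
	u32 low, high;
};

static void read_count_on_cpu(void *_r)
{
	struct example_result *r = _r;

	/* Runs on the target CPU with interrupts disabled. */
	rdmsr(MSR_IA32_MC0_MISC, r->low, r->high);
}

static u32 example_read_remote(int cpu)
{
	struct example_result r;

	/* wait=1 keeps &r valid until the remote CPU has finished. */
	smp_call_function_single(cpu, read_count_on_cpu, &r, 1);
	return r.high & 0xFFF;
}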
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e0..57df3d383470 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
1/* 1/*
2 * Intel specific MCE features. 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> 3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
4 */ 6 */
5 7
6#include <linux/init.h> 8#include <linux/init.h>
@@ -13,6 +15,7 @@
13#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
16 19
17asmlinkage void smp_thermal_interrupt(void) 20asmlinkage void smp_thermal_interrupt(void)
18{ 21{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
25 28
26 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
27 if (therm_throt_process(msr_val & 1)) 30 if (therm_throt_process(msr_val & 1))
28 mce_log_therm_throt_event(smp_processor_id(), msr_val); 31 mce_log_therm_throt_event(msr_val);
29 32
30 inc_irq_stat(irq_thermal_count); 33 inc_irq_stat(irq_thermal_count);
31 irq_exit(); 34 irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
85 return; 88 return;
86} 89}
87 90
91/*
92 * Support for Intel Correct Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened.
94 * Normally we pick those up using a regular polling timer.
95 * Also supports reliable discovery of shared banks.
96 */
97
98static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
99
100/*
101 * cmci_discover_lock protects against parallel discovery attempts
102 * which could race against each other.
103 */
104static DEFINE_SPINLOCK(cmci_discover_lock);
105
106#define CMCI_THRESHOLD 1
107
108static int cmci_supported(int *banks)
109{
110 u64 cap;
111
112 /*
113 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this
115 * makes sure none of the backdoors are entered otherwise.
116 */
117 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
118 return 0;
119 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
120 return 0;
121 rdmsrl(MSR_IA32_MCG_CAP, cap);
122 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
123 return !!(cap & MCG_CMCI_P);
124}
125
126/*
127 * The interrupt handler. This is called on every event.
128 * Just call the poller directly to log any events.
129 * This could in theory increase the threshold under high load,
130 * but doesn't for now.
131 */
132static void intel_threshold_interrupt(void)
133{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user();
136}
137
138static void print_update(char *type, int *hdr, int num)
139{
140 if (*hdr == 0)
141 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
142 *hdr = 1;
143 printk(KERN_CONT " %s:%d", type, num);
144}
145
146/*
147 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
148 * on this CPU. Use the algorithm recommended in the SDM to discover shared
149 * banks.
150 */
151static void cmci_discover(int banks, int boot)
152{
153 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
154 int hdr = 0;
155 int i;
156
157 spin_lock(&cmci_discover_lock);
158 for (i = 0; i < banks; i++) {
159 u64 val;
160
161 if (test_bit(i, owned))
162 continue;
163
164 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
165
166 /* Already owned by someone else? */
167 if (val & CMCI_EN) {
168 if (test_and_clear_bit(i, owned) || boot)
169 print_update("SHD", &hdr, i);
170 __clear_bit(i, __get_cpu_var(mce_poll_banks));
171 continue;
172 }
173
174 val |= CMCI_EN | CMCI_THRESHOLD;
175 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
176 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
177
178 /* Did the enable bit stick? -- the bank supports CMCI */
179 if (val & CMCI_EN) {
180 if (!test_and_set_bit(i, owned) || boot)
181 print_update("CMCI", &hdr, i);
182 __clear_bit(i, __get_cpu_var(mce_poll_banks));
183 } else {
184 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
185 }
186 }
187 spin_unlock(&cmci_discover_lock);
188 if (hdr)
189 printk(KERN_CONT "\n");
190}
191
192/*
193 * Just in case we missed an event during initialization check
194 * all the CMCI owned banks.
195 */
196void cmci_recheck(void)
197{
198 unsigned long flags;
199 int banks;
200
201 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
202 return;
203 local_irq_save(flags);
204 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
205 local_irq_restore(flags);
206}
207
208/*
209 * Disable CMCI on this CPU for all banks it owns when it goes down.
210 * This allows other CPUs to claim the banks on rediscovery.
211 */
212void cmci_clear(void)
213{
214 int i;
215 int banks;
216 u64 val;
217
218 if (!cmci_supported(&banks))
219 return;
220 spin_lock(&cmci_discover_lock);
221 for (i = 0; i < banks; i++) {
222 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
223 continue;
224 /* Disable CMCI */
225 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
226 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
227 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
228 __clear_bit(i, __get_cpu_var(mce_banks_owned));
229 }
230 spin_unlock(&cmci_discover_lock);
231}
232
233/*
234 * After a CPU went down cycle through all the others and rediscover
235 * Must run in process context.
236 */
237void cmci_rediscover(int dying)
238{
239 int banks;
240 int cpu;
241 cpumask_var_t old;
242
243 if (!cmci_supported(&banks))
244 return;
245 if (!alloc_cpumask_var(&old, GFP_KERNEL))
246 return;
247 cpumask_copy(old, &current->cpus_allowed);
248
249 for_each_online_cpu (cpu) {
250 if (cpu == dying)
251 continue;
252 if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
253 continue;
254 /* Recheck banks in case CPUs don't all have the same */
255 if (cmci_supported(&banks))
256 cmci_discover(banks, 0);
257 }
258
259 set_cpus_allowed_ptr(current, old);
260 free_cpumask_var(old);
261}
262
263/*
264 * Reenable CMCI on this CPU in case a CPU down failed.
265 */
266void cmci_reenable(void)
267{
268 int banks;
269 if (cmci_supported(&banks))
270 cmci_discover(banks, 0);
271}
272
273static void intel_init_cmci(void)
274{
275 int banks;
276
277 if (!cmci_supported(&banks))
278 return;
279
280 mce_threshold_vector = intel_threshold_interrupt;
281 cmci_discover(banks, 1);
282 /*
283 * For CPU #0 this runs with still disabled APIC, but that's
284 * ok because only the vector is set up. We still do another
285 * check for the banks later for CPU #0 just to make sure
286 * to not miss any events.
287 */
288 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
289 cmci_recheck();
290}
291
88void mce_intel_feature_init(struct cpuinfo_x86 *c) 292void mce_intel_feature_init(struct cpuinfo_x86 *c)
89{ 293{
90 intel_init_thermal(c); 294 intel_init_thermal(c);
295 intel_init_cmci();
91} 296}
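[Editorial note] cmci_discover() above decides bank ownership by whether the CMCI enable bit "sticks": on a bank shared between cores the MSR is shared too, so a sibling that finds CMCI_EN already set treats the bank as owned elsewhere and drops it from its own polling set, while a bank where the written bit does not survive the read-back simply stays with the poll timer. The fragment below is a condensed, kernel-context restatement of that per-bank probe for illustration, not additional code from the commit.

/* Returns 1 if this CPU now owns bank i via CMCI, 0 if it stays polled. */
static int cmci_probe_bank(int i)
{
	u64 val;

	rdmsrl(MSR_IA32_MC0_CTL2 + i, val);

	/* A sibling already enabled CMCI here: shared bank, not ours. */
	if (val & CMCI_EN)
		return 0;

	wrmsrl(MSR_IA32_MC0_CTL2 + i, val | CMCI_EN | CMCI_THRESHOLD);
	rdmsrl(MSR_IA32_MC0_CTL2 + i, val);

	/*
	 * If the enable bit stuck, the bank supports CMCI and is now ours;
	 * otherwise the hardware cleared it and the poll timer keeps it.
	 */
	return !!(val & CMCI_EN);
}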
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000000..23ee9e730f78
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
1/*
2 * Common corrected MCE threshold handler code:
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6
7#include <asm/irq_vectors.h>
8#include <asm/apic.h>
9#include <asm/idle.h>
10#include <asm/mce.h>
11
12static void default_threshold_interrupt(void)
13{
14 printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
15 THRESHOLD_APIC_VECTOR);
16}
17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19
20asmlinkage void mce_threshold_interrupt(void)
21{
22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector();
26 irq_exit();
27 /* Ack only at the end to avoid potential reentry */
28 ack_APIC_irq();
29}
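[Editorial note] This new file gives THRESHOLD_APIC_VECTOR a single vendor-neutral entry point: the APIC ack, exit_idle()/irq_enter()/irq_exit() and the irq_threshold_count statistic live here, and the vendor-specific work is reached through the mce_threshold_vector pointer. Both callers in this series hook in the same way, mce_amd_feature_init() setting mce_threshold_vector = amd_threshold_interrupt and intel_init_cmci() setting it to intel_threshold_interrupt, so a CPU with neither feature just prints the "Unexpected threshold interrupt" message.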
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc0533649..f4361b56f8e9 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 000000000000..ce0fe4b5c04f
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
1/* MTRR (Memory Type Range Register) cleanup
2
3 Copyright (C) 2009 Yinghai Lu
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; either
8 version 2 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18*/
19
20#include <linux/module.h>
21#include <linux/init.h>
22#include <linux/pci.h>
23#include <linux/smp.h>
24#include <linux/cpu.h>
25#include <linux/mutex.h>
26#include <linux/sort.h>
27
28#include <asm/e820.h>
29#include <asm/mtrr.h>
30#include <asm/uaccess.h>
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/kvm_para.h>
34#include "mtrr.h"
35
36/* should be related to MTRR_VAR_RANGES nums */
37#define RANGE_NUM 256
38
39struct res_range {
40 unsigned long start;
41 unsigned long end;
42};
43
44static int __init
45add_range(struct res_range *range, int nr_range, unsigned long start,
46 unsigned long end)
47{
48 /* out of slots */
49 if (nr_range >= RANGE_NUM)
50 return nr_range;
51
52 range[nr_range].start = start;
53 range[nr_range].end = end;
54
55 nr_range++;
56
57 return nr_range;
58}
59
60static int __init
61add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
62 unsigned long end)
63{
64 int i;
65
66 /* try to merge it with old one */
67 for (i = 0; i < nr_range; i++) {
68 unsigned long final_start, final_end;
69 unsigned long common_start, common_end;
70
71 if (!range[i].end)
72 continue;
73
74 common_start = max(range[i].start, start);
75 common_end = min(range[i].end, end);
76 if (common_start > common_end + 1)
77 continue;
78
79 final_start = min(range[i].start, start);
80 final_end = max(range[i].end, end);
81
82 range[i].start = final_start;
83 range[i].end = final_end;
84 return nr_range;
85 }
86
87 /* need to add that */
88 return add_range(range, nr_range, start, end);
89}
90
91static void __init
92subtract_range(struct res_range *range, unsigned long start, unsigned long end)
93{
94 int i, j;
95
96 for (j = 0; j < RANGE_NUM; j++) {
97 if (!range[j].end)
98 continue;
99
100 if (start <= range[j].start && end >= range[j].end) {
101 range[j].start = 0;
102 range[j].end = 0;
103 continue;
104 }
105
106 if (start <= range[j].start && end < range[j].end &&
107 range[j].start < end + 1) {
108 range[j].start = end + 1;
109 continue;
110 }
111
112
113 if (start > range[j].start && end >= range[j].end &&
114 range[j].end > start - 1) {
115 range[j].end = start - 1;
116 continue;
117 }
118
119 if (start > range[j].start && end < range[j].end) {
120 /* find the new spare */
121 for (i = 0; i < RANGE_NUM; i++) {
122 if (range[i].end == 0)
123 break;
124 }
125 if (i < RANGE_NUM) {
126 range[i].end = range[j].end;
127 range[i].start = end + 1;
128 } else {
129 printk(KERN_ERR "run of slot in ranges\n");
130 }
131 range[j].end = start - 1;
132 continue;
133 }
134 }
135}
136
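[Editorial note] A concrete case makes the four branches of subtract_range() easier to follow: subtracting the hole [0x300, 0x37f] from an existing entry [0x000, 0x3ff] falls through to the last branch, so a spare slot is filled with the right-hand piece [0x380, 0x3ff] and the original entry is truncated to [0x000, 0x2ff], turning one range into two around the hole. If no spare slot is free, the function only truncates the left piece and prints the "run of slot in ranges" error.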
137static int __init cmp_range(const void *x1, const void *x2)
138{
139 const struct res_range *r1 = x1;
140 const struct res_range *r2 = x2;
141 long start1, start2;
142
143 start1 = r1->start;
144 start2 = r2->start;
145
146 return start1 - start2;
147}
148
149struct var_mtrr_range_state {
150 unsigned long base_pfn;
151 unsigned long size_pfn;
152 mtrr_type type;
153};
154
155static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
156static int __initdata debug_print;
157
158static int __init
159x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
160 unsigned long extra_remove_base,
161 unsigned long extra_remove_size)
162{
163 unsigned long base, size;
164 mtrr_type type;
165 int i;
166
167 for (i = 0; i < num_var_ranges; i++) {
168 type = range_state[i].type;
169 if (type != MTRR_TYPE_WRBACK)
170 continue;
171 base = range_state[i].base_pfn;
172 size = range_state[i].size_pfn;
173 nr_range = add_range_with_merge(range, nr_range, base,
174 base + size - 1);
175 }
176 if (debug_print) {
177 printk(KERN_DEBUG "After WB checking\n");
178 for (i = 0; i < nr_range; i++)
179 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
180 range[i].start, range[i].end + 1);
181 }
182
183 /* take out UC ranges */
184 for (i = 0; i < num_var_ranges; i++) {
185 type = range_state[i].type;
186 if (type != MTRR_TYPE_UNCACHABLE &&
187 type != MTRR_TYPE_WRPROT)
188 continue;
189 size = range_state[i].size_pfn;
190 if (!size)
191 continue;
192 base = range_state[i].base_pfn;
193 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
194 (mtrr_state.enabled & 1)) {
195 /* Var MTRR contains UC entry below 1M? Skip it: */
196 printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
197 "contains strange UC entry under 1M, check "
198 "with your system vendor!\n", i);
199 if (base + size <= (1<<(20-PAGE_SHIFT)))
200 continue;
201 size -= (1<<(20-PAGE_SHIFT)) - base;
202 base = 1<<(20-PAGE_SHIFT);
203 }
204 subtract_range(range, base, base + size - 1);
205 }
206 if (extra_remove_size)
207 subtract_range(range, extra_remove_base,
208 extra_remove_base + extra_remove_size - 1);
209
210 /* get new range num */
211 nr_range = 0;
212 for (i = 0; i < RANGE_NUM; i++) {
213 if (!range[i].end)
214 continue;
215 nr_range++;
216 }
217 if (debug_print) {
218 printk(KERN_DEBUG "After UC checking\n");
219 for (i = 0; i < nr_range; i++)
220 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
221 range[i].start, range[i].end + 1);
222 }
223
224 /* sort the ranges */
225 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
226 if (debug_print) {
227 printk(KERN_DEBUG "After sorting\n");
228 for (i = 0; i < nr_range; i++)
229 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
230 range[i].start, range[i].end + 1);
231 }
232
233 /* clear those is not used */
234 for (i = nr_range; i < RANGE_NUM; i++)
235 memset(&range[i], 0, sizeof(range[i]));
236
237 return nr_range;
238}
239
240static struct res_range __initdata range[RANGE_NUM];
241static int __initdata nr_range;
242
243#ifdef CONFIG_MTRR_SANITIZER
244
245static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
246{
247 unsigned long sum;
248 int i;
249
250 sum = 0;
251 for (i = 0; i < nr_range; i++)
252 sum += range[i].end + 1 - range[i].start;
253
254 return sum;
255}
256
257static int enable_mtrr_cleanup __initdata =
258 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
259
260static int __init disable_mtrr_cleanup_setup(char *str)
261{
262 enable_mtrr_cleanup = 0;
263 return 0;
264}
265early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
266
267static int __init enable_mtrr_cleanup_setup(char *str)
268{
269 enable_mtrr_cleanup = 1;
270 return 0;
271}
272early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
273
274static int __init mtrr_cleanup_debug_setup(char *str)
275{
276 debug_print = 1;
277 return 0;
278}
279early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
280
281struct var_mtrr_state {
282 unsigned long range_startk;
283 unsigned long range_sizek;
284 unsigned long chunk_sizek;
285 unsigned long gran_sizek;
286 unsigned int reg;
287};
288
289static void __init
290set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
291 unsigned char type, unsigned int address_bits)
292{
293 u32 base_lo, base_hi, mask_lo, mask_hi;
294 u64 base, mask;
295
296 if (!sizek) {
297 fill_mtrr_var_range(reg, 0, 0, 0, 0);
298 return;
299 }
300
301 mask = (1ULL << address_bits) - 1;
302 mask &= ~((((u64)sizek) << 10) - 1);
303
304 base = ((u64)basek) << 10;
305
306 base |= type;
307 mask |= 0x800;
308
309 base_lo = base & ((1ULL<<32) - 1);
310 base_hi = base >> 32;
311
312 mask_lo = mask & ((1ULL<<32) - 1);
313 mask_hi = mask >> 32;
314
315 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
316}
317
318static void __init
319save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
320 unsigned char type)
321{
322 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
323 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
324 range_state[reg].type = type;
325}
326
327static void __init
328set_var_mtrr_all(unsigned int address_bits)
329{
330 unsigned long basek, sizek;
331 unsigned char type;
332 unsigned int reg;
333
334 for (reg = 0; reg < num_var_ranges; reg++) {
335 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
336 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
337 type = range_state[reg].type;
338
339 set_var_mtrr(reg, basek, sizek, type, address_bits);
340 }
341}
342
343static unsigned long to_size_factor(unsigned long sizek, char *factorp)
344{
345 char factor;
346 unsigned long base = sizek;
347
348 if (base & ((1<<10) - 1)) {
349 /* not MB alignment */
350 factor = 'K';
351 } else if (base & ((1<<20) - 1)) {
352 factor = 'M';
353 base >>= 10;
354 } else {
355 factor = 'G';
356 base >>= 20;
357 }
358
359 *factorp = factor;
360
361 return base;
362}
363
364static unsigned int __init
365range_to_mtrr(unsigned int reg, unsigned long range_startk,
366 unsigned long range_sizek, unsigned char type)
367{
368 if (!range_sizek || (reg >= num_var_ranges))
369 return reg;
370
371 while (range_sizek) {
372 unsigned long max_align, align;
373 unsigned long sizek;
374
375 /* Compute the maximum size I can make a range */
376 if (range_startk)
377 max_align = ffs(range_startk) - 1;
378 else
379 max_align = 32;
380 align = fls(range_sizek) - 1;
381 if (align > max_align)
382 align = max_align;
383
384 sizek = 1 << align;
385 if (debug_print) {
386 char start_factor = 'K', size_factor = 'K';
387 unsigned long start_base, size_base;
388
389 start_base = to_size_factor(range_startk,
390 &start_factor),
391 size_base = to_size_factor(sizek, &size_factor),
392
393 printk(KERN_DEBUG "Setting variable MTRR %d, "
394 "base: %ld%cB, range: %ld%cB, type %s\n",
395 reg, start_base, start_factor,
396 size_base, size_factor,
397 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
398 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
399 );
400 }
401 save_var_mtrr(reg++, range_startk, sizek, type);
402 range_startk += sizek;
403 range_sizek -= sizek;
404 if (reg >= num_var_ranges)
405 break;
406 }
407 return reg;
408}
409
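[Editorial note] range_to_mtrr() above carves a write-back range into naturally aligned power-of-two blocks: the start address limits the alignment via ffs(), the remaining size limits it via fls(), and the smaller of the two wins. The standalone toy below (plain C, illustration only, not kernel code) replays that loop for a 3.5 GB range and prints the familiar 2 GB + 1 GB + 512 MB split across three variable MTRRs.

#include <stdio.h>
#include <strings.h>	/* ffs() */

static int fls_ul(unsigned long x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long startk = 0, sizek = 3584UL << 10;	/* 3.5 GB in KB */

	while (sizek) {
		int max_align = startk ? ffs((int)startk) - 1 : 32;
		int align = fls_ul(sizek) - 1;
		unsigned long chunk;

		if (align > max_align)
			align = max_align;
		chunk = 1UL << align;
		printf("MTRR: base %8luK size %8luK (WB)\n", startk, chunk);
		startk += chunk;
		sizek -= chunk;
	}
	return 0;
}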
410static unsigned __init
411range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
412 unsigned long sizek)
413{
414 unsigned long hole_basek, hole_sizek;
415 unsigned long second_basek, second_sizek;
416 unsigned long range0_basek, range0_sizek;
417 unsigned long range_basek, range_sizek;
418 unsigned long chunk_sizek;
419 unsigned long gran_sizek;
420
421 hole_basek = 0;
422 hole_sizek = 0;
423 second_basek = 0;
424 second_sizek = 0;
425 chunk_sizek = state->chunk_sizek;
426 gran_sizek = state->gran_sizek;
427
428 /* align with gran size, prevent small block used up MTRRs */
429 range_basek = ALIGN(state->range_startk, gran_sizek);
430 if ((range_basek > basek) && basek)
431 return second_sizek;
432 state->range_sizek -= (range_basek - state->range_startk);
433 range_sizek = ALIGN(state->range_sizek, gran_sizek);
434
435 while (range_sizek > state->range_sizek) {
436 range_sizek -= gran_sizek;
437 if (!range_sizek)
438 return 0;
439 }
440 state->range_sizek = range_sizek;
441
442 /* try to append some small hole */
443 range0_basek = state->range_startk;
444 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
445
446 /* no increase */
447 if (range0_sizek == state->range_sizek) {
448 if (debug_print)
449 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
450 range0_basek<<10,
451 (range0_basek + state->range_sizek)<<10);
452 state->reg = range_to_mtrr(state->reg, range0_basek,
453 state->range_sizek, MTRR_TYPE_WRBACK);
454 return 0;
455 }
456
457 /* only cut back, when it is not the last */
458 if (sizek) {
459 while (range0_basek + range0_sizek > (basek + sizek)) {
460 if (range0_sizek >= chunk_sizek)
461 range0_sizek -= chunk_sizek;
462 else
463 range0_sizek = 0;
464
465 if (!range0_sizek)
466 break;
467 }
468 }
469
470second_try:
471 range_basek = range0_basek + range0_sizek;
472
473 /* one hole in the middle */
474 if (range_basek > basek && range_basek <= (basek + sizek))
475 second_sizek = range_basek - basek;
476
477 if (range0_sizek > state->range_sizek) {
478
479 /* one hole in middle or at end */
480 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
481
482 /* hole size should be less than half of range0 size */
483 if (hole_sizek >= (range0_sizek >> 1) &&
484 range0_sizek >= chunk_sizek) {
485 range0_sizek -= chunk_sizek;
486 second_sizek = 0;
487 hole_sizek = 0;
488
489 goto second_try;
490 }
491 }
492
493 if (range0_sizek) {
494 if (debug_print)
495 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
496 range0_basek<<10,
497 (range0_basek + range0_sizek)<<10);
498 state->reg = range_to_mtrr(state->reg, range0_basek,
499 range0_sizek, MTRR_TYPE_WRBACK);
500 }
501
502 if (range0_sizek < state->range_sizek) {
503 /* need to handle left over */
504 range_sizek = state->range_sizek - range0_sizek;
505
506 if (debug_print)
507 printk(KERN_DEBUG "range: %016lx - %016lx\n",
508 range_basek<<10,
509 (range_basek + range_sizek)<<10);
510 state->reg = range_to_mtrr(state->reg, range_basek,
511 range_sizek, MTRR_TYPE_WRBACK);
512 }
513
514 if (hole_sizek) {
515 hole_basek = range_basek - hole_sizek - second_sizek;
516 if (debug_print)
517 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
518 hole_basek<<10,
519 (hole_basek + hole_sizek)<<10);
520 state->reg = range_to_mtrr(state->reg, hole_basek,
521 hole_sizek, MTRR_TYPE_UNCACHABLE);
522 }
523
524 return second_sizek;
525}
526
527static void __init
528set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
529 unsigned long size_pfn)
530{
531 unsigned long basek, sizek;
532 unsigned long second_sizek = 0;
533
534 if (state->reg >= num_var_ranges)
535 return;
536
537 basek = base_pfn << (PAGE_SHIFT - 10);
538 sizek = size_pfn << (PAGE_SHIFT - 10);
539
540 /* See if I can merge with the last range */
541 if ((basek <= 1024) ||
542 (state->range_startk + state->range_sizek == basek)) {
543 unsigned long endk = basek + sizek;
544 state->range_sizek = endk - state->range_startk;
545 return;
546 }
547 /* Write the range mtrrs */
548 if (state->range_sizek != 0)
549 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
550
551 /* Allocate an msr */
552 state->range_startk = basek + second_sizek;
553 state->range_sizek = sizek - second_sizek;
554}
555
556/* mininum size of mtrr block that can take hole */
557static u64 mtrr_chunk_size __initdata = (256ULL<<20);
558
559static int __init parse_mtrr_chunk_size_opt(char *p)
560{
561 if (!p)
562 return -EINVAL;
563 mtrr_chunk_size = memparse(p, &p);
564 return 0;
565}
566early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
567
568/* granity of mtrr of block */
569static u64 mtrr_gran_size __initdata;
570
571static int __init parse_mtrr_gran_size_opt(char *p)
572{
573 if (!p)
574 return -EINVAL;
575 mtrr_gran_size = memparse(p, &p);
576 return 0;
577}
578early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
579
580static int nr_mtrr_spare_reg __initdata =
581 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
582
583static int __init parse_mtrr_spare_reg(char *arg)
584{
585 if (arg)
586 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
587 return 0;
588}
589
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591
592static int __init
593x86_setup_var_mtrrs(struct res_range *range, int nr_range,
594 u64 chunk_size, u64 gran_size)
595{
596 struct var_mtrr_state var_state;
597 int i;
598 int num_reg;
599
600 var_state.range_startk = 0;
601 var_state.range_sizek = 0;
602 var_state.reg = 0;
603 var_state.chunk_sizek = chunk_size >> 10;
604 var_state.gran_sizek = gran_size >> 10;
605
606 memset(range_state, 0, sizeof(range_state));
607
608 /* Write the range etc */
609 for (i = 0; i < nr_range; i++)
610 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1);
612
613 /* Write the last range */
614 if (var_state.range_sizek != 0)
615 range_to_mtrr_with_hole(&var_state, 0, 0);
616
617 num_reg = var_state.reg;
618 /* Clear out the extra MTRR's */
619 while (var_state.reg < num_var_ranges) {
620 save_var_mtrr(var_state.reg, 0, 0, 0);
621 var_state.reg++;
622 }
623
624 return num_reg;
625}
626
627struct mtrr_cleanup_result {
628 unsigned long gran_sizek;
629 unsigned long chunk_sizek;
630 unsigned long lose_cover_sizek;
631 unsigned int num_reg;
632 int bad;
633};
634
635/*
636 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
637 * chunk size: gran_size, ..., 2G
638 * so we need (1+16)*8
639 */
640#define NUM_RESULT 136
641#define PSHIFT (PAGE_SHIFT - 10)
642
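[Editorial note] The count in the comment above checks out: gran_size takes the 16 values 2^16 through 2^31 (64 KB up to 2 GB), and for gran_size = 2^k the chunk loop later in this file covers 2^k through 2^31, that is 32 - k combinations. The total is therefore 16 + 15 + ... + 1 = 16*17/2 = 136 entries, the same figure the (1+16)*8 shorthand gives, which is why NUM_RESULT is 136.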
643static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
644static unsigned long __initdata min_loss_pfn[RANGE_NUM];
645
646static void __init print_out_mtrr_range_state(void)
647{
648 int i;
649 char start_factor = 'K', size_factor = 'K';
650 unsigned long start_base, size_base;
651 mtrr_type type;
652
653 for (i = 0; i < num_var_ranges; i++) {
654
655 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
656 if (!size_base)
657 continue;
658
659 size_base = to_size_factor(size_base, &size_factor),
660 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
661 start_base = to_size_factor(start_base, &start_factor),
662 type = range_state[i].type;
663
664 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
665 i, start_base, start_factor,
666 size_base, size_factor,
667 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
668 ((type == MTRR_TYPE_WRPROT) ? "WP" :
669 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
670 );
671 }
672}
673
674static int __init mtrr_need_cleanup(void)
675{
676 int i;
677 mtrr_type type;
678 unsigned long size;
679 /* extra one for all 0 */
680 int num[MTRR_NUM_TYPES + 1];
681
682 /* check entries number */
683 memset(num, 0, sizeof(num));
684 for (i = 0; i < num_var_ranges; i++) {
685 type = range_state[i].type;
686 size = range_state[i].size_pfn;
687 if (type >= MTRR_NUM_TYPES)
688 continue;
689 if (!size)
690 type = MTRR_NUM_TYPES;
691 if (type == MTRR_TYPE_WRPROT)
692 type = MTRR_TYPE_UNCACHABLE;
693 num[type]++;
694 }
695
696 /* check if we got UC entries */
697 if (!num[MTRR_TYPE_UNCACHABLE])
698 return 0;
699
700 /* check if we only had WB and UC */
701 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
702 num_var_ranges - num[MTRR_NUM_TYPES])
703 return 0;
704
705 return 1;
706}
707
708static unsigned long __initdata range_sums;
709static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
710 unsigned long extra_remove_base,
711 unsigned long extra_remove_size,
712 int i)
713{
714 int num_reg;
715 static struct res_range range_new[RANGE_NUM];
716 static int nr_range_new;
717 unsigned long range_sums_new;
718
719 /* convert ranges to var ranges state */
720 num_reg = x86_setup_var_mtrrs(range, nr_range,
721 chunk_size, gran_size);
722
723 /* we got new setting in range_state, check it */
724 memset(range_new, 0, sizeof(range_new));
725 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
726 extra_remove_base, extra_remove_size);
727 range_sums_new = sum_ranges(range_new, nr_range_new);
728
729 result[i].chunk_sizek = chunk_size >> 10;
730 result[i].gran_sizek = gran_size >> 10;
731 result[i].num_reg = num_reg;
732 if (range_sums < range_sums_new) {
733 result[i].lose_cover_sizek =
734 (range_sums_new - range_sums) << PSHIFT;
735 result[i].bad = 1;
736 } else
737 result[i].lose_cover_sizek =
738 (range_sums - range_sums_new) << PSHIFT;
739
740 /* double check it */
741 if (!result[i].bad && !result[i].lose_cover_sizek) {
742 if (nr_range_new != nr_range ||
743 memcmp(range, range_new, sizeof(range)))
744 result[i].bad = 1;
745 }
746
747 if (!result[i].bad && (range_sums - range_sums_new <
748 min_loss_pfn[num_reg])) {
749 min_loss_pfn[num_reg] =
750 range_sums - range_sums_new;
751 }
752}
753
754static void __init mtrr_print_out_one_result(int i)
755{
756 char gran_factor, chunk_factor, lose_factor;
757 unsigned long gran_base, chunk_base, lose_base;
758
759 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
760 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
761 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
762 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
763 result[i].bad ? "*BAD*" : " ",
764 gran_base, gran_factor, chunk_base, chunk_factor);
765 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
766 result[i].num_reg, result[i].bad ? "-" : "",
767 lose_base, lose_factor);
768}
769
770static int __init mtrr_search_optimal_index(void)
771{
772 int i;
773 int num_reg_good;
774 int index_good;
775
776 if (nr_mtrr_spare_reg >= num_var_ranges)
777 nr_mtrr_spare_reg = num_var_ranges - 1;
778 num_reg_good = -1;
779 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
780 if (!min_loss_pfn[i])
781 num_reg_good = i;
782 }
783
784 index_good = -1;
785 if (num_reg_good != -1) {
786 for (i = 0; i < NUM_RESULT; i++) {
787 if (!result[i].bad &&
788 result[i].num_reg == num_reg_good &&
789 !result[i].lose_cover_sizek) {
790 index_good = i;
791 break;
792 }
793 }
794 }
795
796 return index_good;
797}
798
799
800int __init mtrr_cleanup(unsigned address_bits)
801{
802 unsigned long extra_remove_base, extra_remove_size;
803 unsigned long base, size, def, dummy;
804 mtrr_type type;
805 u64 chunk_size, gran_size;
806 int index_good;
807 int i;
808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy);
812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0;
815
816 /* get it and store it aside */
817 memset(range_state, 0, sizeof(range_state));
818 for (i = 0; i < num_var_ranges; i++) {
819 mtrr_if->get(i, &base, &size, &type);
820 range_state[i].base_pfn = base;
821 range_state[i].size_pfn = size;
822 range_state[i].type = type;
823 }
824
825 /* check if we need handle it and can handle it */
826 if (!mtrr_need_cleanup())
827 return 0;
828
829 /* print original var MTRRs at first, for debugging: */
830 printk(KERN_DEBUG "original variable MTRRs\n");
831 print_out_mtrr_range_state();
832
833 memset(range, 0, sizeof(range));
834 extra_remove_size = 0;
835 extra_remove_base = 1 << (32 - PAGE_SHIFT);
836 if (mtrr_tom2)
837 extra_remove_size =
838 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
839 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
840 extra_remove_size);
841 /*
842 * [0, 1M) should always be coverred by var mtrr with WB
843 * and fixed mtrrs should take effective before var mtrr for it
844 */
845 nr_range = add_range_with_merge(range, nr_range, 0,
846 (1ULL<<(20 - PAGE_SHIFT)) - 1);
847 /* sort the ranges */
848 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
849
850 range_sums = sum_ranges(range, nr_range);
851 printk(KERN_INFO "total RAM coverred: %ldM\n",
852 range_sums >> (20 - PAGE_SHIFT));
853
854 if (mtrr_chunk_size && mtrr_gran_size) {
855 i = 0;
856 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
857 extra_remove_base, extra_remove_size, i);
858
859 mtrr_print_out_one_result(i);
860
861 if (!result[i].bad) {
862 set_var_mtrr_all(address_bits);
863 printk(KERN_DEBUG "New variable MTRRs\n");
864 print_out_mtrr_range_state();
865 return 1;
866 }
867 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
868 "will find optimal one\n");
869 }
870
871 i = 0;
872 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
873 memset(result, 0, sizeof(result));
874 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
875
876 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
877 chunk_size <<= 1) {
878
879 if (i >= NUM_RESULT)
880 continue;
881
882 mtrr_calc_range_state(chunk_size, gran_size,
883 extra_remove_base, extra_remove_size, i);
884 if (debug_print) {
885 mtrr_print_out_one_result(i);
886 printk(KERN_INFO "\n");
887 }
888
889 i++;
890 }
891 }
892
893 /* try to find the optimal index */
894 index_good = mtrr_search_optimal_index();
895
896 if (index_good != -1) {
897 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
898 i = index_good;
899 mtrr_print_out_one_result(i);
900
901 /* convert ranges to var ranges state */
902 chunk_size = result[i].chunk_sizek;
903 chunk_size <<= 10;
904 gran_size = result[i].gran_sizek;
905 gran_size <<= 10;
906 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
907 set_var_mtrr_all(address_bits);
908 printk(KERN_DEBUG "New variable MTRRs\n");
909 print_out_mtrr_range_state();
910 return 1;
911 } else {
912 /* print out all */
913 for (i = 0; i < NUM_RESULT; i++)
914 mtrr_print_out_one_result(i);
915 }
916
917 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
918 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
919
920 return 0;
921}
922#else
923int __init mtrr_cleanup(unsigned address_bits)
924{
925 return 0;
926}
927#endif
928
929static int disable_mtrr_trim;
930
931static int __init disable_mtrr_trim_setup(char *str)
932{
933 disable_mtrr_trim = 1;
934 return 0;
935}
936early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
937
938/*
939 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
940 * for memory >4GB. Check for that here.
941 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
942 * apply to are wrong, but so far we don't know of any such case in the wild.
943 */
944#define Tom2Enabled (1U << 21)
945#define Tom2ForceMemTypeWB (1U << 22)
946
947int __init amd_special_default_mtrr(void)
948{
949 u32 l, h;
950
951 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
952 return 0;
953 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
954 return 0;
955 /* In case some hypervisor doesn't pass SYSCFG through */
956 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
957 return 0;
958 /*
959 * Memory between 4GB and top of mem is forced WB by this magic bit.
960 * Reserved before K8RevF, but should be zero there.
961 */
962 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
963 (Tom2Enabled | Tom2ForceMemTypeWB))
964 return 1;
965 return 0;
966}
967
968static u64 __init real_trim_memory(unsigned long start_pfn,
969 unsigned long limit_pfn)
970{
971 u64 trim_start, trim_size;
972 trim_start = start_pfn;
973 trim_start <<= PAGE_SHIFT;
974 trim_size = limit_pfn;
975 trim_size <<= PAGE_SHIFT;
976 trim_size -= trim_start;
977
978 return e820_update_range(trim_start, trim_size, E820_RAM,
979 E820_RESERVED);
980}
981/**
982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
983 * @end_pfn: ending page frame number
984 *
985 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
986 * memory configurations. This routine checks that the highest MTRR matches
987 * the end of memory, to make sure the MTRRs having a write back type cover
988 * all of the memory the kernel is intending to use. If not, it'll trim any
989 * memory off the end by adjusting end_pfn, removing it from the kernel's
990 * allocation pools, warning the user with an obnoxious message.
991 */
992int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
993{
994 unsigned long i, base, size, highest_pfn = 0, def, dummy;
995 mtrr_type type;
996 u64 total_trim_size;
997
998 /* extra one for all 0 */
999 int num[MTRR_NUM_TYPES + 1];
1000 /*
1001 * Make sure we only trim uncachable memory on machines that
1002 * support the Intel MTRR architecture:
1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy);
1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0;
1010
1011 /* get it and store it aside */
1012 memset(range_state, 0, sizeof(range_state));
1013 for (i = 0; i < num_var_ranges; i++) {
1014 mtrr_if->get(i, &base, &size, &type);
1015 range_state[i].base_pfn = base;
1016 range_state[i].size_pfn = size;
1017 range_state[i].type = type;
1018 }
1019
1020 /* Find highest cached pfn */
1021 for (i = 0; i < num_var_ranges; i++) {
1022 type = range_state[i].type;
1023 if (type != MTRR_TYPE_WRBACK)
1024 continue;
1025 base = range_state[i].base_pfn;
1026 size = range_state[i].size_pfn;
1027 if (highest_pfn < base + size)
1028 highest_pfn = base + size;
1029 }
1030
1031 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1032 if (!highest_pfn) {
1033 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1034 return 0;
1035 }
1036
1037 /* check entries number */
1038 memset(num, 0, sizeof(num));
1039 for (i = 0; i < num_var_ranges; i++) {
1040 type = range_state[i].type;
1041 if (type >= MTRR_NUM_TYPES)
1042 continue;
1043 size = range_state[i].size_pfn;
1044 if (!size)
1045 type = MTRR_NUM_TYPES;
1046 num[type]++;
1047 }
1048
1049 /* no entry for WB? */
1050 if (!num[MTRR_TYPE_WRBACK])
1051 return 0;
1052
1053 /* check if we only had WB and UC */
1054 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1055 num_var_ranges - num[MTRR_NUM_TYPES])
1056 return 0;
1057
1058 memset(range, 0, sizeof(range));
1059 nr_range = 0;
1060 if (mtrr_tom2) {
1061 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1062 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1063 if (highest_pfn < range[nr_range].end + 1)
1064 highest_pfn = range[nr_range].end + 1;
1065 nr_range++;
1066 }
1067 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1068
1069 total_trim_size = 0;
1070 /* check the head */
1071 if (range[0].start)
1072 total_trim_size += real_trim_memory(0, range[0].start);
1073 /* check the holes */
1074 for (i = 0; i < nr_range - 1; i++) {
1075 if (range[i].end + 1 < range[i+1].start)
1076 total_trim_size += real_trim_memory(range[i].end + 1,
1077 range[i+1].start);
1078 }
1079 /* check the top */
1080 i = nr_range - 1;
1081 if (range[i].end + 1 < end_pfn)
1082 total_trim_size += real_trim_memory(range[i].end + 1,
1083 end_pfn);
1084
1085 if (total_trim_size) {
1086 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1087 " all of memory, losing %lluMB of RAM.\n",
1088 total_trim_size >> 20);
1089
1090 if (!changed_by_mtrr_cleanup)
1091 WARN_ON(1);
1092
1093 printk(KERN_INFO "update e820 for mtrr\n");
1094 update_e820();
1095
1096 return 1;
1097 }
1098
1099 return 0;
1100}
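Editorial aside (not part of the patch): the head/holes/top checks above boil down to "reserve everything in [0, end_pfn) that falls outside the sorted WB-covered ranges". A self-contained model of that gap walk, with invented names:

#include <stdio.h>

struct pfn_range { unsigned long start, end; };		/* inclusive end, as above */

/* Walk sorted ranges and report the uncovered gaps in [0, end_pfn). */
static unsigned long report_gaps(const struct pfn_range *range, int nr,
				 unsigned long end_pfn)
{
	unsigned long trimmed = 0, pos = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (pos < range[i].start) {
			printf("gap: %#lx - %#lx\n", pos, range[i].start);
			trimmed += range[i].start - pos;
		}
		pos = range[i].end + 1;
	}
	if (pos < end_pfn) {
		printf("gap: %#lx - %#lx\n", pos, end_pfn);
		trimmed += end_pfn - pos;
	}
	return trimmed;
}

int main(void)
{
	/* two WB-covered pfn ranges with a hole between them */
	struct pfn_range r[] = { { 0x0, 0x7ffff }, { 0xa0000, 0xbffff } };

	printf("trimmed %lu pages\n", report_gaps(r, 2, 0x100000));
	return 0;
}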
1101
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95c..37f28fc7cf95 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
33struct mtrr_state_type mtrr_state = {}; 33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state); 34EXPORT_SYMBOL_GPL(mtrr_state);
35 35
36static int __initdata mtrr_show; 36/**
37static int __init mtrr_debug(char *opt) 37 * BIOS is expected to clear MtrrFixDramModEn bit, see for example
38 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
39 * Opteron Processors" (26094 Rev. 3.30 February 2006), section
40 * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
 41 * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
42 * 0 for operation."
43 */
44static inline void k8_check_syscfg_dram_mod_en(void)
38{ 45{
39 mtrr_show = 1; 46 u32 lo, hi;
40 return 0; 47
48 if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
49 (boot_cpu_data.x86 >= 0x0f)))
50 return;
51
52 rdmsr(MSR_K8_SYSCFG, lo, hi);
53 if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
54 printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
55 " not cleared by BIOS, clearing this bit\n",
56 smp_processor_id());
57 lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
58 mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
59 }
41} 60}
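Editorial aside (not part of the patch): the same SYSCFG check can be reproduced from userspace through the msr driver. This sketch assumes MSR_K8_SYSCFG is MSR 0xc0010010 and MtrrFixDramModEn is bit 19, per the BKDG section quoted above; it only reads the bit, it does not clear it, and it needs root plus a loaded msr module.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_K8_SYSCFG		0xc0010010	/* assumption: SYSCFG MSR address */
#define MTRR_FIX_DRAM_MOD_EN	(1ULL << 19)	/* assumption: MtrrFixDramModEn bit */

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &val, sizeof(val), MSR_K8_SYSCFG) != sizeof(val)) {
		perror("rdmsr");
		return 1;
	}
	printf("SYSCFG=%#llx, MtrrFixDramModEn %s\n", (unsigned long long)val,
	       (val & MTRR_FIX_DRAM_MOD_EN) ? "set (BIOS left it on)" : "clear");
	close(fd);
	return 0;
}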
42early_param("mtrr.show", mtrr_debug);
43 61
44/* 62/*
45 * Returns the effective MTRR type for the region 63 * Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
174 unsigned int *p = (unsigned int *) frs; 192 unsigned int *p = (unsigned int *) frs;
175 int i; 193 int i;
176 194
195 k8_check_syscfg_dram_mod_en();
196
177 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
178 198
179 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
188 get_fixed_ranges(mtrr_state.fixed_ranges); 208 get_fixed_ranges(mtrr_state.fixed_ranges);
189} 209}
190 210
191static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) 211static unsigned __initdata last_fixed_start;
212static unsigned __initdata last_fixed_end;
213static mtrr_type __initdata last_fixed_type;
214
215static void __init print_fixed_last(void)
216{
217 if (!last_fixed_end)
218 return;
219
220 printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
221 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
222
223 last_fixed_end = 0;
224}
225
226static void __init update_fixed_last(unsigned base, unsigned end,
227 mtrr_type type)
228{
229 last_fixed_start = base;
230 last_fixed_end = end;
231 last_fixed_type = type;
232}
233
234static void __init print_fixed(unsigned base, unsigned step,
235 const mtrr_type *types)
192{ 236{
193 unsigned i; 237 unsigned i;
194 238
195 for (i = 0; i < 8; ++i, ++types, base += step) 239 for (i = 0; i < 8; ++i, ++types, base += step) {
196 printk(KERN_INFO "MTRR %05X-%05X %s\n", 240 if (last_fixed_end == 0) {
197 base, base + step - 1, mtrr_attrib_to_str(*types)); 241 update_fixed_last(base, base + step, *types);
242 continue;
243 }
244 if (last_fixed_end == base && last_fixed_type == *types) {
245 last_fixed_end = base + step;
246 continue;
247 }
248 /* new segments: gap or different type */
249 print_fixed_last();
250 update_fixed_last(base, base + step, *types);
251 }
198} 252}
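Editorial aside (not part of the patch): the rewritten print_fixed()/print_fixed_last() pair above implements a simple run-length merge, so adjacent fixed ranges with the same memory type come out as one line instead of eight. A standalone model of that merge:

#include <stdio.h>

int main(void)
{
	/* memory type per 64K step, e.g. 0 = UC, 1 = WB */
	const char types[8] = { 0, 0, 0, 0, 1, 1, 0, 0 };
	unsigned base = 0x00000, step = 0x10000;
	unsigned run_start = 0, run_end = 0;
	char run_type = 0;
	int i;

	for (i = 0; i < 8; i++, base += step) {
		if (run_end && run_end == base && run_type == types[i]) {
			run_end = base + step;		/* extend the current run */
			continue;
		}
		if (run_end)				/* close the previous run */
			printf("  %05X-%05X type %d\n", run_start, run_end - 1, run_type);
		run_start = base;
		run_end = base + step;
		run_type = types[i];
	}
	printf("  %05X-%05X type %d\n", run_start, run_end - 1, run_type);	/* tail */
	return 0;
}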
199 253
200static void prepare_set(void); 254static void prepare_set(void);
201static void post_set(void); 255static void post_set(void);
202 256
257static void __init print_mtrr_state(void)
258{
259 unsigned int i;
260 int high_width;
261
262 printk(KERN_DEBUG "MTRR default type: %s\n",
263 mtrr_attrib_to_str(mtrr_state.def_type));
264 if (mtrr_state.have_fixed) {
265 printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
266 mtrr_state.enabled & 1 ? "en" : "dis");
267 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
268 for (i = 0; i < 2; ++i)
269 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
270 for (i = 0; i < 8; ++i)
271 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
272
273 /* tail */
274 print_fixed_last();
275 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis");
278 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
279 for (i = 0; i < num_var_ranges; ++i) {
280 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
281 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
282 i,
283 high_width,
284 mtrr_state.var_ranges[i].base_hi,
285 mtrr_state.var_ranges[i].base_lo >> 12,
286 high_width,
287 mtrr_state.var_ranges[i].mask_hi,
288 mtrr_state.var_ranges[i].mask_lo >> 12,
289 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
290 else
291 printk(KERN_DEBUG " %u disabled\n", i);
292 }
293 if (mtrr_tom2) {
294 printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
295 mtrr_tom2, mtrr_tom2>>20);
296 }
297}
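Editorial aside (not part of the patch): the high_width expression above just counts how many hex digits are needed for the physical-address bits above bit 31. ffs(size_or_mask) - 1 recovers (physical bits - PAGE_SHIFT), so a quick check of the same formula, assuming 4K pages:

#include <stdio.h>
#include <strings.h>

#define PAGE_SHIFT 12				/* assumed 4K pages */

int main(void)
{
	int phys_bits;

	for (phys_bits = 36; phys_bits <= 40; phys_bits += 4) {
		/* size_or_mask has every bit >= (phys_bits - PAGE_SHIFT) set */
		unsigned int size_or_mask = ~((1U << (phys_bits - PAGE_SHIFT)) - 1);
		int high_width = ((size_or_mask ? ffs((int)size_or_mask) - 1 : 32)
				  - (32 - PAGE_SHIFT) + 3) / 4;

		/* prints 1 digit for 36-bit, 2 digits for 40-bit machines */
		printf("%d-bit physical: high_width = %d\n", phys_bits, high_width);
	}
	return 0;
}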
298
203/* Grab all of the MTRR state for this CPU into *state */ 299/* Grab all of the MTRR state for this CPU into *state */
204void __init get_mtrr_state(void) 300void __init get_mtrr_state(void)
205{ 301{
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
231 mtrr_tom2 |= low; 327 mtrr_tom2 |= low;
232 mtrr_tom2 &= 0xffffff800000ULL; 328 mtrr_tom2 &= 0xffffff800000ULL;
233 } 329 }
234 if (mtrr_show) { 330
235 int high_width; 331 print_mtrr_state();
236 332
237 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
238 if (mtrr_state.have_fixed) {
239 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
240 mtrr_state.enabled & 1 ? "en" : "dis");
241 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
242 for (i = 0; i < 2; ++i)
243 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
244 for (i = 0; i < 8; ++i)
245 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
246 }
247 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
248 mtrr_state.enabled & 2 ? "en" : "dis");
249 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
250 for (i = 0; i < num_var_ranges; ++i) {
251 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
252 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
253 i,
254 high_width,
255 mtrr_state.var_ranges[i].base_hi,
256 mtrr_state.var_ranges[i].base_lo >> 12,
257 high_width,
258 mtrr_state.var_ranges[i].mask_hi,
259 mtrr_state.var_ranges[i].mask_lo >> 12,
260 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
261 else
262 printk(KERN_INFO "MTRR %u disabled\n", i);
263 }
264 if (mtrr_tom2) {
265 printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
266 mtrr_tom2, mtrr_tom2>>20);
267 }
268 }
269 mtrr_state_set = 1; 333 mtrr_state_set = 1;
270 334
271 /* PAT setup for BP. We need to go through sync steps here */ 335 /* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
308} 372}
309 373
310/** 374/**
311 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
312 * see AMD publication no. 24593, chapter 3.2.1 for more information
313 */
314static inline void k8_enable_fixed_iorrs(void)
315{
316 unsigned lo, hi;
317
318 rdmsr(MSR_K8_SYSCFG, lo, hi);
319 mtrr_wrmsr(MSR_K8_SYSCFG, lo
320 | K8_MTRRFIXRANGE_DRAM_ENABLE
321 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
322}
323
324/**
325 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have 375 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
 326 * @msr: MSR address of the MTRR which should be checked and updated 376
327 * @changed: pointer which indicates whether the MTRR needed to be changed 377 * @changed: pointer which indicates whether the MTRR needed to be changed
328 * @msrwords: pointer to the MSR values which the MSR should have 378 * @msrwords: pointer to the MSR values which the MSR should have
329 *
 330 * If K8 extensions are wanted, update the K8 SYSCFG MSR also.
331 * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
332 */ 379 */
333static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) 380static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
334{ 381{
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
337 rdmsr(msr, lo, hi); 384 rdmsr(msr, lo, hi);
338 385
339 if (lo != msrwords[0] || hi != msrwords[1]) { 386 if (lo != msrwords[0] || hi != msrwords[1]) {
340 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
341 (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
342 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
343 k8_enable_fixed_iorrs();
344 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 387 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
345 *changed = true; 388 *changed = true;
346 } 389 }
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
376{ 419{
377 unsigned int mask_lo, mask_hi, base_lo, base_hi; 420 unsigned int mask_lo, mask_hi, base_lo, base_hi;
378 unsigned int tmp, hi; 421 unsigned int tmp, hi;
422 int cpu;
423
424 /*
425 * get_mtrr doesn't need to update mtrr_state, also it could be called
426 * from any cpu, so try to print it out directly.
427 */
428 cpu = get_cpu();
379 429
380 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 430 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
431
381 if ((mask_lo & 0x800) == 0) { 432 if ((mask_lo & 0x800) == 0) {
382 /* Invalid (i.e. free) range */ 433 /* Invalid (i.e. free) range */
383 *base = 0; 434 *base = 0;
384 *size = 0; 435 *size = 0;
385 *type = 0; 436 *type = 0;
386 return; 437 goto out_put_cpu;
387 } 438 }
388 439
389 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 440 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
390 441
391 /* Work out the shifted address mask. */ 442 /* Work out the shifted address mask: */
392 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; 443 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
393 mask_lo = size_or_mask | tmp; 444 mask_lo = size_or_mask | tmp;
394 /* Expand tmp with high bits to all 1s*/ 445
446 /* Expand tmp with high bits to all 1s: */
395 hi = fls(tmp); 447 hi = fls(tmp);
396 if (hi > 0) { 448 if (hi > 0) {
397 tmp |= ~((1<<(hi - 1)) - 1); 449 tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
402 } 454 }
403 } 455 }
404 456
405 /* This works correctly if size is a power of two, i.e. a 457 /*
406 contiguous range. */ 458 * This works correctly if size is a power of two, i.e. a
459 * contiguous range:
460 */
407 *size = -mask_lo; 461 *size = -mask_lo;
408 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; 462 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
409 *type = base_lo & 0xff; 463 *type = base_lo & 0xff;
464
465 printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
466 cpu, reg, *base, *size,
467 mtrr_attrib_to_str(*type & 0xff));
468out_put_cpu:
469 put_cpu();
410} 470}
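Editorial aside (not part of the patch): a worked example of the "size = -mask" trick used above. Once size_or_mask has forced every bit above the covered range to 1, the shifted mask has only its low bits clear, and its two's-complement negation is exactly the size in pages:

#include <stdio.h>

int main(void)
{
	/*
	 * A 256MB MTRR with 4K pages spans 0x10000 page frames, so the
	 * shifted mask has its low 16 bits clear and every higher bit
	 * set; negating it recovers the size in pages.
	 */
	unsigned long mask_lo = ~0xFFFFUL;	/* ...FFFF0000 */
	unsigned long size = -mask_lo;		/* 0x10000 pages */

	printf("size = %#lx pages = %lu MB\n", size, (size << 12) >> 20);
	return 0;
}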
411 471
412/** 472/**
@@ -419,6 +479,8 @@ static int set_fixed_ranges(mtrr_type * frs)
419 bool changed = false; 479 bool changed = false;
420 int block=-1, range; 480 int block=-1, range;
421 481
482 k8_check_syscfg_dram_mod_en();
483
422 while (fixed_range_blocks[++block].ranges) 484 while (fixed_range_blocks[++block].ranges)
423 for (range=0; range < fixed_range_blocks[block].ranges; range++) 485 for (range=0; range < fixed_range_blocks[block].ranges; range++)
424 set_fixed_range(fixed_range_blocks[block].base_msr + range, 486 set_fixed_range(fixed_range_blocks[block].base_msr + range,
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b8259..03cda01f57c7 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
582 582
583 for (i = 0; i < num_var_ranges; i++) { 583 for (i = 0; i < num_var_ranges; i++) {
584 mtrr_if->get(i, 584 mtrr_if->get(i,
585 &mtrr_state[i].lbase, 585 &mtrr_value[i].lbase,
586 &mtrr_state[i].lsize, 586 &mtrr_value[i].lsize,
587 &mtrr_state[i].ltype); 587 &mtrr_value[i].ltype);
588 } 588 }
589 return 0; 589 return 0;
590} 590}
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
594 int i; 594 int i;
595 595
596 for (i = 0; i < num_var_ranges; i++) { 596 for (i = 0; i < num_var_ranges; i++) {
597 if (mtrr_state[i].lsize) 597 if (mtrr_value[i].lsize)
598 set_mtrr(i, 598 set_mtrr(i,
599 mtrr_state[i].lbase, 599 mtrr_value[i].lbase,
600 mtrr_state[i].lsize, 600 mtrr_value[i].lsize,
601 mtrr_state[i].ltype); 601 mtrr_value[i].ltype);
602 } 602 }
603 return 0; 603 return 0;
604} 604}
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
610 .resume = mtrr_restore, 610 .resume = mtrr_restore,
611}; 611};
612 612
613/* should be related to MTRR_VAR_RANGES nums */ 613int __initdata changed_by_mtrr_cleanup;
614#define RANGE_NUM 256
615
616struct res_range {
617 unsigned long start;
618 unsigned long end;
619};
620
621static int __init
622add_range(struct res_range *range, int nr_range, unsigned long start,
623 unsigned long end)
624{
625 /* out of slots */
626 if (nr_range >= RANGE_NUM)
627 return nr_range;
628
629 range[nr_range].start = start;
630 range[nr_range].end = end;
631
632 nr_range++;
633
634 return nr_range;
635}
636
637static int __init
638add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
639 unsigned long end)
640{
641 int i;
642
643 /* try to merge it with old one */
644 for (i = 0; i < nr_range; i++) {
645 unsigned long final_start, final_end;
646 unsigned long common_start, common_end;
647
648 if (!range[i].end)
649 continue;
650
651 common_start = max(range[i].start, start);
652 common_end = min(range[i].end, end);
653 if (common_start > common_end + 1)
654 continue;
655
656 final_start = min(range[i].start, start);
657 final_end = max(range[i].end, end);
658
659 range[i].start = final_start;
660 range[i].end = final_end;
661 return nr_range;
662 }
663
664 /* need to add that */
665 return add_range(range, nr_range, start, end);
666}
667
668static void __init
669subtract_range(struct res_range *range, unsigned long start, unsigned long end)
670{
671 int i, j;
672
673 for (j = 0; j < RANGE_NUM; j++) {
674 if (!range[j].end)
675 continue;
676
677 if (start <= range[j].start && end >= range[j].end) {
678 range[j].start = 0;
679 range[j].end = 0;
680 continue;
681 }
682
683 if (start <= range[j].start && end < range[j].end &&
684 range[j].start < end + 1) {
685 range[j].start = end + 1;
686 continue;
687 }
688
689
690 if (start > range[j].start && end >= range[j].end &&
691 range[j].end > start - 1) {
692 range[j].end = start - 1;
693 continue;
694 }
695
696 if (start > range[j].start && end < range[j].end) {
697 /* find the new spare */
698 for (i = 0; i < RANGE_NUM; i++) {
699 if (range[i].end == 0)
700 break;
701 }
702 if (i < RANGE_NUM) {
703 range[i].end = range[j].end;
704 range[i].start = end + 1;
705 } else {
 706 				printk(KERN_ERR "run out of slots in ranges\n");
707 }
708 range[j].end = start - 1;
709 continue;
710 }
711 }
712}
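Editorial aside (not part of the patch): what subtract_range() does to the WB list, in miniature. Punching a UC hole out of one range either clips it at an edge or, for an interior hole, splits it into two pieces, with the tail going into a spare slot:

#include <stdio.h>

struct res_range { unsigned long start, end; };	/* inclusive; 0/0 = unused */

int main(void)
{
	struct res_range r[2] = { { 0x100, 0x8ff } };	/* one WB pfn range */
	unsigned long hole_start = 0x400, hole_end = 0x4ff;	/* interior UC hole */

	/* interior case: clip the original, put the leftover tail in a spare slot */
	r[1].start = hole_end + 1;
	r[1].end   = r[0].end;
	r[0].end   = hole_start - 1;

	printf("left:  %#lx - %#lx\n", r[0].start, r[0].end);	/* 0x100 - 0x3ff */
	printf("right: %#lx - %#lx\n", r[1].start, r[1].end);	/* 0x500 - 0x8ff */
	return 0;
}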
713
714static int __init cmp_range(const void *x1, const void *x2)
715{
716 const struct res_range *r1 = x1;
717 const struct res_range *r2 = x2;
718 long start1, start2;
719
720 start1 = r1->start;
721 start2 = r2->start;
722
723 return start1 - start2;
724}
725
726struct var_mtrr_range_state {
727 unsigned long base_pfn;
728 unsigned long size_pfn;
729 mtrr_type type;
730};
731
732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print;
734
735static int __init
736x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
737 unsigned long extra_remove_base,
738 unsigned long extra_remove_size)
739{
740 unsigned long i, base, size;
741 mtrr_type type;
742
743 for (i = 0; i < num_var_ranges; i++) {
744 type = range_state[i].type;
745 if (type != MTRR_TYPE_WRBACK)
746 continue;
747 base = range_state[i].base_pfn;
748 size = range_state[i].size_pfn;
749 nr_range = add_range_with_merge(range, nr_range, base,
750 base + size - 1);
751 }
752 if (debug_print) {
753 printk(KERN_DEBUG "After WB checking\n");
754 for (i = 0; i < nr_range; i++)
755 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
756 range[i].start, range[i].end + 1);
757 }
758
759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
764 continue;
765 size = range_state[i].size_pfn;
766 if (!size)
767 continue;
768 base = range_state[i].base_pfn;
769 subtract_range(range, base, base + size - 1);
770 }
771 if (extra_remove_size)
772 subtract_range(range, extra_remove_base,
773 extra_remove_base + extra_remove_size - 1);
774
775 /* get new range num */
776 nr_range = 0;
777 for (i = 0; i < RANGE_NUM; i++) {
778 if (!range[i].end)
779 continue;
780 nr_range++;
781 }
782 if (debug_print) {
783 printk(KERN_DEBUG "After UC checking\n");
784 for (i = 0; i < nr_range; i++)
785 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
786 range[i].start, range[i].end + 1);
787 }
788
789 /* sort the ranges */
790 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
791 if (debug_print) {
792 printk(KERN_DEBUG "After sorting\n");
793 for (i = 0; i < nr_range; i++)
794 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
795 range[i].start, range[i].end + 1);
796 }
797
 798 	/* clear those that are not used */
799 for (i = nr_range; i < RANGE_NUM; i++)
800 memset(&range[i], 0, sizeof(range[i]));
801
802 return nr_range;
803}
804
805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
807
808#ifdef CONFIG_MTRR_SANITIZER
809
810static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
811{
812 unsigned long sum;
813 int i;
814
815 sum = 0;
816 for (i = 0; i < nr_range; i++)
817 sum += range[i].end + 1 - range[i].start;
818
819 return sum;
820}
821
822static int enable_mtrr_cleanup __initdata =
823 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
824
825static int __init disable_mtrr_cleanup_setup(char *str)
826{
827 enable_mtrr_cleanup = 0;
828 return 0;
829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831
832static int __init enable_mtrr_cleanup_setup(char *str)
833{
834 enable_mtrr_cleanup = 1;
835 return 0;
836}
837early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
838
839static int __init mtrr_cleanup_debug_setup(char *str)
840{
841 debug_print = 1;
842 return 0;
843}
844early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
845
846struct var_mtrr_state {
847 unsigned long range_startk;
848 unsigned long range_sizek;
849 unsigned long chunk_sizek;
850 unsigned long gran_sizek;
851 unsigned int reg;
852};
853
854static void __init
855set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
856 unsigned char type, unsigned int address_bits)
857{
858 u32 base_lo, base_hi, mask_lo, mask_hi;
859 u64 base, mask;
860
861 if (!sizek) {
862 fill_mtrr_var_range(reg, 0, 0, 0, 0);
863 return;
864 }
865
866 mask = (1ULL << address_bits) - 1;
867 mask &= ~((((u64)sizek) << 10) - 1);
868
869 base = ((u64)basek) << 10;
870
871 base |= type;
872 mask |= 0x800;
873
874 base_lo = base & ((1ULL<<32) - 1);
875 base_hi = base >> 32;
876
877 mask_lo = mask & ((1ULL<<32) - 1);
878 mask_hi = mask >> 32;
879
880 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
881}
882
883static void __init
884save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
885 unsigned char type)
886{
887 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
888 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
889 range_state[reg].type = type;
890}
891
892static void __init
893set_var_mtrr_all(unsigned int address_bits)
894{
895 unsigned long basek, sizek;
896 unsigned char type;
897 unsigned int reg;
898
899 for (reg = 0; reg < num_var_ranges; reg++) {
900 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
901 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
902 type = range_state[reg].type;
903
904 set_var_mtrr(reg, basek, sizek, type, address_bits);
905 }
906}
907
908static unsigned long to_size_factor(unsigned long sizek, char *factorp)
909{
910 char factor;
911 unsigned long base = sizek;
912
913 if (base & ((1<<10) - 1)) {
914 /* not MB alignment */
915 factor = 'K';
916 } else if (base & ((1<<20) - 1)){
917 factor = 'M';
918 base >>= 10;
919 } else {
920 factor = 'G';
921 base >>= 20;
922 }
923
924 *factorp = factor;
925
926 return base;
927}
928
929static unsigned int __init
930range_to_mtrr(unsigned int reg, unsigned long range_startk,
931 unsigned long range_sizek, unsigned char type)
932{
933 if (!range_sizek || (reg >= num_var_ranges))
934 return reg;
935
936 while (range_sizek) {
937 unsigned long max_align, align;
938 unsigned long sizek;
939
940 /* Compute the maximum size I can make a range */
941 if (range_startk)
942 max_align = ffs(range_startk) - 1;
943 else
944 max_align = 32;
945 align = fls(range_sizek) - 1;
946 if (align > max_align)
947 align = max_align;
948
949 sizek = 1 << align;
950 if (debug_print) {
951 char start_factor = 'K', size_factor = 'K';
952 unsigned long start_base, size_base;
953
954 start_base = to_size_factor(range_startk, &start_factor),
955 size_base = to_size_factor(sizek, &size_factor),
956
957 printk(KERN_DEBUG "Setting variable MTRR %d, "
958 "base: %ld%cB, range: %ld%cB, type %s\n",
959 reg, start_base, start_factor,
960 size_base, size_factor,
961 (type == MTRR_TYPE_UNCACHABLE)?"UC":
962 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
963 );
964 }
965 save_var_mtrr(reg++, range_startk, sizek, type);
966 range_startk += sizek;
967 range_sizek -= sizek;
968 if (reg >= num_var_ranges)
969 break;
970 }
971 return reg;
972}
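Editorial aside (not part of the patch): range_to_mtrr() above carves a range into naturally aligned power-of-two chunks, limited by the alignment of the current start (ffs) and by the remaining size (fls). A userspace model of the same walk, in KB units:

#include <stdio.h>
#include <strings.h>

static int fls_ul(unsigned long x) { int n = 0; while (x) { n++; x >>= 1; } return n; }

int main(void)
{
	unsigned long startk = 0x100000;	/* 1 GB, in KB */
	unsigned long sizek  = 0xC0000;		/* 768 MB */

	while (sizek) {
		unsigned long max_align = startk ? ffs((int)startk) - 1 : 32;
		unsigned long align = fls_ul(sizek) - 1;

		if (align > max_align)
			align = max_align;
		/* prints: 512M at 1024M, then 256M at 1536M */
		printf("chunk: %luM at %luM\n", (1UL << align) >> 10, startk >> 10);
		startk += 1UL << align;
		sizek  -= 1UL << align;
	}
	return 0;
}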
973
974static unsigned __init
975range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
976 unsigned long sizek)
977{
978 unsigned long hole_basek, hole_sizek;
979 unsigned long second_basek, second_sizek;
980 unsigned long range0_basek, range0_sizek;
981 unsigned long range_basek, range_sizek;
982 unsigned long chunk_sizek;
983 unsigned long gran_sizek;
984
985 hole_basek = 0;
986 hole_sizek = 0;
987 second_basek = 0;
988 second_sizek = 0;
989 chunk_sizek = state->chunk_sizek;
990 gran_sizek = state->gran_sizek;
991
 992 	/* align with gran size to prevent small blocks from using up MTRRs */
993 range_basek = ALIGN(state->range_startk, gran_sizek);
994 if ((range_basek > basek) && basek)
995 return second_sizek;
996 state->range_sizek -= (range_basek - state->range_startk);
997 range_sizek = ALIGN(state->range_sizek, gran_sizek);
998
999 while (range_sizek > state->range_sizek) {
1000 range_sizek -= gran_sizek;
1001 if (!range_sizek)
1002 return 0;
1003 }
1004 state->range_sizek = range_sizek;
1005
1006 /* try to append some small hole */
1007 range0_basek = state->range_startk;
1008 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1009
1010 /* no increase */
1011 if (range0_sizek == state->range_sizek) {
1012 if (debug_print)
1013 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
1014 range0_basek<<10,
1015 (range0_basek + state->range_sizek)<<10);
1016 state->reg = range_to_mtrr(state->reg, range0_basek,
1017 state->range_sizek, MTRR_TYPE_WRBACK);
1018 return 0;
1019 }
1020
1021 /* only cut back, when it is not the last */
1022 if (sizek) {
1023 while (range0_basek + range0_sizek > (basek + sizek)) {
1024 if (range0_sizek >= chunk_sizek)
1025 range0_sizek -= chunk_sizek;
1026 else
1027 range0_sizek = 0;
1028
1029 if (!range0_sizek)
1030 break;
1031 }
1032 }
1033
1034second_try:
1035 range_basek = range0_basek + range0_sizek;
1036
1037 /* one hole in the middle */
1038 if (range_basek > basek && range_basek <= (basek + sizek))
1039 second_sizek = range_basek - basek;
1040
1041 if (range0_sizek > state->range_sizek) {
1042
1043 /* one hole in middle or at end */
1044 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1045
1046 /* hole size should be less than half of range0 size */
1047 if (hole_sizek >= (range0_sizek >> 1) &&
1048 range0_sizek >= chunk_sizek) {
1049 range0_sizek -= chunk_sizek;
1050 second_sizek = 0;
1051 hole_sizek = 0;
1052
1053 goto second_try;
1054 }
1055 }
1056
1057 if (range0_sizek) {
1058 if (debug_print)
1059 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
1060 range0_basek<<10,
1061 (range0_basek + range0_sizek)<<10);
1062 state->reg = range_to_mtrr(state->reg, range0_basek,
1063 range0_sizek, MTRR_TYPE_WRBACK);
1064 }
1065
1066 if (range0_sizek < state->range_sizek) {
1067 /* need to handle left over */
1068 range_sizek = state->range_sizek - range0_sizek;
1069
1070 if (debug_print)
1071 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1072 range_basek<<10,
1073 (range_basek + range_sizek)<<10);
1074 state->reg = range_to_mtrr(state->reg, range_basek,
1075 range_sizek, MTRR_TYPE_WRBACK);
1076 }
1077
1078 if (hole_sizek) {
1079 hole_basek = range_basek - hole_sizek - second_sizek;
1080 if (debug_print)
1081 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1082 hole_basek<<10,
1083 (hole_basek + hole_sizek)<<10);
1084 state->reg = range_to_mtrr(state->reg, hole_basek,
1085 hole_sizek, MTRR_TYPE_UNCACHABLE);
1086 }
1087
1088 return second_sizek;
1089}
1090
1091static void __init
1092set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
1093 unsigned long size_pfn)
1094{
1095 unsigned long basek, sizek;
1096 unsigned long second_sizek = 0;
1097
1098 if (state->reg >= num_var_ranges)
1099 return;
1100
1101 basek = base_pfn << (PAGE_SHIFT - 10);
1102 sizek = size_pfn << (PAGE_SHIFT - 10);
1103
1104 /* See if I can merge with the last range */
1105 if ((basek <= 1024) ||
1106 (state->range_startk + state->range_sizek == basek)) {
1107 unsigned long endk = basek + sizek;
1108 state->range_sizek = endk - state->range_startk;
1109 return;
1110 }
1111 /* Write the range mtrrs */
1112 if (state->range_sizek != 0)
1113 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
1114
1115 /* Allocate an msr */
1116 state->range_startk = basek + second_sizek;
1117 state->range_sizek = sizek - second_sizek;
1118}
1119
1120/* minimum size of mtrr block that can take a hole */
1121static u64 mtrr_chunk_size __initdata = (256ULL<<20);
1122
1123static int __init parse_mtrr_chunk_size_opt(char *p)
1124{
1125 if (!p)
1126 return -EINVAL;
1127 mtrr_chunk_size = memparse(p, &p);
1128 return 0;
1129}
1130early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
1131
1132/* granularity of mtrr block */
1133static u64 mtrr_gran_size __initdata;
1134
1135static int __init parse_mtrr_gran_size_opt(char *p)
1136{
1137 if (!p)
1138 return -EINVAL;
1139 mtrr_gran_size = memparse(p, &p);
1140 return 0;
1141}
1142early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
1143
1144static int nr_mtrr_spare_reg __initdata =
1145 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
1146
1147static int __init parse_mtrr_spare_reg(char *arg)
1148{
1149 if (arg)
1150 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
1151 return 0;
1152}
1153
1154early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
1155
1156static int __init
1157x86_setup_var_mtrrs(struct res_range *range, int nr_range,
1158 u64 chunk_size, u64 gran_size)
1159{
1160 struct var_mtrr_state var_state;
1161 int i;
1162 int num_reg;
1163
1164 var_state.range_startk = 0;
1165 var_state.range_sizek = 0;
1166 var_state.reg = 0;
1167 var_state.chunk_sizek = chunk_size >> 10;
1168 var_state.gran_sizek = gran_size >> 10;
1169
1170 memset(range_state, 0, sizeof(range_state));
1171
1172 /* Write the range etc */
1173 for (i = 0; i < nr_range; i++)
1174 set_var_mtrr_range(&var_state, range[i].start,
1175 range[i].end - range[i].start + 1);
1176
1177 /* Write the last range */
1178 if (var_state.range_sizek != 0)
1179 range_to_mtrr_with_hole(&var_state, 0, 0);
1180
1181 num_reg = var_state.reg;
1182 /* Clear out the extra MTRR's */
1183 while (var_state.reg < num_var_ranges) {
1184 save_var_mtrr(var_state.reg, 0, 0, 0);
1185 var_state.reg++;
1186 }
1187
1188 return num_reg;
1189}
1190
1191struct mtrr_cleanup_result {
1192 unsigned long gran_sizek;
1193 unsigned long chunk_sizek;
1194 unsigned long lose_cover_sizek;
1195 unsigned int num_reg;
1196 int bad;
1197};
1198
1199/*
1200 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1201 * chunk size: gran_size, ..., 2G
1202 * so we need 16+15+...+1 = (1+16)*8 = 136 results
1203 */
1204#define NUM_RESULT 136
1205#define PSHIFT (PAGE_SHIFT - 10)
1206
1207static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1208static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1209
1210static void __init print_out_mtrr_range_state(void)
1211{
1212 int i;
1213 char start_factor = 'K', size_factor = 'K';
1214 unsigned long start_base, size_base;
1215 mtrr_type type;
1216
1217 for (i = 0; i < num_var_ranges; i++) {
1218
1219 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1220 if (!size_base)
1221 continue;
1222
1223 size_base = to_size_factor(size_base, &size_factor),
1224 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1225 start_base = to_size_factor(start_base, &start_factor),
1226 type = range_state[i].type;
1227
1228 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1229 i, start_base, start_factor,
1230 size_base, size_factor,
1231 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1232 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1233 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1234 );
1235 }
1236}
1237
1238static int __init mtrr_need_cleanup(void)
1239{
1240 int i;
1241 mtrr_type type;
1242 unsigned long size;
1243 /* extra one for all 0 */
1244 int num[MTRR_NUM_TYPES + 1];
1245
1246 /* check entries number */
1247 memset(num, 0, sizeof(num));
1248 for (i = 0; i < num_var_ranges; i++) {
1249 type = range_state[i].type;
1250 size = range_state[i].size_pfn;
1251 if (type >= MTRR_NUM_TYPES)
1252 continue;
1253 if (!size)
1254 type = MTRR_NUM_TYPES;
1255 if (type == MTRR_TYPE_WRPROT)
1256 type = MTRR_TYPE_UNCACHABLE;
1257 num[type]++;
1258 }
1259
1260 /* check if we got UC entries */
1261 if (!num[MTRR_TYPE_UNCACHABLE])
1262 return 0;
1263
1264 /* check if we only had WB and UC */
1265 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1266 num_var_ranges - num[MTRR_NUM_TYPES])
1267 return 0;
1268
1269 return 1;
1270}
1271
1272static unsigned long __initdata range_sums;
1273static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 unsigned long extra_remove_base,
1275 unsigned long extra_remove_size,
1276 int i)
1277{
1278 int num_reg;
1279 static struct res_range range_new[RANGE_NUM];
1280 static int nr_range_new;
1281 unsigned long range_sums_new;
1282
1283 /* convert ranges to var ranges state */
1284 num_reg = x86_setup_var_mtrrs(range, nr_range,
1285 chunk_size, gran_size);
1286
1287 /* we got new setting in range_state, check it */
1288 memset(range_new, 0, sizeof(range_new));
1289 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1290 extra_remove_base, extra_remove_size);
1291 range_sums_new = sum_ranges(range_new, nr_range_new);
1292
1293 result[i].chunk_sizek = chunk_size >> 10;
1294 result[i].gran_sizek = gran_size >> 10;
1295 result[i].num_reg = num_reg;
1296 if (range_sums < range_sums_new) {
1297 result[i].lose_cover_sizek =
1298 (range_sums_new - range_sums) << PSHIFT;
1299 result[i].bad = 1;
1300 } else
1301 result[i].lose_cover_sizek =
1302 (range_sums - range_sums_new) << PSHIFT;
1303
1304 /* double check it */
1305 if (!result[i].bad && !result[i].lose_cover_sizek) {
1306 if (nr_range_new != nr_range ||
1307 memcmp(range, range_new, sizeof(range)))
1308 result[i].bad = 1;
1309 }
1310
1311 if (!result[i].bad && (range_sums - range_sums_new <
1312 min_loss_pfn[num_reg])) {
1313 min_loss_pfn[num_reg] =
1314 range_sums - range_sums_new;
1315 }
1316}
1317
1318static void __init mtrr_print_out_one_result(int i)
1319{
1320 char gran_factor, chunk_factor, lose_factor;
1321 unsigned long gran_base, chunk_base, lose_base;
1322
1323 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1324 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1325 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1326 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1327 result[i].bad ? "*BAD*" : " ",
1328 gran_base, gran_factor, chunk_base, chunk_factor);
1329 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1330 result[i].num_reg, result[i].bad ? "-" : "",
1331 lose_base, lose_factor);
1332}
1333
1334static int __init mtrr_search_optimal_index(void)
1335{
1336 int i;
1337 int num_reg_good;
1338 int index_good;
1339
1340 if (nr_mtrr_spare_reg >= num_var_ranges)
1341 nr_mtrr_spare_reg = num_var_ranges - 1;
1342 num_reg_good = -1;
1343 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1344 if (!min_loss_pfn[i])
1345 num_reg_good = i;
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 return index_good;
1361}
1362
1363
1364static int __init mtrr_cleanup(unsigned address_bits)
1365{
1366 unsigned long extra_remove_base, extra_remove_size;
1367 unsigned long base, size, def, dummy;
1368 mtrr_type type;
1369 u64 chunk_size, gran_size;
1370 int index_good;
1371 int i;
1372
1373 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1374 return 0;
1375 rdmsr(MTRRdefType_MSR, def, dummy);
1376 def &= 0xff;
1377 if (def != MTRR_TYPE_UNCACHABLE)
1378 return 0;
1379
1380 /* get it and store it aside */
1381 memset(range_state, 0, sizeof(range_state));
1382 for (i = 0; i < num_var_ranges; i++) {
1383 mtrr_if->get(i, &base, &size, &type);
1384 range_state[i].base_pfn = base;
1385 range_state[i].size_pfn = size;
1386 range_state[i].type = type;
1387 }
1388
1389 	/* check if we need to handle it and can handle it */
1390 if (!mtrr_need_cleanup())
1391 return 0;
1392
1393 	/* print original var MTRRs first, for debugging: */
1394 printk(KERN_DEBUG "original variable MTRRs\n");
1395 print_out_mtrr_range_state();
1396
1397 memset(range, 0, sizeof(range));
1398 extra_remove_size = 0;
1399 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1400 if (mtrr_tom2)
1401 extra_remove_size =
1402 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1403 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1404 extra_remove_size);
1405 /*
1406 	 * [0, 1M) should always be covered by var mtrr with WB
1407 	 * and fixed mtrrs should take effect before var mtrr for it
1408 */
1409 nr_range = add_range_with_merge(range, nr_range, 0,
1410 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1411 /* sort the ranges */
1412 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1413
1414 range_sums = sum_ranges(range, nr_range);
1415 	printk(KERN_INFO "total RAM covered: %ldM\n",
1416 range_sums >> (20 - PAGE_SHIFT));
1417
1418 if (mtrr_chunk_size && mtrr_gran_size) {
1419 i = 0;
1420 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1421 extra_remove_base, extra_remove_size, i);
1422
1423 mtrr_print_out_one_result(i);
1424
1425 if (!result[i].bad) {
1426 set_var_mtrr_all(address_bits);
1427 return 1;
1428 }
1429 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1430 "will find optimal one\n");
1431 }
1432
1433 i = 0;
1434 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1435 memset(result, 0, sizeof(result));
1436 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1437
1438 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1439 chunk_size <<= 1) {
1440
1441 if (i >= NUM_RESULT)
1442 continue;
1443
1444 mtrr_calc_range_state(chunk_size, gran_size,
1445 extra_remove_base, extra_remove_size, i);
1446 if (debug_print) {
1447 mtrr_print_out_one_result(i);
1448 printk(KERN_INFO "\n");
1449 }
1450
1451 i++;
1452 }
1453 }
1454
1455 /* try to find the optimal index */
1456 index_good = mtrr_search_optimal_index();
1457
1458 if (index_good != -1) {
1459 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1460 i = index_good;
1461 mtrr_print_out_one_result(i);
1462
1463 /* convert ranges to var ranges state */
1464 chunk_size = result[i].chunk_sizek;
1465 chunk_size <<= 10;
1466 gran_size = result[i].gran_sizek;
1467 gran_size <<= 10;
1468 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1469 set_var_mtrr_all(address_bits);
1470 printk(KERN_DEBUG "New variable MTRRs\n");
1471 print_out_mtrr_range_state();
1472 return 1;
1473 } else {
1474 /* print out all */
1475 for (i = 0; i < NUM_RESULT; i++)
1476 mtrr_print_out_one_result(i);
1477 }
1478
1479 	printk(KERN_INFO "mtrr_cleanup: cannot find optimal value\n");
1480 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
1481
1482 return 0;
1483}
1484#else
1485static int __init mtrr_cleanup(unsigned address_bits)
1486{
1487 return 0;
1488}
1489#endif
1490
1491static int __initdata changed_by_mtrr_cleanup;
1492
1493static int disable_mtrr_trim;
1494
1495static int __init disable_mtrr_trim_setup(char *str)
1496{
1497 disable_mtrr_trim = 1;
1498 return 0;
1499}
1500early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
1501
1502/*
1503 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
1504 * for memory >4GB. Check for that here.
1505 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
1506 * apply are wrong, but so far we don't know of any such case in the wild.
1507 */
1508#define Tom2Enabled (1U << 21)
1509#define Tom2ForceMemTypeWB (1U << 22)
1510
1511int __init amd_special_default_mtrr(void)
1512{
1513 u32 l, h;
1514
1515 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
1516 return 0;
1517 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
1518 return 0;
1519 /* In case some hypervisor doesn't pass SYSCFG through */
1520 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
1521 return 0;
1522 /*
1523 * Memory between 4GB and top of mem is forced WB by this magic bit.
1524 * Reserved before K8RevF, but should be zero there.
1525 */
1526 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
1527 (Tom2Enabled | Tom2ForceMemTypeWB))
1528 return 1;
1529 return 0;
1530}
1531
1532static u64 __init real_trim_memory(unsigned long start_pfn,
1533 unsigned long limit_pfn)
1534{
1535 u64 trim_start, trim_size;
1536 trim_start = start_pfn;
1537 trim_start <<= PAGE_SHIFT;
1538 trim_size = limit_pfn;
1539 trim_size <<= PAGE_SHIFT;
1540 trim_size -= trim_start;
1541
1542 return e820_update_range(trim_start, trim_size, E820_RAM,
1543 E820_RESERVED);
1544}
1545/**
1546 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
1547 * @end_pfn: ending page frame number
1548 *
1549 * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
1550 * memory configurations. This routine checks that the highest MTRR matches
1551 * the end of memory, to make sure the MTRRs with a write-back type cover
1552 * all of the memory the kernel intends to use. If not, it trims any
1553 * memory off the end by adjusting end_pfn, removing it from the kernel's
1554 * allocation pools and warning the user with an obnoxious message.
1555 */
1556int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1557{
1558 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1559 mtrr_type type;
1560 u64 total_trim_size;
1561
1562 /* extra one for all 0 */
1563 int num[MTRR_NUM_TYPES + 1];
1564 /*
1565 * Make sure we only trim uncachable memory on machines that
1566 * support the Intel MTRR architecture:
1567 */
1568 if (!is_cpu(INTEL) || disable_mtrr_trim)
1569 return 0;
1570 rdmsr(MTRRdefType_MSR, def, dummy);
1571 def &= 0xff;
1572 if (def != MTRR_TYPE_UNCACHABLE)
1573 return 0;
1574
1575 /* get it and store it aside */
1576 memset(range_state, 0, sizeof(range_state));
1577 for (i = 0; i < num_var_ranges; i++) {
1578 mtrr_if->get(i, &base, &size, &type);
1579 range_state[i].base_pfn = base;
1580 range_state[i].size_pfn = size;
1581 range_state[i].type = type;
1582 }
1583
1584 /* Find highest cached pfn */
1585 for (i = 0; i < num_var_ranges; i++) {
1586 type = range_state[i].type;
1587 if (type != MTRR_TYPE_WRBACK)
1588 continue;
1589 base = range_state[i].base_pfn;
1590 size = range_state[i].size_pfn;
1591 if (highest_pfn < base + size)
1592 highest_pfn = base + size;
1593 }
1594
1595 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1596 if (!highest_pfn) {
1597 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1598 return 0;
1599 }
1600
1601 /* check entries number */
1602 memset(num, 0, sizeof(num));
1603 for (i = 0; i < num_var_ranges; i++) {
1604 type = range_state[i].type;
1605 if (type >= MTRR_NUM_TYPES)
1606 continue;
1607 size = range_state[i].size_pfn;
1608 if (!size)
1609 type = MTRR_NUM_TYPES;
1610 num[type]++;
1611 }
1612
1613 /* no entry for WB? */
1614 if (!num[MTRR_TYPE_WRBACK])
1615 return 0;
1616
1617 /* check if we only had WB and UC */
1618 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1619 num_var_ranges - num[MTRR_NUM_TYPES])
1620 return 0;
1621
1622 memset(range, 0, sizeof(range));
1623 nr_range = 0;
1624 if (mtrr_tom2) {
1625 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1626 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1627 if (highest_pfn < range[nr_range].end + 1)
1628 highest_pfn = range[nr_range].end + 1;
1629 nr_range++;
1630 }
1631 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1632
1633 total_trim_size = 0;
1634 /* check the head */
1635 if (range[0].start)
1636 total_trim_size += real_trim_memory(0, range[0].start);
1637 /* check the holes */
1638 for (i = 0; i < nr_range - 1; i++) {
1639 if (range[i].end + 1 < range[i+1].start)
1640 total_trim_size += real_trim_memory(range[i].end + 1,
1641 range[i+1].start);
1642 }
1643 /* check the top */
1644 i = nr_range - 1;
1645 if (range[i].end + 1 < end_pfn)
1646 total_trim_size += real_trim_memory(range[i].end + 1,
1647 end_pfn);
1648
1649 if (total_trim_size) {
1650 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1651 " all of memory, losing %lluMB of RAM.\n",
1652 total_trim_size >> 20);
1653
1654 if (!changed_by_mtrr_cleanup)
1655 WARN_ON(1);
1656
1657 printk(KERN_INFO "update e820 for mtrr\n");
1658 update_e820();
1659
1660 return 1;
1661 }
1662
1663 return 0;
1664}
1665 614
1666/** 615/**
1667 * mtrr_bp_init - initialize mtrrs on the boot CPU 616 * mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6d..77f67f7b347a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
79 79
80extern unsigned int num_var_ranges; 80extern unsigned int num_var_ranges;
81extern u64 mtrr_tom2; 81extern u64 mtrr_tom2;
82extern struct mtrr_state_type mtrr_state;
82 83
83void mtrr_state_warn(void); 84void mtrr_state_warn(void);
84const char *mtrr_attrib_to_str(int x); 85const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
88int amd_init_mtrr(void); 89int amd_init_mtrr(void);
89int cyrix_init_mtrr(void); 90int cyrix_init_mtrr(void);
90int centaur_init_mtrr(void); 91int centaur_init_mtrr(void);
92
93extern int changed_by_mtrr_cleanup;
94extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5af..bb62b3e5caad 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
98#endif 98#endif
99} 99}
100 100
101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
102 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
103 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta, 104 .c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e0960..fd2c37bf7acb 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
8 * so no special init takes place. 8 * so no special init takes place.
9 */ 9 */
10 10
11static struct cpu_dev umc_cpu_dev __cpuinitdata = { 11static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
12 .c_vendor = "UMC", 12 .c_vendor = "UMC",
13 .c_ident = { "UMC UMC UMC" }, 13 .c_ident = { "UMC UMC UMC" },
14 .c_models = { 14 .c_models = {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 508bec1cee27..ef2c3563357d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
110/* 110/*
111 * Add a memory region to the kernel e820 map. 111 * Add a memory region to the kernel e820 map.
112 */ 112 */
113void __init e820_add_region(u64 start, u64 size, int type) 113static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
114 int type)
114{ 115{
115 int x = e820.nr_map; 116 int x = e820x->nr_map;
116 117
117 if (x == ARRAY_SIZE(e820.map)) { 118 if (x == ARRAY_SIZE(e820x->map)) {
118 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); 119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
119 return; 120 return;
120 } 121 }
121 122
122 e820.map[x].addr = start; 123 e820x->map[x].addr = start;
123 e820.map[x].size = size; 124 e820x->map[x].size = size;
124 e820.map[x].type = type; 125 e820x->map[x].type = type;
125 e820.nr_map++; 126 e820x->nr_map++;
127}
128
129void __init e820_add_region(u64 start, u64 size, int type)
130{
131 __e820_add_region(&e820, start, size, type);
132}
133
134static void __init e820_print_type(u32 type)
135{
136 switch (type) {
137 case E820_RAM:
138 case E820_RESERVED_KERN:
139 printk(KERN_CONT "(usable)");
140 break;
141 case E820_RESERVED:
142 printk(KERN_CONT "(reserved)");
143 break;
144 case E820_ACPI:
145 printk(KERN_CONT "(ACPI data)");
146 break;
147 case E820_NVS:
148 printk(KERN_CONT "(ACPI NVS)");
149 break;
150 case E820_UNUSABLE:
151 printk(KERN_CONT "(unusable)");
152 break;
153 default:
154 printk(KERN_CONT "type %u", type);
155 break;
156 }
126} 157}
127 158
128void __init e820_print_map(char *who) 159void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
134 (unsigned long long) e820.map[i].addr, 165 (unsigned long long) e820.map[i].addr,
135 (unsigned long long) 166 (unsigned long long)
136 (e820.map[i].addr + e820.map[i].size)); 167 (e820.map[i].addr + e820.map[i].size));
137 switch (e820.map[i].type) { 168 e820_print_type(e820.map[i].type);
138 case E820_RAM: 169 printk(KERN_CONT "\n");
139 case E820_RESERVED_KERN:
140 printk(KERN_CONT "(usable)\n");
141 break;
142 case E820_RESERVED:
143 printk(KERN_CONT "(reserved)\n");
144 break;
145 case E820_ACPI:
146 printk(KERN_CONT "(ACPI data)\n");
147 break;
148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n");
150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
154 default:
155 printk(KERN_CONT "type %u\n", e820.map[i].type);
156 break;
157 }
158 } 170 }
159} 171}
160 172
@@ -221,7 +233,7 @@ void __init e820_print_map(char *who)
221 */ 233 */
222 234
223int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, 235int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
224 int *pnr_map) 236 u32 *pnr_map)
225{ 237{
226 struct change_member { 238 struct change_member {
227 struct e820entry *pbios; /* pointer to original bios entry */ 239 struct e820entry *pbios; /* pointer to original bios entry */
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
417 return __append_e820_map(biosmap, nr_map); 429 return __append_e820_map(biosmap, nr_map);
418} 430}
419 431
420static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, 432static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
421 u64 size, unsigned old_type, 433 u64 size, unsigned old_type,
422 unsigned new_type) 434 unsigned new_type)
423{ 435{
424 int i; 436 u64 end;
437 unsigned int i;
425 u64 real_updated_size = 0; 438 u64 real_updated_size = 0;
426 439
427 BUG_ON(old_type == new_type); 440 BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
429 if (size > (ULLONG_MAX - start)) 442 if (size > (ULLONG_MAX - start))
430 size = ULLONG_MAX - start; 443 size = ULLONG_MAX - start;
431 444
432 for (i = 0; i < e820.nr_map; i++) { 445 end = start + size;
446 printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
447 (unsigned long long) start,
448 (unsigned long long) end);
449 e820_print_type(old_type);
450 printk(KERN_CONT " ==> ");
451 e820_print_type(new_type);
452 printk(KERN_CONT "\n");
453
454 for (i = 0; i < e820x->nr_map; i++) {
433 struct e820entry *ei = &e820x->map[i]; 455 struct e820entry *ei = &e820x->map[i];
434 u64 final_start, final_end; 456 u64 final_start, final_end;
457 u64 ei_end;
458
435 if (ei->type != old_type) 459 if (ei->type != old_type)
436 continue; 460 continue;
437 /* totally covered? */ 461
438 if (ei->addr >= start && 462 ei_end = ei->addr + ei->size;
439 (ei->addr + ei->size) <= (start + size)) { 463 /* totally covered by new range? */
464 if (ei->addr >= start && ei_end <= end) {
440 ei->type = new_type; 465 ei->type = new_type;
441 real_updated_size += ei->size; 466 real_updated_size += ei->size;
442 continue; 467 continue;
443 } 468 }
469
470 /* new range is totally covered? */
471 if (ei->addr < start && ei_end > end) {
472 __e820_add_region(e820x, start, size, new_type);
473 __e820_add_region(e820x, end, ei_end - end, ei->type);
474 ei->size = start - ei->addr;
475 real_updated_size += size;
476 continue;
477 }
478
444 /* partially covered */ 479 /* partially covered */
445 final_start = max(start, ei->addr); 480 final_start = max(start, ei->addr);
446 final_end = min(start + size, ei->addr + ei->size); 481 final_end = min(end, ei_end);
447 if (final_start >= final_end) 482 if (final_start >= final_end)
448 continue; 483 continue;
449 e820_add_region(final_start, final_end - final_start, 484
450 new_type); 485 __e820_add_region(e820x, final_start, final_end - final_start,
486 new_type);
487
451 real_updated_size += final_end - final_start; 488 real_updated_size += final_end - final_start;
452 489
490 /*
491 * left range could be head or tail, so need to update
492 * size at first.
493 */
453 ei->size -= final_end - final_start; 494 ei->size -= final_end - final_start;
454 if (ei->addr < final_start) 495 if (ei->addr < final_start)
455 continue; 496 continue;
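Editorial aside (not part of the patch): the new "totally covered" branch above splits one e820 entry in three when an interior subrange changes type. A self-contained model, using the conventional type codes 1 = usable RAM and 2 = reserved:

#include <stdio.h>
#include <stdint.h>

struct entry { uint64_t addr, size; int type; };

int main(void)
{
	struct entry e = { 0x100000, 0x7ff00000, 1 };	/* 1MB..2GB, usable */
	uint64_t start = 0x40000000, size = 0x10000000;	/* reserve 1GB..1.25GB */
	uint64_t end = start + size, e_end = e.addr + e.size;

	struct entry head = { e.addr, start - e.addr, e.type };	/* keeps old type */
	struct entry mid  = { start, size, 2 };			/* new type */
	struct entry tail = { end, e_end - end, e.type };	/* keeps old type */

	printf("head %#llx+%#llx, mid %#llx+%#llx, tail %#llx+%#llx\n",
	       (unsigned long long)head.addr, (unsigned long long)head.size,
	       (unsigned long long)mid.addr,  (unsigned long long)mid.size,
	       (unsigned long long)tail.addr, (unsigned long long)tail.size);
	return 0;
}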
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
461u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, 502u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
462 unsigned new_type) 503 unsigned new_type)
463{ 504{
464 return e820_update_range_map(&e820, start, size, old_type, new_type); 505 return __e820_update_range(&e820, start, size, old_type, new_type);
465} 506}
466 507
467static u64 __init e820_update_range_saved(u64 start, u64 size, 508static u64 __init e820_update_range_saved(u64 start, u64 size,
468 unsigned old_type, unsigned new_type) 509 unsigned old_type, unsigned new_type)
469{ 510{
470 return e820_update_range_map(&e820_saved, start, size, old_type, 511 return __e820_update_range(&e820_saved, start, size, old_type,
471 new_type); 512 new_type);
472} 513}
473 514
@@ -511,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
511 552
512void __init update_e820(void) 553void __init update_e820(void)
513{ 554{
514 int nr_map; 555 u32 nr_map;
515 556
516 nr_map = e820.nr_map; 557 nr_map = e820.nr_map;
517 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) 558 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -522,7 +563,7 @@ void __init update_e820(void)
522} 563}
523static void __init update_e820_saved(void) 564static void __init update_e820_saved(void)
524{ 565{
525 int nr_map; 566 u32 nr_map;
526 567
527 nr_map = e820_saved.nr_map; 568 nr_map = e820_saved.nr_map;
528 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) 569 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -1020,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1020 continue; 1061 continue;
1021 return addr; 1062 return addr;
1022 } 1063 }
1023 return -1UL;
1024 1064
1065 return -1ULL;
1025} 1066}
1026 1067
1027/* 1068/*
@@ -1034,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
1034 u64 start; 1075 u64 start;
1035 1076
1036 start = startt; 1077 start = startt;
1037 while (size < sizet) 1078 while (size < sizet && (start + 1))
1038 start = find_e820_area_size(start, &size, align); 1079 start = find_e820_area_size(start, &size, align);
1039 1080
1040 if (size < sizet) 1081 if (size < sizet)
1041 return 0; 1082 return 0;
1042 1083
1084#ifdef CONFIG_X86_32
1085 if (start >= MAXMEM)
1086 return 0;
1087 if (start + size > MAXMEM)
1088 size = MAXMEM - start;
1089#endif
1090
1043 addr = round_down(start + size - sizet, align); 1091 addr = round_down(start + size - sizet, align);
1092 if (addr < start)
1093 return 0;
1044 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 1094 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
1045 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 1095 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
1046 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 1096 printk(KERN_INFO "update e820 for early_reserve_e820\n");
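Editorial aside (not part of the patch): the new "addr < start" test above guards the round-down step. round_down(start + size - sizet, align) picks the highest align-aligned address whose sizet-byte block still fits in the free area; if the area is too small once aligned, the result lands below start and must be rejected. A small check of that arithmetic, assuming power-of-two alignment:

#include <stdio.h>
#include <stdint.h>

#define round_down(x, a)	((x) & ~((uint64_t)(a) - 1))	/* power-of-two 'a' */

int main(void)
{
	uint64_t start = 0x101000, size = 0x5000;	/* free area 0x101000-0x106000 */
	uint64_t sizet = 0x2000, align = 0x4000;	/* want 8K, 16K-aligned */
	uint64_t addr = round_down(start + size - sizet, align);

	if (addr < start)
		printf("does not fit once aligned\n");
	else
		printf("reserve %#llx + %#llx\n",	/* prints: reserve 0x104000 + 0x2000 */
		       (unsigned long long)addr, (unsigned long long)sizet);
	return 0;
}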
@@ -1253,7 +1303,7 @@ early_param("memmap", parse_memmap_opt);
1253void __init finish_e820_parsing(void) 1303void __init finish_e820_parsing(void)
1254{ 1304{
1255 if (userdef) { 1305 if (userdef) {
1256 int nr = e820.nr_map; 1306 u32 nr = e820.nr_map;
1257 1307
1258 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) 1308 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
1259 early_panic("Invalid user supplied memory map"); 1309 early_panic("Invalid user supplied memory map");
@@ -1336,7 +1386,7 @@ void __init e820_reserve_resources_late(void)
1336char *__init default_machine_specific_memory_setup(void) 1386char *__init default_machine_specific_memory_setup(void)
1337{ 1387{
1338 char *who = "BIOS-e820"; 1388 char *who = "BIOS-e820";
1339 int new_nr; 1389 u32 new_nr;
1340 /* 1390 /*
1341 * Try to copy the BIOS-supplied E820-map. 1391 * Try to copy the BIOS-supplied E820-map.
1342 * 1392 *
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 639ad98238a2..335f049d110f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); 250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251} 251}
252 252
253static void dbgp_mdelay(int ms) 253static void __init dbgp_mdelay(int ms)
254{ 254{
255 int i; 255 int i;
256 256
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
311 writel(hi, &ehci_debug->data47); 311 writel(hi, &ehci_debug->data47);
312} 312}
313 313
314static void dbgp_get_data(void *buf, int size) 314static void __init dbgp_get_data(void *buf, int size)
315{ 315{
316 unsigned char *bytes = buf; 316 unsigned char *bytes = buf;
317 u32 lo, hi; 317 u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
355 return ret; 355 return ret;
356} 356}
357 357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, 358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size) 359 int size)
360{ 360{
361 u32 pids, addr, ctrl; 361 u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
386 return ret; 386 return ret;
387} 387}
388 388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request, 389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int value, int index, void *data, int size) 390 int request, int value, int index, void *data, int size)
391{ 391{
392 u32 pids, addr, ctrl; 392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req; 393 struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
489 return 0; 489 return 0;
490} 490}
491 491
492static int ehci_reset_port(int port) 492static int __init ehci_reset_port(int port)
493{ 493{
494 u32 portsc; 494 u32 portsc;
495 u32 delay_time, delay; 495 u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
532 return -EBUSY; 532 return -EBUSY;
533} 533}
534 534
535static int ehci_wait_for_port(int port) 535static int __init ehci_wait_for_port(int port)
536{ 536{
537 u32 status; 537 u32 status;
538 int ret, reps; 538 int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
557 557
558typedef void (*set_debug_port_t)(int port); 558typedef void (*set_debug_port_t)(int port);
559 559
560static void default_set_debug_port(int port) 560static void __init default_set_debug_port(int port)
561{ 561{
562} 562}
563 563
564static set_debug_port_t set_debug_port = default_set_debug_port; 564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565 565
566static void nvidia_set_debug_port(int port) 566static void __init nvidia_set_debug_port(int port)
567{ 567{
568 u32 dword; 568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 899e8938e79f..c929add475c9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -442,8 +442,7 @@ sysenter_past_esp:
442 442
443 GET_THREAD_INFO(%ebp) 443 GET_THREAD_INFO(%ebp)
444 444
445 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 445 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
446 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
447 jnz sysenter_audit 446 jnz sysenter_audit
448sysenter_do_call: 447sysenter_do_call:
449 cmpl $(nr_syscalls), %eax 448 cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
454 DISABLE_INTERRUPTS(CLBR_ANY) 453 DISABLE_INTERRUPTS(CLBR_ANY)
455 TRACE_IRQS_OFF 454 TRACE_IRQS_OFF
456 movl TI_flags(%ebp), %ecx 455 movl TI_flags(%ebp), %ecx
457 testw $_TIF_ALLWORK_MASK, %cx 456 testl $_TIF_ALLWORK_MASK, %ecx
458 jne sysexit_audit 457 jne sysexit_audit
459sysenter_exit: 458sysenter_exit:
460/* if something modifies registers it must also disable sysexit */ 459/* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
468 467
469#ifdef CONFIG_AUDITSYSCALL 468#ifdef CONFIG_AUDITSYSCALL
470sysenter_audit: 469sysenter_audit:
471 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 470 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
472 jnz syscall_trace_entry 471 jnz syscall_trace_entry
473 addl $4,%esp 472 addl $4,%esp
474 CFI_ADJUST_CFA_OFFSET -4 473 CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
485 jmp sysenter_do_call 484 jmp sysenter_do_call
486 485
487sysexit_audit: 486sysexit_audit:
488 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 487 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
489 jne syscall_exit_work 488 jne syscall_exit_work
490 TRACE_IRQS_ON 489 TRACE_IRQS_ON
491 ENABLE_INTERRUPTS(CLBR_ANY) 490 ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
498 DISABLE_INTERRUPTS(CLBR_ANY) 497 DISABLE_INTERRUPTS(CLBR_ANY)
499 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
500 movl TI_flags(%ebp), %ecx 499 movl TI_flags(%ebp), %ecx
501 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 500 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
502 jne syscall_exit_work 501 jne syscall_exit_work
503 movl PT_EAX(%esp),%eax /* reload syscall return value */ 502 movl PT_EAX(%esp),%eax /* reload syscall return value */
504 jmp sysenter_exit 503 jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
523 SAVE_ALL 522 SAVE_ALL
524 GET_THREAD_INFO(%ebp) 523 GET_THREAD_INFO(%ebp)
525 # system call tracing in operation / emulation 524 # system call tracing in operation / emulation
526 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 525 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
527 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
528 jnz syscall_trace_entry 526 jnz syscall_trace_entry
529 cmpl $(nr_syscalls), %eax 527 cmpl $(nr_syscalls), %eax
530 jae syscall_badsys 528 jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
538 # between sampling and the iret 536 # between sampling and the iret
539 TRACE_IRQS_OFF 537 TRACE_IRQS_OFF
540 movl TI_flags(%ebp), %ecx 538 movl TI_flags(%ebp), %ecx
541 testw $_TIF_ALLWORK_MASK, %cx # current->work 539 testl $_TIF_ALLWORK_MASK, %ecx # current->work
542 jne syscall_exit_work 540 jne syscall_exit_work
543 541
544restore_all: 542restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
673 # perform syscall exit tracing 671 # perform syscall exit tracing
674 ALIGN 672 ALIGN
675syscall_exit_work: 673syscall_exit_work:
676 testb $_TIF_WORK_SYSCALL_EXIT, %cl 674 testl $_TIF_WORK_SYSCALL_EXIT, %ecx
677 jz work_pending 675 jz work_pending
678 TRACE_IRQS_ON 676 TRACE_IRQS_ON
679 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 677 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
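
The masks the testl instructions above check are built from individual TIF_* bits in thread_info.h. A minimal user-space sketch, with invented bit numbers, shows how the "all syscall-entry work except auditing" expression used by sysenter_audit is composed and tested:

#include <stdio.h>

/* Illustrative bit assignments only; the real ones live in asm/thread_info.h. */
#define TIF_SYSCALL_TRACE 0
#define TIF_SYSCALL_AUDIT 7
#define TIF_SECCOMP       8

#define _TIF_SYSCALL_TRACE (1u << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1u << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP       (1u << TIF_SECCOMP)

#define _TIF_WORK_SYSCALL_ENTRY \
        (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP)

int main(void)
{
    unsigned int ti_flags = _TIF_SYSCALL_AUDIT;   /* only audit work pending */

    /* sysenter_audit path: is there any entry work other than auditing? */
    if (ti_flags & (_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT))
        printf("take the slow syscall_trace_entry path\n");
    else
        printf("audit-only: stay on the fast path\n");
    return 0;
}
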
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 83d1836b9467..a331ec38af9e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -368,6 +368,7 @@ ENTRY(save_rest)
368END(save_rest) 368END(save_rest)
369 369
370/* save complete stack frame */ 370/* save complete stack frame */
371 .pushsection .kprobes.text, "ax"
371ENTRY(save_paranoid) 372ENTRY(save_paranoid)
372 XCPT_FRAME 1 RDI+8 373 XCPT_FRAME 1 RDI+8
373 cld 374 cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
3961: ret 3971: ret
397 CFI_ENDPROC 398 CFI_ENDPROC
398END(save_paranoid) 399END(save_paranoid)
400 .popsection
399 401
400/* 402/*
401 * A newly forked process directly context switches into this address. 403 * A newly forked process directly context switches into this address.
@@ -416,7 +418,6 @@ ENTRY(ret_from_fork)
416 418
417 GET_THREAD_INFO(%rcx) 419 GET_THREAD_INFO(%rcx)
418 420
419 CFI_REMEMBER_STATE
420 RESTORE_REST 421 RESTORE_REST
421 422
422 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -428,7 +429,6 @@ ENTRY(ret_from_fork)
428 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
429 jmp ret_from_sys_call # go to the SYSRET fastpath 430 jmp ret_from_sys_call # go to the SYSRET fastpath
430 431
431 CFI_RESTORE_STATE
432 CFI_ENDPROC 432 CFI_ENDPROC
433END(ret_from_fork) 433END(ret_from_fork)
434 434
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
984#endif 984#endif
985apicinterrupt LOCAL_TIMER_VECTOR \ 985apicinterrupt LOCAL_TIMER_VECTOR \
986 apic_timer_interrupt smp_apic_timer_interrupt 986 apic_timer_interrupt smp_apic_timer_interrupt
987apicinterrupt GENERIC_INTERRUPT_VECTOR \
988 generic_interrupt smp_generic_interrupt
987 989
988#ifdef CONFIG_SMP 990#ifdef CONFIG_SMP
989apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe182..3f8579f8d42c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
18{ 18{
19 reserve_trampoline_memory(); 19 reserve_trampoline_memory();
20 20
21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
22 22
23#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
24 /* Reserve INITRD */ 24 /* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 30 }
31#endif 31#endif
32 reserve_early(init_pg_tables_start, init_pg_tables_end,
33 "INIT_PG_TABLE");
34
35 reserve_ebda_region(); 32 reserve_ebda_region();
36 33
37 /* 34 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b272247690..70eaa852c732 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
100 100
101 reserve_trampoline_memory(); 101 reserve_trampoline_memory();
102 102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 104
105#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 106 /* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c32ca19d591a..30683883e0cd 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -38,42 +38,40 @@
38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id 38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
39 39
40/* 40/*
41 * This is how much memory *in addition to the memory covered up to 41 * This is how much memory in addition to the memory covered up to
42 * and including _end* we need mapped initially. 42 * and including _end we need mapped initially.
43 * We need: 43 * We need:
44 * - one bit for each possible page, but only in low memory, which means 44 * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
45 * 2^32/4096/8 = 128K worst case (4G/4G split.) 45 * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
46 * - enough space to map all low memory, which means
47 * (2^32/4096) / 1024 pages (worst case, non PAE)
48 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
49 * - a few pages for allocator use before the kernel pagetable has
50 * been set up
51 * 46 *
52 * Modulo rounding, each megabyte assigned here requires a kilobyte of 47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
53 * memory, which is currently unreclaimed. 48 * memory, which is currently unreclaimed.
54 * 49 *
55 * This should be a multiple of a page. 50 * This should be a multiple of a page.
51 *
52 * KERNEL_IMAGE_SIZE should be greater than pa(_end)
 53 * and smaller than max_low_pfn, otherwise some page table entries are wasted
56 */ 54 */
57LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
58
59/*
60 * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
61 * pagetables from above the 16MB DMA limit, so we'll have to set
62 * up pagetables 16MB more (worst-case):
63 */
64#ifdef CONFIG_DEBUG_PAGEALLOC
65LOW_PAGES = LOW_PAGES + 0x1000000
66#endif
67 55
68#if PTRS_PER_PMD > 1 56#if PTRS_PER_PMD > 1
69PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD 57#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
70#else 58#else
71PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) 59#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
72#endif 60#endif
73BOOTBITMAP_SIZE = LOW_PAGES / 8
74ALLOCATOR_SLOP = 4
75 61
76INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm 62/* Enough space to fit pagetables for the low memory linear map */
63MAPPING_BEYOND_END = \
64 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
65
66/*
67 * Worst-case size of the kernel mapping we need to make:
68 * the worst-case size of the kernel itself, plus the extra we need
69 * to map for the linear map.
70 */
71KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
72
73INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
74RESERVE_BRK(pagetables, INIT_MAP_SIZE)
77 75
78/* 76/*
79 * 32-bit kernel entrypoint; only used by the boot CPU. On entry, 77 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
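
The PAGE_TABLE_SIZE()/MAPPING_BEYOND_END/INIT_MAP_SIZE arithmetic above is easiest to sanity-check numerically. The sketch below redoes it in user space for one illustrative non-PAE configuration (PAGE_OFFSET 0xC0000000, a 512 MB KERNEL_IMAGE_SIZE); the constants are assumptions for the example, not values from any particular .config, and the integer division mirrors the truncation in the macro:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define PAGE_SIZE         (1u << PAGE_SHIFT)
#define PAGE_OFFSET       0xC0000000u
#define KERNEL_IMAGE_SIZE (512u << 20)      /* illustrative */
#define PTRS_PER_PGD      1024u             /* non-PAE: one level, 1024 PDEs */

/* non-PAE variant of PAGE_TABLE_SIZE(pages) */
static uint32_t page_table_size(uint32_t pages)
{
    return pages / PTRS_PER_PGD;
}

int main(void)
{
    uint32_t lowmem_pages = (uint32_t)((0x100000000ULL - PAGE_OFFSET) >> PAGE_SHIFT);
    uint32_t mapping_beyond_end = page_table_size(lowmem_pages) << PAGE_SHIFT;
    uint32_t kernel_pages = (KERNEL_IMAGE_SIZE + mapping_beyond_end) >> PAGE_SHIFT;
    uint32_t init_map_size = page_table_size(kernel_pages) * PAGE_SIZE;

    printf("lowmem linear map:   %u pages\n", lowmem_pages);
    printf("MAPPING_BEYOND_END:  %u KB\n", mapping_beyond_end >> 10);
    printf("INIT_MAP_SIZE (brk): %u KB\n", init_map_size >> 10);
    return 0;
}

With these assumed numbers the linear map needs 262144 lowmem pages, MAPPING_BEYOND_END comes out to 1 MB of extra coverage, and the brk reservation for boot page tables is 512 KB.
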
@@ -166,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
166 164
167/* 165/*
168 * Initialize page tables. This creates a PDE and a set of page 166 * Initialize page tables. This creates a PDE and a set of page
169 * tables, which are located immediately beyond _end. The variable 167 * tables, which are located immediately beyond __brk_base. The variable
170 * init_pg_tables_end is set up to point to the first "safe" location. 168 * _brk_end is set up to point to the first "safe" location.
171 * Mappings are created both at virtual address 0 (identity mapping) 169 * Mappings are created both at virtual address 0 (identity mapping)
172 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. 170 * and PAGE_OFFSET for up to _end.
173 * 171 *
174 * Note that the stack is not yet set up! 172 * Note that the stack is not yet set up!
175 */ 173 */
@@ -190,8 +188,7 @@ default_entry:
190 188
191 xorl %ebx,%ebx /* %ebx is kept at zero */ 189 xorl %ebx,%ebx /* %ebx is kept at zero */
192 190
193 movl $pa(pg0), %edi 191 movl $pa(__brk_base), %edi
194 movl %edi, pa(init_pg_tables_start)
195 movl $pa(swapper_pg_pmd), %edx 192 movl $pa(swapper_pg_pmd), %edx
196 movl $PTE_IDENT_ATTR, %eax 193 movl $PTE_IDENT_ATTR, %eax
19710: 19410:
@@ -209,14 +206,14 @@ default_entry:
209 loop 11b 206 loop 11b
210 207
211 /* 208 /*
212 * End condition: we must map up to and including INIT_MAP_BEYOND_END 209 * End condition: we must map up to the end + MAPPING_BEYOND_END.
213 * bytes beyond the end of our own page tables.
214 */ 210 */
215 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 211 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
216 cmpl %ebp,%eax 212 cmpl %ebp,%eax
217 jb 10b 213 jb 10b
2181: 2141:
219 movl %edi,pa(init_pg_tables_end) 215 addl $__PAGE_OFFSET, %edi
216 movl %edi, pa(_brk_end)
220 shrl $12, %eax 217 shrl $12, %eax
221 movl %eax, pa(max_pfn_mapped) 218 movl %eax, pa(max_pfn_mapped)
222 219
@@ -227,8 +224,7 @@ default_entry:
227 224
228page_pde_offset = (__PAGE_OFFSET >> 20); 225page_pde_offset = (__PAGE_OFFSET >> 20);
229 226
230 movl $pa(pg0), %edi 227 movl $pa(__brk_base), %edi
231 movl %edi, pa(init_pg_tables_start)
232 movl $pa(swapper_pg_dir), %edx 228 movl $pa(swapper_pg_dir), %edx
233 movl $PTE_IDENT_ATTR, %eax 229 movl $PTE_IDENT_ATTR, %eax
23410: 23010:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
242 addl $0x1000,%eax 238 addl $0x1000,%eax
243 loop 11b 239 loop 11b
244 /* 240 /*
245 * End condition: we must map up to and including INIT_MAP_BEYOND_END 241 * End condition: we must map up to the end + MAPPING_BEYOND_END.
246 * bytes beyond the end of our own page tables; the +0x007 is
247 * the attribute bits
248 */ 242 */
249 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 243 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
250 cmpl %ebp,%eax 244 cmpl %ebp,%eax
251 jb 10b 245 jb 10b
252 movl %edi,pa(init_pg_tables_end) 246 addl $__PAGE_OFFSET, %edi
247 movl %edi, pa(_brk_end)
253 shrl $12, %eax 248 shrl $12, %eax
254 movl %eax, pa(max_pfn_mapped) 249 movl %eax, pa(max_pfn_mapped)
255 250
@@ -636,6 +631,7 @@ swapper_pg_fixmap:
636 .fill 1024,4,0 631 .fill 1024,4,0
637ENTRY(empty_zero_page) 632ENTRY(empty_zero_page)
638 .fill 4096,1,0 633 .fill 4096,1,0
634
639/* 635/*
640 * This starts the data section. 636 * This starts the data section.
641 */ 637 */
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 10f92fb532f3..3475440baa54 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,17 +3,17 @@
3 * 3 *
4 */ 4 */
5#include <linux/clockchips.h> 5#include <linux/clockchips.h>
6#include <linux/init.h>
7#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/spinlock.h> 10#include <linux/delay.h>
11#include <linux/init.h>
12#include <linux/io.h>
11 13
12#include <asm/smp.h>
13#include <asm/delay.h>
14#include <asm/i8253.h> 14#include <asm/i8253.h>
15#include <asm/io.h>
16#include <asm/hpet.h> 15#include <asm/hpet.h>
16#include <asm/smp.h>
17 17
18DEFINE_SPINLOCK(i8253_lock); 18DEFINE_SPINLOCK(i8253_lock);
19EXPORT_SYMBOL(i8253_lock); 19EXPORT_SYMBOL(i8253_lock);
@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode,
40{ 40{
41 spin_lock(&i8253_lock); 41 spin_lock(&i8253_lock);
42 42
43 switch(mode) { 43 switch (mode) {
44 case CLOCK_EVT_MODE_PERIODIC: 44 case CLOCK_EVT_MODE_PERIODIC:
45 /* binary, mode 2, LSB/MSB, ch 0 */ 45 /* binary, mode 2, LSB/MSB, ch 0 */
46 outb_pit(0x34, PIT_MODE); 46 outb_pit(0x34, PIT_MODE);
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
95 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - 95 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
96 * !using_apic_timer decisions in do_timer_interrupt_hook() 96 * !using_apic_timer decisions in do_timer_interrupt_hook()
97 */ 97 */
98static struct clock_event_device pit_clockevent = { 98static struct clock_event_device pit_ce = {
99 .name = "pit", 99 .name = "pit",
100 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 100 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
101 .set_mode = init_pit_timer, 101 .set_mode = init_pit_timer,
@@ -114,15 +114,13 @@ void __init setup_pit_timer(void)
114 * Start pit with the boot cpu mask and make it global after the 114 * Start pit with the boot cpu mask and make it global after the
115 * IO_APIC has been initialized. 115 * IO_APIC has been initialized.
116 */ 116 */
117 pit_clockevent.cpumask = cpumask_of(smp_processor_id()); 117 pit_ce.cpumask = cpumask_of(smp_processor_id());
118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 118 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
119 pit_clockevent.shift); 119 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
120 pit_clockevent.max_delta_ns = 120 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
121 clockevent_delta2ns(0x7FFF, &pit_clockevent); 121
122 pit_clockevent.min_delta_ns = 122 clockevents_register_device(&pit_ce);
123 clockevent_delta2ns(0xF, &pit_clockevent); 123 global_clock_event = &pit_ce;
124 clockevents_register_device(&pit_clockevent);
125 global_clock_event = &pit_clockevent;
126} 124}
127 125
128#ifndef CONFIG_X86_64 126#ifndef CONFIG_X86_64
@@ -133,11 +131,11 @@ void __init setup_pit_timer(void)
133 */ 131 */
134static cycle_t pit_read(void) 132static cycle_t pit_read(void)
135{ 133{
134 static int old_count;
135 static u32 old_jifs;
136 unsigned long flags; 136 unsigned long flags;
137 int count; 137 int count;
138 u32 jifs; 138 u32 jifs;
139 static int old_count;
140 static u32 old_jifs;
141 139
142 spin_lock_irqsave(&i8253_lock, flags); 140 spin_lock_irqsave(&i8253_lock, flags);
143 /* 141 /*
@@ -179,9 +177,9 @@ static cycle_t pit_read(void)
179 * Previous attempts to handle these cases intelligently were 177 * Previous attempts to handle these cases intelligently were
180 * buggy, so we just do the simple thing now. 178 * buggy, so we just do the simple thing now.
181 */ 179 */
182 if (count > old_count && jifs == old_jifs) { 180 if (count > old_count && jifs == old_jifs)
183 count = old_count; 181 count = old_count;
184 } 182
185 old_count = count; 183 old_count = count;
186 old_jifs = jifs; 184 old_jifs = jifs;
187 185
@@ -192,13 +190,13 @@ static cycle_t pit_read(void)
192 return (cycle_t)(jifs * LATCH) + count; 190 return (cycle_t)(jifs * LATCH) + count;
193} 191}
194 192
195static struct clocksource clocksource_pit = { 193static struct clocksource pit_cs = {
196 .name = "pit", 194 .name = "pit",
197 .rating = 110, 195 .rating = 110,
198 .read = pit_read, 196 .read = pit_read,
199 .mask = CLOCKSOURCE_MASK(32), 197 .mask = CLOCKSOURCE_MASK(32),
200 .mult = 0, 198 .mult = 0,
201 .shift = 20, 199 .shift = 20,
202}; 200};
203 201
204static void pit_disable_clocksource(void) 202static void pit_disable_clocksource(void)
@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void)
206 /* 204 /*
207 * Use mult to check whether it is registered or not 205 * Use mult to check whether it is registered or not
208 */ 206 */
209 if (clocksource_pit.mult) { 207 if (pit_cs.mult) {
210 clocksource_unregister(&clocksource_pit); 208 clocksource_unregister(&pit_cs);
211 clocksource_pit.mult = 0; 209 pit_cs.mult = 0;
212 } 210 }
213} 211}
214 212
@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void)
222 * - when local APIC timer is active (PIT is switched off) 220 * - when local APIC timer is active (PIT is switched off)
223 */ 221 */
224 if (num_possible_cpus() > 1 || is_hpet_enabled() || 222 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
225 pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) 223 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
226 return 0; 224 return 0;
227 225
228 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 226 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
229 clocksource_pit.shift); 227
230 return clocksource_register(&clocksource_pit); 228 return clocksource_register(&pit_cs);
231} 229}
232arch_initcall(init_pit_clocksource); 230arch_initcall(init_pit_clocksource);
233 231
234#endif 232#endif /* !CONFIG_X86_64 */
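
The mult/shift values set up for pit_ce and pit_cs above implement a fixed-point cycles-to-nanoseconds conversion: ns is approximately (cycles * mult) >> shift. The user-space sketch below approximates the conversion for the PIT input clock; hz2mult() is an invented helper, not the kernel API, and it omits the kernel's rounding:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC    1000000000ULL
#define CLOCK_TICK_RATE 1193182ULL   /* PIT input clock, Hz */
#define PIT_SHIFT       20

/* cycles -> ns: ns = (cycles * mult) >> shift */
static uint64_t hz2mult(uint64_t hz, unsigned int shift)
{
    return (NSEC_PER_SEC << shift) / hz;
}

int main(void)
{
    uint64_t mult = hz2mult(CLOCK_TICK_RATE, PIT_SHIFT);
    uint64_t one_tick_ns = (1 * mult) >> PIT_SHIFT;
    uint64_t latch_ns = (11932 * mult) >> PIT_SHIFT; /* ~10 ms LATCH at HZ=100 */

    printf("mult = %llu\n", (unsigned long long)mult);
    printf("1 PIT cycle  ~= %llu ns\n", (unsigned long long)one_tick_ns);
    printf("11932 cycles ~= %llu ns\n", (unsigned long long)latch_ns);
    return 0;
}
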
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 720d2607aacb..a979b5bd2fc0 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -7,10 +7,10 @@
7 */ 7 */
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
11#include <linux/init.h>
12#include <linux/dmi.h> 12#include <linux/dmi.h>
13#include <asm/io.h> 13#include <linux/io.h>
14 14
15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; 15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
16 16
@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay);
47static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) 47static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
48{ 48{
49 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { 49 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
50 printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", 50 pr_notice("%s: using 0xed I/O delay port\n", id->ident);
51 id->ident);
52 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; 51 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
53 } 52 }
54 53
@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
64 .callback = dmi_io_delay_0xed_port, 63 .callback = dmi_io_delay_0xed_port,
65 .ident = "Compaq Presario V6000", 64 .ident = "Compaq Presario V6000",
66 .matches = { 65 .matches = {
67 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 66 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
68 DMI_MATCH(DMI_BOARD_NAME, "30B7") 67 DMI_MATCH(DMI_BOARD_NAME, "30B7")
69 } 68 }
70 }, 69 },
71 { 70 {
72 .callback = dmi_io_delay_0xed_port, 71 .callback = dmi_io_delay_0xed_port,
73 .ident = "HP Pavilion dv9000z", 72 .ident = "HP Pavilion dv9000z",
74 .matches = { 73 .matches = {
75 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 74 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
76 DMI_MATCH(DMI_BOARD_NAME, "30B9") 75 DMI_MATCH(DMI_BOARD_NAME, "30B9")
77 } 76 }
78 }, 77 },
79 { 78 {
80 .callback = dmi_io_delay_0xed_port, 79 .callback = dmi_io_delay_0xed_port,
81 .ident = "HP Pavilion dv6000", 80 .ident = "HP Pavilion dv6000",
82 .matches = { 81 .matches = {
83 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 82 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
84 DMI_MATCH(DMI_BOARD_NAME, "30B8") 83 DMI_MATCH(DMI_BOARD_NAME, "30B8")
85 } 84 }
86 }, 85 },
87 { 86 {
88 .callback = dmi_io_delay_0xed_port, 87 .callback = dmi_io_delay_0xed_port,
89 .ident = "HP Pavilion tx1000", 88 .ident = "HP Pavilion tx1000",
90 .matches = { 89 .matches = {
91 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 90 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 91 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 92 }
94 }, 93 },
95 { 94 {
96 .callback = dmi_io_delay_0xed_port, 95 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700", 96 .ident = "Presario F700",
98 .matches = { 97 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 98 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3") 99 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 } 100 }
102 }, 101 },
103 { } 102 { }
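
The quirk list above is a data-driven DMI match table: each entry carries board-identification strings plus a callback that switches io_delay_type. A compact user-space analog of that table walk, with struct layout, field names and strings all invented for the sketch:

#include <stdio.h>
#include <string.h>

struct quirk {
    const char *ident;
    const char *board_vendor;
    const char *board_name;
    int (*callback)(const struct quirk *q);
};

static int use_0xed_port(const struct quirk *q)
{
    printf("%s: using 0xed I/O delay port\n", q->ident);
    return 1;
}

static const struct quirk quirks[] = {
    { "Compaq Presario V6000", "Quanta", "30B7", use_0xed_port },
    { "HP Pavilion dv9000z",   "Quanta", "30B9", use_0xed_port },
    { NULL, NULL, NULL, NULL }
};

static void check_quirks(const char *vendor, const char *name)
{
    const struct quirk *q;

    for (q = quirks; q->ident; q++)
        if (!strcmp(q->board_vendor, vendor) && !strcmp(q->board_name, name))
            q->callback(q);
}

int main(void)
{
    check_quirks("Quanta", "30B9");   /* hypothetical DMI strings */
    return 0;
}
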
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca1650aaf..3aaf7b9e3a8b 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
15 15
16atomic_t irq_err_count; 16atomic_t irq_err_count;
17 17
18/* Function pointer for generic interrupt vector handling */
19void (*generic_interrupt_extension)(void) = NULL;
20
18/* 21/*
19 * 'what should we do if we get a hw irq event on an illegal vector'. 22 * 'what should we do if we get a hw irq event on an illegal vector'.
20 * each architecture has to answer this themselves. 23 * each architecture has to answer this themselves.
@@ -42,55 +45,60 @@ void ack_bad_irq(unsigned int irq)
42/* 45/*
43 * /proc/interrupts printing: 46 * /proc/interrupts printing:
44 */ 47 */
45static int show_other_interrupts(struct seq_file *p) 48static int show_other_interrupts(struct seq_file *p, int prec)
46{ 49{
47 int j; 50 int j;
48 51
49 seq_printf(p, "NMI: "); 52 seq_printf(p, "%*s: ", prec, "NMI");
50 for_each_online_cpu(j) 53 for_each_online_cpu(j)
51 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); 54 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
52 seq_printf(p, " Non-maskable interrupts\n"); 55 seq_printf(p, " Non-maskable interrupts\n");
53#ifdef CONFIG_X86_LOCAL_APIC 56#ifdef CONFIG_X86_LOCAL_APIC
54 seq_printf(p, "LOC: "); 57 seq_printf(p, "%*s: ", prec, "LOC");
55 for_each_online_cpu(j) 58 for_each_online_cpu(j)
56 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 59 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
57 seq_printf(p, " Local timer interrupts\n"); 60 seq_printf(p, " Local timer interrupts\n");
61
62 seq_printf(p, "%*s: ", prec, "SPU");
63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n");
58#endif 66#endif
67 if (generic_interrupt_extension) {
68 seq_printf(p, "PLT: ");
69 for_each_online_cpu(j)
70 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
71 seq_printf(p, " Platform interrupts\n");
72 }
59#ifdef CONFIG_SMP 73#ifdef CONFIG_SMP
60 seq_printf(p, "RES: "); 74 seq_printf(p, "%*s: ", prec, "RES");
61 for_each_online_cpu(j) 75 for_each_online_cpu(j)
62 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); 76 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
63 seq_printf(p, " Rescheduling interrupts\n"); 77 seq_printf(p, " Rescheduling interrupts\n");
64 seq_printf(p, "CAL: "); 78 seq_printf(p, "%*s: ", prec, "CAL");
65 for_each_online_cpu(j) 79 for_each_online_cpu(j)
66 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); 80 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
67 seq_printf(p, " Function call interrupts\n"); 81 seq_printf(p, " Function call interrupts\n");
68 seq_printf(p, "TLB: "); 82 seq_printf(p, "%*s: ", prec, "TLB");
69 for_each_online_cpu(j) 83 for_each_online_cpu(j)
70 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 84 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
71 seq_printf(p, " TLB shootdowns\n"); 85 seq_printf(p, " TLB shootdowns\n");
72#endif 86#endif
73#ifdef CONFIG_X86_MCE 87#ifdef CONFIG_X86_MCE
74 seq_printf(p, "TRM: "); 88 seq_printf(p, "%*s: ", prec, "TRM");
75 for_each_online_cpu(j) 89 for_each_online_cpu(j)
76 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 90 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
77 seq_printf(p, " Thermal event interrupts\n"); 91 seq_printf(p, " Thermal event interrupts\n");
78# ifdef CONFIG_X86_64 92# ifdef CONFIG_X86_64
79 seq_printf(p, "THR: "); 93 seq_printf(p, "%*s: ", prec, "THR");
80 for_each_online_cpu(j) 94 for_each_online_cpu(j)
81 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 95 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
82 seq_printf(p, " Threshold APIC interrupts\n"); 96 seq_printf(p, " Threshold APIC interrupts\n");
83# endif 97# endif
84#endif 98#endif
85#ifdef CONFIG_X86_LOCAL_APIC 99 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
86 seq_printf(p, "SPU: ");
87 for_each_online_cpu(j)
88 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
89 seq_printf(p, " Spurious interrupts\n");
90#endif
91 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
92#if defined(CONFIG_X86_IO_APIC) 100#if defined(CONFIG_X86_IO_APIC)
93 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); 101 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
94#endif 102#endif
95 return 0; 103 return 0;
96} 104}
@@ -98,19 +106,22 @@ static int show_other_interrupts(struct seq_file *p)
98int show_interrupts(struct seq_file *p, void *v) 106int show_interrupts(struct seq_file *p, void *v)
99{ 107{
100 unsigned long flags, any_count = 0; 108 unsigned long flags, any_count = 0;
101 int i = *(loff_t *) v, j; 109 int i = *(loff_t *) v, j, prec;
102 struct irqaction *action; 110 struct irqaction *action;
103 struct irq_desc *desc; 111 struct irq_desc *desc;
104 112
105 if (i > nr_irqs) 113 if (i > nr_irqs)
106 return 0; 114 return 0;
107 115
116 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
117 j *= 10;
118
108 if (i == nr_irqs) 119 if (i == nr_irqs)
109 return show_other_interrupts(p); 120 return show_other_interrupts(p, prec);
110 121
111 /* print header */ 122 /* print header */
112 if (i == 0) { 123 if (i == 0) {
113 seq_printf(p, " "); 124 seq_printf(p, "%*s", prec + 8, "");
114 for_each_online_cpu(j) 125 for_each_online_cpu(j)
115 seq_printf(p, "CPU%-8d", j); 126 seq_printf(p, "CPU%-8d", j);
116 seq_putc(p, '\n'); 127 seq_putc(p, '\n');
@@ -121,23 +132,15 @@ int show_interrupts(struct seq_file *p, void *v)
121 return 0; 132 return 0;
122 133
123 spin_lock_irqsave(&desc->lock, flags); 134 spin_lock_irqsave(&desc->lock, flags);
124#ifndef CONFIG_SMP
125 any_count = kstat_irqs(i);
126#else
127 for_each_online_cpu(j) 135 for_each_online_cpu(j)
128 any_count |= kstat_irqs_cpu(i, j); 136 any_count |= kstat_irqs_cpu(i, j);
129#endif
130 action = desc->action; 137 action = desc->action;
131 if (!action && !any_count) 138 if (!action && !any_count)
132 goto out; 139 goto out;
133 140
134 seq_printf(p, "%3d: ", i); 141 seq_printf(p, "%*d: ", prec, i);
135#ifndef CONFIG_SMP
136 seq_printf(p, "%10u ", kstat_irqs(i));
137#else
138 for_each_online_cpu(j) 142 for_each_online_cpu(j)
139 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 143 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
140#endif
141 seq_printf(p, " %8s", desc->chip->name); 144 seq_printf(p, " %8s", desc->chip->name);
142 seq_printf(p, "-%-8s", desc->name); 145 seq_printf(p, "-%-8s", desc->name);
143 146
@@ -162,7 +165,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
162 165
163#ifdef CONFIG_X86_LOCAL_APIC 166#ifdef CONFIG_X86_LOCAL_APIC
164 sum += irq_stats(cpu)->apic_timer_irqs; 167 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count;
165#endif 169#endif
170 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs;
166#ifdef CONFIG_SMP 172#ifdef CONFIG_SMP
167 sum += irq_stats(cpu)->irq_resched_count; 173 sum += irq_stats(cpu)->irq_resched_count;
168 sum += irq_stats(cpu)->irq_call_count; 174 sum += irq_stats(cpu)->irq_call_count;
@@ -174,9 +180,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
174 sum += irq_stats(cpu)->irq_threshold_count; 180 sum += irq_stats(cpu)->irq_threshold_count;
175#endif 181#endif
176#endif 182#endif
177#ifdef CONFIG_X86_LOCAL_APIC
178 sum += irq_stats(cpu)->irq_spurious_count;
179#endif
180 return sum; 183 return sum;
181} 184}
182 185
@@ -226,4 +229,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
226 return 1; 229 return 1;
227} 230}
228 231
232/*
233 * Handler for GENERIC_INTERRUPT_VECTOR.
234 */
235void smp_generic_interrupt(struct pt_regs *regs)
236{
237 struct pt_regs *old_regs = set_irq_regs(regs);
238
239 ack_APIC_irq();
240
241 exit_idle();
242
243 irq_enter();
244
245 inc_irq_stat(generic_irqs);
246
247 if (generic_interrupt_extension)
248 generic_interrupt_extension();
249
250 irq_exit();
251
252 set_irq_regs(old_regs);
253}
254
229EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 255EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
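
The prec loop added above sizes the label column of /proc/interrupts from nr_irqs: it starts at three characters and grows by one per power of ten at or below nr_irqs, capped at ten. A quick user-space check of that loop with a few illustrative nr_irqs values:

#include <stdio.h>

static int label_width(int nr_irqs)
{
    int prec, j;

    for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
        j *= 10;
    return prec;
}

int main(void)
{
    int samples[] = { 224, 1000, 4096, 100000 };   /* illustrative nr_irqs values */
    unsigned int i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("nr_irqs = %6d -> label width %d\n",
               samples[i], label_width(samples[i]));
    return 0;
}
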
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a3006c..bc1326105448 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
175 /* self generated IPI for local APIC timer */ 175 /* self generated IPI for local APIC timer */
176 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 176 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
177 177
178 /* generic IPI for platform specific use */
179 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
180
178 /* IPI vectors for APIC spurious and error interrupts */ 181 /* IPI vectors for APIC spurious and error interrupts */
179 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 182 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
180 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 183 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f30..c7a49e0ffbfb 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
147 /* self generated IPI for local APIC timer */ 147 /* self generated IPI for local APIC timer */
148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
149 149
150 /* generic IPI for platform specific use */
151 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
152
150 /* IPI vectors for APIC spurious and error interrupts */ 153 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 154 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 155 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index ff7d3b0124f1..e444357375ce 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -8,11 +8,11 @@
8 */ 8 */
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/stat.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/stat.h>
13#include <linux/io.h> 14#include <linux/io.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/module.h>
16 16
17#include <asm/setup.h> 17#include <asm/setup.h>
18 18
@@ -26,9 +26,8 @@ struct setup_data_node {
26 u32 len; 26 u32 len;
27}; 27};
28 28
29static ssize_t 29static ssize_t setup_data_read(struct file *file, char __user *user_buf,
30setup_data_read(struct file *file, char __user *user_buf, size_t count, 30 size_t count, loff_t *ppos)
31 loff_t *ppos)
32{ 31{
33 struct setup_data_node *node = file->private_data; 32 struct setup_data_node *node = file->private_data;
34 unsigned long remain; 33 unsigned long remain;
@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
39 38
40 if (pos < 0) 39 if (pos < 0)
41 return -EINVAL; 40 return -EINVAL;
41
42 if (pos >= node->len) 42 if (pos >= node->len)
43 return 0; 43 return 0;
44 44
45 if (count > node->len - pos) 45 if (count > node->len - pos)
46 count = node->len - pos; 46 count = node->len - pos;
47
47 pa = node->paddr + sizeof(struct setup_data) + pos; 48 pa = node->paddr + sizeof(struct setup_data) + pos;
48 pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); 49 pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
49 if (PageHighMem(pg)) { 50 if (PageHighMem(pg)) {
50 p = ioremap_cache(pa, count); 51 p = ioremap_cache(pa, count);
51 if (!p) 52 if (!p)
52 return -ENXIO; 53 return -ENXIO;
53 } else { 54 } else
54 p = __va(pa); 55 p = __va(pa);
55 }
56 56
57 remain = copy_to_user(user_buf, p, count); 57 remain = copy_to_user(user_buf, p, count);
58 58
@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
70static int setup_data_open(struct inode *inode, struct file *file) 70static int setup_data_open(struct inode *inode, struct file *file)
71{ 71{
72 file->private_data = inode->i_private; 72 file->private_data = inode->i_private;
73
73 return 0; 74 return 0;
74} 75}
75 76
76static const struct file_operations fops_setup_data = { 77static const struct file_operations fops_setup_data = {
77 .read = setup_data_read, 78 .read = setup_data_read,
78 .open = setup_data_open, 79 .open = setup_data_open,
79}; 80};
80 81
81static int __init 82static int __init
@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no,
84{ 85{
85 struct dentry *d, *type, *data; 86 struct dentry *d, *type, *data;
86 char buf[16]; 87 char buf[16];
87 int error;
88 88
89 sprintf(buf, "%d", no); 89 sprintf(buf, "%d", no);
90 d = debugfs_create_dir(buf, parent); 90 d = debugfs_create_dir(buf, parent);
91 if (!d) { 91 if (!d)
92 error = -ENOMEM; 92 return -ENOMEM;
93 goto err_return; 93
94 }
95 type = debugfs_create_x32("type", S_IRUGO, d, &node->type); 94 type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
96 if (!type) { 95 if (!type)
97 error = -ENOMEM;
98 goto err_dir; 96 goto err_dir;
99 } 97
100 data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); 98 data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
101 if (!data) { 99 if (!data)
102 error = -ENOMEM;
103 goto err_type; 100 goto err_type;
104 } 101
105 return 0; 102 return 0;
106 103
107err_type: 104err_type:
108 debugfs_remove(type); 105 debugfs_remove(type);
109err_dir: 106err_dir:
110 debugfs_remove(d); 107 debugfs_remove(d);
111err_return: 108 return -ENOMEM;
112 return error;
113} 109}
114 110
115static int __init create_setup_data_nodes(struct dentry *parent) 111static int __init create_setup_data_nodes(struct dentry *parent)
116{ 112{
117 struct setup_data_node *node; 113 struct setup_data_node *node;
118 struct setup_data *data; 114 struct setup_data *data;
119 int error, no = 0; 115 int error = -ENOMEM;
120 struct dentry *d; 116 struct dentry *d;
121 struct page *pg; 117 struct page *pg;
122 u64 pa_data; 118 u64 pa_data;
119 int no = 0;
123 120
124 d = debugfs_create_dir("setup_data", parent); 121 d = debugfs_create_dir("setup_data", parent);
125 if (!d) { 122 if (!d)
126 error = -ENOMEM; 123 return -ENOMEM;
127 goto err_return;
128 }
129 124
130 pa_data = boot_params.hdr.setup_data; 125 pa_data = boot_params.hdr.setup_data;
131 126
132 while (pa_data) { 127 while (pa_data) {
133 node = kmalloc(sizeof(*node), GFP_KERNEL); 128 node = kmalloc(sizeof(*node), GFP_KERNEL);
134 if (!node) { 129 if (!node)
135 error = -ENOMEM;
136 goto err_dir; 130 goto err_dir;
137 } 131
138 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 132 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
139 if (PageHighMem(pg)) { 133 if (PageHighMem(pg)) {
140 data = ioremap_cache(pa_data, sizeof(*data)); 134 data = ioremap_cache(pa_data, sizeof(*data));
@@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
143 error = -ENXIO; 137 error = -ENXIO;
144 goto err_dir; 138 goto err_dir;
145 } 139 }
146 } else { 140 } else
147 data = __va(pa_data); 141 data = __va(pa_data);
148 }
149 142
150 node->paddr = pa_data; 143 node->paddr = pa_data;
151 node->type = data->type; 144 node->type = data->type;
@@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent)
159 goto err_dir; 152 goto err_dir;
160 no++; 153 no++;
161 } 154 }
155
162 return 0; 156 return 0;
163 157
164err_dir: 158err_dir:
165 debugfs_remove(d); 159 debugfs_remove(d);
166err_return:
167 return error; 160 return error;
168} 161}
169 162
@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = {
175static int __init boot_params_kdebugfs_init(void) 168static int __init boot_params_kdebugfs_init(void)
176{ 169{
177 struct dentry *dbp, *version, *data; 170 struct dentry *dbp, *version, *data;
178 int error; 171 int error = -ENOMEM;
179 172
180 dbp = debugfs_create_dir("boot_params", NULL); 173 dbp = debugfs_create_dir("boot_params", NULL);
181 if (!dbp) { 174 if (!dbp)
182 error = -ENOMEM; 175 return -ENOMEM;
183 goto err_return; 176
184 }
185 version = debugfs_create_x16("version", S_IRUGO, dbp, 177 version = debugfs_create_x16("version", S_IRUGO, dbp,
186 &boot_params.hdr.version); 178 &boot_params.hdr.version);
187 if (!version) { 179 if (!version)
188 error = -ENOMEM;
189 goto err_dir; 180 goto err_dir;
190 } 181
191 data = debugfs_create_blob("data", S_IRUGO, dbp, 182 data = debugfs_create_blob("data", S_IRUGO, dbp,
192 &boot_params_blob); 183 &boot_params_blob);
193 if (!data) { 184 if (!data)
194 error = -ENOMEM;
195 goto err_version; 185 goto err_version;
196 } 186
197 error = create_setup_data_nodes(dbp); 187 error = create_setup_data_nodes(dbp);
198 if (error) 188 if (error)
199 goto err_data; 189 goto err_data;
190
200 return 0; 191 return 0;
201 192
202err_data: 193err_data:
@@ -205,10 +196,9 @@ err_version:
205 debugfs_remove(version); 196 debugfs_remove(version);
206err_dir: 197err_dir:
207 debugfs_remove(dbp); 198 debugfs_remove(dbp);
208err_return:
209 return error; 199 return error;
210} 200}
211#endif 201#endif /* CONFIG_DEBUG_BOOT_PARAMS */
212 202
213static int __init arch_kdebugfs_init(void) 203static int __init arch_kdebugfs_init(void)
214{ 204{
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 4558dd3918cf..55b94614e348 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,7 +193,7 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
193 kprobe_opcode_t opcode; 193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes; 194 kprobe_opcode_t *orig_opcodes = opcodes;
195 195
196 if (search_exception_tables(opcodes)) 196 if (search_exception_tables((unsigned long)opcodes))
197 return 0; /* Page fault may occur on this address. */ 197 return 0; /* Page fault may occur on this address. */
198 198
199retry: 199retry:
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986eca..33019ddb56b4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
138 kvm_mmu_write(ptep, pte_val(pte)); 138 kvm_mmu_write(ptep, pte_val(pte));
139} 139}
140 140
141static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
142 pte_t *ptep, pte_t pte)
143{
144 kvm_mmu_write(ptep, pte_val(pte));
145}
146
147static void kvm_pte_clear(struct mm_struct *mm, 141static void kvm_pte_clear(struct mm_struct *mm,
148 unsigned long addr, pte_t *ptep) 142 unsigned long addr, pte_t *ptep)
149{ 143{
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
220#if PAGETABLE_LEVELS >= 3 214#if PAGETABLE_LEVELS >= 3
221#ifdef CONFIG_X86_PAE 215#ifdef CONFIG_X86_PAE
222 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; 216 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
223 pv_mmu_ops.set_pte_present = kvm_set_pte_present;
224 pv_mmu_ops.pte_clear = kvm_pte_clear; 217 pv_mmu_ops.pte_clear = kvm_pte_clear;
225 pv_mmu_ops.pmd_clear = kvm_pmd_clear; 218 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
226#endif 219#endif
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a62..e7368c1da01d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h> 15#include <linux/suspend.h>
16#include <linux/gfp.h> 16#include <linux/gfp.h>
17#include <linux/io.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19#include <asm/pgalloc.h> 20#include <asm/pgalloc.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/io.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/cpufeature.h> 24#include <asm/cpufeature.h>
25#include <asm/desc.h> 25#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
63 "\tmovl %%eax,%%fs\n" 63 "\tmovl %%eax,%%fs\n"
64 "\tmovl %%eax,%%gs\n" 64 "\tmovl %%eax,%%gs\n"
65 "\tmovl %%eax,%%ss\n" 65 "\tmovl %%eax,%%ss\n"
66 ::: "eax", "memory"); 66 : : : "eax", "memory");
67#undef STR 67#undef STR
68#undef __STR 68#undef __STR
69} 69}
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
208 /* We need to put APICs in legacy mode so that we can 208 /*
209 * We need to put APICs in legacy mode so that we can
209 * get timer interrupts in second kernel. kexec/kdump 210 * get timer interrupts in second kernel. kexec/kdump
210 * paths already have calls to disable_IO_APIC() in 211 * paths already have calls to disable_IO_APIC() in
211 * one form or other. kexec jump path also need 212 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
227 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) 228 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
228 << PAGE_SHIFT); 229 << PAGE_SHIFT);
229 230
230 /* The segment registers are funny things, they have both a 231 /*
232 * The segment registers are funny things, they have both a
231 * visible and an invisible part. Whenever the visible part is 233 * visible and an invisible part. Whenever the visible part is
232 * set to a specific selector, the invisible part is loaded 234 * set to a specific selector, the invisible part is loaded
233 * with from a table in memory. At no other time is the 235 * with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
237 * segments, before I zap the gdt with an invalid value. 239 * segments, before I zap the gdt with an invalid value.
238 */ 240 */
239 load_segments(); 241 load_segments();
240 /* The gdt & idt are now invalid. 242 /*
243 * The gdt & idt are now invalid.
241 * If you want to load them you must set up your own idt & gdt. 244 * If you want to load them you must set up your own idt & gdt.
242 */ 245 */
243 set_gdt(phys_to_virt(0),0); 246 set_gdt(phys_to_virt(0), 0);
244 set_idt(phys_to_virt(0),0); 247 set_idt(phys_to_virt(0), 0);
245 248
246 /* now call it */ 249 /* now call it */
247 image->start = relocate_kernel_ptr((unsigned long)image->head, 250 image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd8..89cea4d44679 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,47 @@
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/io.h>
16#include <linux/suspend.h>
15 17
16#include <asm/pgtable.h> 18#include <asm/pgtable.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19#include <asm/io.h> 21
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr)
24{
25 pud_t *pud;
26 pmd_t *pmd;
27 struct page *page;
28 int result = -ENOMEM;
29
30 addr &= PMD_MASK;
31 pgd += pgd_index(addr);
32 if (!pgd_present(*pgd)) {
33 page = kimage_alloc_control_pages(image, 0);
34 if (!page)
35 goto out;
36 pud = (pud_t *)page_address(page);
37 memset(pud, 0, PAGE_SIZE);
38 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
39 }
40 pud = pud_offset(pgd, addr);
41 if (!pud_present(*pud)) {
42 page = kimage_alloc_control_pages(image, 0);
43 if (!page)
44 goto out;
45 pmd = (pmd_t *)page_address(page);
46 memset(pmd, 0, PAGE_SIZE);
47 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
48 }
49 pmd = pmd_offset(pud, addr);
50 if (!pmd_present(*pmd))
51 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
52 result = 0;
53out:
54 return result;
55}
20 56
21static void init_level2_page(pmd_t *level2p, unsigned long addr) 57static void init_level2_page(pmd_t *level2p, unsigned long addr)
22{ 58{
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
83 } 119 }
84 level3p = (pud_t *)page_address(page); 120 level3p = (pud_t *)page_address(page);
85 result = init_level3_page(image, level3p, addr, last_addr); 121 result = init_level3_page(image, level3p, addr, last_addr);
86 if (result) { 122 if (result)
87 goto out; 123 goto out;
88 }
89 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); 124 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
90 addr += PGDIR_SIZE; 125 addr += PGDIR_SIZE;
91 } 126 }
@@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
156 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 191 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
157 if (result) 192 if (result)
158 return result; 193 return result;
194 /*
 195 * image->start may be outside 0 ~ max_pfn, for example when
 196 * jumping back to the original kernel from the kexeced kernel
197 */
198 result = init_one_level2_page(image, level4p, image->start);
199 if (result)
200 return result;
159 return init_transition_pgtable(image, level4p); 201 return init_transition_pgtable(image, level4p);
160} 202}
161 203
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
229{ 271{
230 unsigned long page_list[PAGES_NR]; 272 unsigned long page_list[PAGES_NR];
231 void *control_page; 273 void *control_page;
274 int save_ftrace_enabled;
232 275
233 tracer_disable(); 276#ifdef CONFIG_KEXEC_JUMP
277 if (kexec_image->preserve_context)
278 save_processor_state();
279#endif
280
281 save_ftrace_enabled = __ftrace_enabled_save();
234 282
235 /* Interrupts aren't acceptable while we reboot */ 283 /* Interrupts aren't acceptable while we reboot */
236 local_irq_disable(); 284 local_irq_disable();
237 285
286 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC
288 /*
289 * We need to put APICs in legacy mode so that we can
290 * get timer interrupts in second kernel. kexec/kdump
291 * paths already have calls to disable_IO_APIC() in
 292 * one form or another. The kexec jump path also needs
293 * one.
294 */
295 disable_IO_APIC();
296#endif
297 }
298
238 control_page = page_address(image->control_code_page) + PAGE_SIZE; 299 control_page = page_address(image->control_code_page) + PAGE_SIZE;
239 memcpy(control_page, relocate_kernel, PAGE_SIZE); 300 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
240 301
241 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); 302 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
303 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
242 page_list[PA_TABLE_PAGE] = 304 page_list[PA_TABLE_PAGE] =
243 (unsigned long)__pa(page_address(image->control_code_page)); 305 (unsigned long)__pa(page_address(image->control_code_page));
244 306
245 /* The segment registers are funny things, they have both a 307 if (image->type == KEXEC_TYPE_DEFAULT)
308 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
309 << PAGE_SHIFT);
310
311 /*
312 * The segment registers are funny things, they have both a
246 * visible and an invisible part. Whenever the visible part is 313 * visible and an invisible part. Whenever the visible part is
247 * set to a specific selector, the invisible part is loaded 314 * set to a specific selector, the invisible part is loaded
248 * with from a table in memory. At no other time is the 315 * with from a table in memory. At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
252 * segments, before I zap the gdt with an invalid value. 319 * segments, before I zap the gdt with an invalid value.
253 */ 320 */
254 load_segments(); 321 load_segments();
255 /* The gdt & idt are now invalid. 322 /*
323 * The gdt & idt are now invalid.
256 * If you want to load them you must set up your own idt & gdt. 324 * If you want to load them you must set up your own idt & gdt.
257 */ 325 */
258 set_gdt(phys_to_virt(0),0); 326 set_gdt(phys_to_virt(0), 0);
259 set_idt(phys_to_virt(0),0); 327 set_idt(phys_to_virt(0), 0);
260 328
261 /* now call it */ 329 /* now call it */
262 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 330 image->start = relocate_kernel((unsigned long)image->head,
263 image->start); 331 (unsigned long)page_list,
332 image->start,
333 image->preserve_context);
334
335#ifdef CONFIG_KEXEC_JUMP
336 if (kexec_image->preserve_context)
337 restore_processor_state();
338#endif
339
340 __ftrace_enabled_restore(save_ftrace_enabled);
264} 341}
265 342
266void arch_crash_save_vmcoreinfo(void) 343void arch_crash_save_vmcoreinfo(void)
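
init_one_level2_page() above walks pgd -> pud -> pmd and installs a single 2 MB executable mapping that covers image->start, so the control code remains reachable even when it lies outside the 0..max_pfn range. The index and mask arithmetic it relies on can be checked in user space; the shifts below are the usual x86-64 4-level paging values and the address is a made-up example:

#include <stdio.h>
#include <stdint.h>

#define PMD_SHIFT   21
#define PUD_SHIFT   30
#define PGDIR_SHIFT 39
#define PTRS_MASK   0x1FFULL                 /* 512 entries per table level */
#define PMD_MASK    (~((1ULL << PMD_SHIFT) - 1))

int main(void)
{
    uint64_t addr = 0x0000000100200000ULL;    /* hypothetical image->start */
    uint64_t base = addr & PMD_MASK;          /* 2 MB-aligned mapping base */

    printf("addr      %#llx\n", (unsigned long long)addr);
    printf("pgd index %llu\n", (unsigned long long)((base >> PGDIR_SHIFT) & PTRS_MASK));
    printf("pud index %llu\n", (unsigned long long)((base >> PUD_SHIFT) & PTRS_MASK));
    printf("pmd index %llu\n", (unsigned long long)((base >> PMD_SHIFT) & PTRS_MASK));
    printf("2MB base  %#llx\n", (unsigned long long)base);
    return 0;
}
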
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f9..712d15fdc416 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
226 return 0; 226 return 0;
227} 227}
228 228
229static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { 229static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
230 { 230 {
231 .callback = set_check_enable_amd_mmconf, 231 .callback = set_check_enable_amd_mmconf,
232 .ident = "Sun Microsystems Machine", 232 .ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1baf..dce99dca6cf8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
109 } else 109 } else
110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
111} 111}
112#endif
113
114#ifdef CONFIG_X86_IO_APIC
115 112
116static int bad_ioapic(unsigned long address) 113static int bad_ioapic(unsigned long address)
117{ 114{
@@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
224 if (++mp_irq_entries == MAX_IRQ_SOURCES) 221 if (++mp_irq_entries == MAX_IRQ_SOURCES)
225 panic("Max # of irq sources exceeded!!\n"); 222 panic("Max # of irq sources exceeded!!\n");
226} 223}
224#else /* CONFIG_X86_IO_APIC */
225static inline void __init MP_bus_info(struct mpc_bus *m) {}
226static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
227static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
228#endif /* CONFIG_X86_IO_APIC */
227 229
228#endif
229 230
230static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 231static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
231{ 232{
@@ -275,6 +276,20 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
275 return 1; 276 return 1;
276} 277}
277 278
279static void skip_entry(unsigned char **ptr, int *count, int size)
280{
281 *ptr += size;
282 *count += size;
283}
284
285static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
286{
287 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
288 "type %x\n", *mpt);
289 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
290 1, mpc, mpc->length, 1);
291}
292
278static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 293static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
279{ 294{
280 char str[16]; 295 char str[16];
@@ -310,61 +325,30 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
310 while (count < mpc->length) { 325 while (count < mpc->length) {
311 switch (*mpt) { 326 switch (*mpt) {
312 case MP_PROCESSOR: 327 case MP_PROCESSOR:
313 { 328 /* ACPI may have already provided this data */
314 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 329 if (!acpi_lapic)
315 /* ACPI may have already provided this data */ 330 MP_processor_info((struct mpc_cpu *)mpt);
316 if (!acpi_lapic) 331 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
317 MP_processor_info(m); 332 break;
318 mpt += sizeof(*m);
319 count += sizeof(*m);
320 break;
321 }
322 case MP_BUS: 333 case MP_BUS:
323 { 334 MP_bus_info((struct mpc_bus *)mpt);
324 struct mpc_bus *m = (struct mpc_bus *)mpt; 335 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
325#ifdef CONFIG_X86_IO_APIC 336 break;
326 MP_bus_info(m);
327#endif
328 mpt += sizeof(*m);
329 count += sizeof(*m);
330 break;
331 }
332 case MP_IOAPIC: 337 case MP_IOAPIC:
333 { 338 MP_ioapic_info((struct mpc_ioapic *)mpt);
334#ifdef CONFIG_X86_IO_APIC 339 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
335 struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; 340 break;
336 MP_ioapic_info(m);
337#endif
338 mpt += sizeof(struct mpc_ioapic);
339 count += sizeof(struct mpc_ioapic);
340 break;
341 }
342 case MP_INTSRC: 341 case MP_INTSRC:
343 { 342 MP_intsrc_info((struct mpc_intsrc *)mpt);
344#ifdef CONFIG_X86_IO_APIC 343 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
345 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 344 break;
346
347 MP_intsrc_info(m);
348#endif
349 mpt += sizeof(struct mpc_intsrc);
350 count += sizeof(struct mpc_intsrc);
351 break;
352 }
353 case MP_LINTSRC: 345 case MP_LINTSRC:
354 { 346 MP_lintsrc_info((struct mpc_lintsrc *)mpt);
355 struct mpc_lintsrc *m = 347 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
356 (struct mpc_lintsrc *)mpt; 348 break;
357 MP_lintsrc_info(m);
358 mpt += sizeof(*m);
359 count += sizeof(*m);
360 break;
361 }
362 default: 349 default:
363 /* wrong mptable */ 350 /* wrong mptable */
364 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 351 smp_dump_mptable(mpc, mpt);
365 printk(KERN_ERR "type %x\n", *mpt);
366 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
367 1, mpc, mpc->length, 1);
368 count = mpc->length; 352 count = mpc->length;
369 break; 353 break;
370 } 354 }
@@ -558,6 +542,68 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
558 542
559static struct mpf_intel *mpf_found; 543static struct mpf_intel *mpf_found;
560 544
545static unsigned long __init get_mpc_size(unsigned long physptr)
546{
547 struct mpc_table *mpc;
548 unsigned long size;
549
550 mpc = early_ioremap(physptr, PAGE_SIZE);
551 size = mpc->length;
552 early_iounmap(mpc, PAGE_SIZE);
553 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
554
555 return size;
556}
557
558static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
559{
560 struct mpc_table *mpc;
561 unsigned long size;
562
563 size = get_mpc_size(mpf->physptr);
564 mpc = early_ioremap(mpf->physptr, size);
565 /*
566 * Read the physical hardware table. Anything here will
567 * override the defaults.
568 */
569 if (!smp_read_mpc(mpc, early)) {
570#ifdef CONFIG_X86_LOCAL_APIC
571 smp_found_config = 0;
572#endif
573 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
574 "... disabling SMP support. (tell your hw vendor)\n");
575 early_iounmap(mpc, size);
576 return -1;
577 }
578 early_iounmap(mpc, size);
579
580 if (early)
581 return -1;
582
583#ifdef CONFIG_X86_IO_APIC
584 /*
585 * If there are no explicit MP IRQ entries, then we are
586 * broken. We set up most of the low 16 IO-APIC pins to
587 * ISA defaults and hope it will work.
588 */
589 if (!mp_irq_entries) {
590 struct mpc_bus bus;
591
592 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
593 "using default mptable. (tell your hw vendor)\n");
594
595 bus.type = MP_BUS;
596 bus.busid = 0;
597 memcpy(bus.bustype, "ISA ", 6);
598 MP_bus_info(&bus);
599
600 construct_default_ioirq_mptable(0);
601 }
602#endif
603
604 return 0;
605}
606
561/* 607/*
562 * Scan the memory blocks for an SMP configuration block. 608 * Scan the memory blocks for an SMP configuration block.
563 */ 609 */
@@ -611,45 +657,8 @@ static void __init __get_smp_config(unsigned int early)
611 construct_default_ISA_mptable(mpf->feature1); 657 construct_default_ISA_mptable(mpf->feature1);
612 658
613 } else if (mpf->physptr) { 659 } else if (mpf->physptr) {
614 660 if (check_physptr(mpf, early))
615 /*
616 * Read the physical hardware table. Anything here will
617 * override the defaults.
618 */
619 if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
620#ifdef CONFIG_X86_LOCAL_APIC
621 smp_found_config = 0;
622#endif
623 printk(KERN_ERR
624 "BIOS bug, MP table errors detected!...\n");
625 printk(KERN_ERR "... disabling SMP support. "
626 "(tell your hw vendor)\n");
627 return;
628 }
629
630 if (early)
631 return; 661 return;
632#ifdef CONFIG_X86_IO_APIC
633 /*
634 * If there are no explicit MP IRQ entries, then we are
635 * broken. We set up most of the low 16 IO-APIC pins to
636 * ISA defaults and hope it will work.
637 */
638 if (!mp_irq_entries) {
639 struct mpc_bus bus;
640
641 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
642 "using default mptable. "
643 "(tell your hw vendor)\n");
644
645 bus.type = MP_BUS;
646 bus.busid = 0;
647 memcpy(bus.bustype, "ISA ", 6);
648 MP_bus_info(&bus);
649
650 construct_default_ioirq_mptable(0);
651 }
652#endif
653 } else 662 } else
654 BUG(); 663 BUG();
655 664
@@ -670,6 +679,31 @@ void __init get_smp_config(void)
670 __get_smp_config(0); 679 __get_smp_config(0);
671} 680}
672 681
682static void smp_reserve_bootmem(struct mpf_intel *mpf)
683{
684 unsigned long size = get_mpc_size(mpf->physptr);
685#ifdef CONFIG_X86_32
686 /*
687 * We cannot access to MPC table to compute table size yet,
688 * as only few megabytes from the bottom is mapped now.
689 * PC-9800's MPC table places on the very last of physical
690 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
691 * yields BUG() in reserve_bootmem.
692 * also need to make sure physptr is below than max_low_pfn
693 * we don't need reserve the area above max_low_pfn
694 */
695 unsigned long end = max_low_pfn * PAGE_SIZE;
696
697 if (mpf->physptr < end) {
698 if (mpf->physptr + size > end)
699 size = end - mpf->physptr;
700 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
701 }
702#else
703 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
704#endif
705}
706
673static int __init smp_scan_config(unsigned long base, unsigned long length, 707static int __init smp_scan_config(unsigned long base, unsigned long length,
674 unsigned reserve) 708 unsigned reserve)
675{ 709{
@@ -697,36 +731,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
697 731
698 if (!reserve) 732 if (!reserve)
699 return 1; 733 return 1;
700 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 734 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
701 BOOTMEM_DEFAULT);
702 if (mpf->physptr) {
703 unsigned long size = PAGE_SIZE;
704#ifdef CONFIG_X86_32
705 /*
706 * We cannot access to MPC table to compute
707 * table size yet, as only few megabytes from
708 * the bottom is mapped now.
709 * PC-9800's MPC table places on the very last
710 * of physical memory; so that simply reserving
711 * PAGE_SIZE from mpf->physptr yields BUG()
712 * in reserve_bootmem.
713 * also need to make sure physptr is below than
714 * max_low_pfn
715 * we don't need reserve the area above max_low_pfn
716 */
717 unsigned long end = max_low_pfn * PAGE_SIZE;
718
719 if (mpf->physptr < end) {
720 if (mpf->physptr + size > end)
721 size = end - mpf->physptr;
722 reserve_bootmem_generic(mpf->physptr, size,
723 BOOTMEM_DEFAULT);
724 }
725#else
726 reserve_bootmem_generic(mpf->physptr, size,
727 BOOTMEM_DEFAULT); 735 BOOTMEM_DEFAULT);
728#endif 736 if (mpf->physptr)
729 } 737 smp_reserve_bootmem(mpf);
730 738
731 return 1; 739 return 1;
732 } 740 }
@@ -829,7 +837,57 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
829#define SPARE_SLOT_NUM 20 837#define SPARE_SLOT_NUM 20
830 838
831static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; 839static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
832#endif 840
841static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
842{
843 int i;
844
845 apic_printk(APIC_VERBOSE, "OLD ");
846 print_MP_intsrc_info(m);
847
848 i = get_MP_intsrc_index(m);
849 if (i > 0) {
850 assign_to_mpc_intsrc(&mp_irqs[i], m);
851 apic_printk(APIC_VERBOSE, "NEW ");
852 print_mp_irq_info(&mp_irqs[i]);
853 return;
854 }
855 if (!i) {
856 /* legacy, do nothing */
857 return;
858 }
859 if (*nr_m_spare < SPARE_SLOT_NUM) {
860 /*
861 * not found (-1), or duplicated (-2) are invalid entries,
862 * we need to use the slot later
863 */
864 m_spare[*nr_m_spare] = m;
865 *nr_m_spare += 1;
866 }
867}
868#else /* CONFIG_X86_IO_APIC */
869static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
870#endif /* CONFIG_X86_IO_APIC */
871
872static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
873 int count)
874{
875 if (!mpc_new_phys) {
876 pr_info("No spare slots, try to append...take your risk, "
877 "new mpc_length %x\n", count);
878 } else {
879 if (count <= mpc_new_length)
880 pr_info("No spare slots, try to append..., "
881 "new mpc_length %x\n", count);
882 else {
883 pr_err("mpc_new_length %lx is too small\n",
884 mpc_new_length);
885 return -1;
886 }
887 }
888
889 return 0;
890}
833 891
834static int __init replace_intsrc_all(struct mpc_table *mpc, 892static int __init replace_intsrc_all(struct mpc_table *mpc,
835 unsigned long mpc_new_phys, 893 unsigned long mpc_new_phys,
@@ -837,77 +895,33 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
837{ 895{
838#ifdef CONFIG_X86_IO_APIC 896#ifdef CONFIG_X86_IO_APIC
839 int i; 897 int i;
840 int nr_m_spare = 0;
841#endif 898#endif
842
843 int count = sizeof(*mpc); 899 int count = sizeof(*mpc);
900 int nr_m_spare = 0;
844 unsigned char *mpt = ((unsigned char *)mpc) + count; 901 unsigned char *mpt = ((unsigned char *)mpc) + count;
845 902
846 printk(KERN_INFO "mpc_length %x\n", mpc->length); 903 printk(KERN_INFO "mpc_length %x\n", mpc->length);
847 while (count < mpc->length) { 904 while (count < mpc->length) {
848 switch (*mpt) { 905 switch (*mpt) {
849 case MP_PROCESSOR: 906 case MP_PROCESSOR:
850 { 907 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
851 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 908 break;
852 mpt += sizeof(*m);
853 count += sizeof(*m);
854 break;
855 }
856 case MP_BUS: 909 case MP_BUS:
857 { 910 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
858 struct mpc_bus *m = (struct mpc_bus *)mpt; 911 break;
859 mpt += sizeof(*m);
860 count += sizeof(*m);
861 break;
862 }
863 case MP_IOAPIC: 912 case MP_IOAPIC:
864 { 913 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
865 mpt += sizeof(struct mpc_ioapic); 914 break;
866 count += sizeof(struct mpc_ioapic);
867 break;
868 }
869 case MP_INTSRC: 915 case MP_INTSRC:
870 { 916 check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
871#ifdef CONFIG_X86_IO_APIC 917 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
872 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 918 break;
873
874 printk(KERN_INFO "OLD ");
875 print_MP_intsrc_info(m);
876 i = get_MP_intsrc_index(m);
877 if (i > 0) {
878 assign_to_mpc_intsrc(&mp_irqs[i], m);
879 printk(KERN_INFO "NEW ");
880 print_mp_irq_info(&mp_irqs[i]);
881 } else if (!i) {
882 /* legacy, do nothing */
883 } else if (nr_m_spare < SPARE_SLOT_NUM) {
884 /*
885 * not found (-1), or duplicated (-2)
886 * are invalid entries,
887 * we need to use the slot later
888 */
889 m_spare[nr_m_spare] = m;
890 nr_m_spare++;
891 }
892#endif
893 mpt += sizeof(struct mpc_intsrc);
894 count += sizeof(struct mpc_intsrc);
895 break;
896 }
897 case MP_LINTSRC: 919 case MP_LINTSRC:
898 { 920 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
899 struct mpc_lintsrc *m = 921 break;
900 (struct mpc_lintsrc *)mpt;
901 mpt += sizeof(*m);
902 count += sizeof(*m);
903 break;
904 }
905 default: 922 default:
906 /* wrong mptable */ 923 /* wrong mptable */
907 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 924 smp_dump_mptable(mpc, mpt);
908 printk(KERN_ERR "type %x\n", *mpt);
909 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
910 1, mpc, mpc->length, 1);
911 goto out; 925 goto out;
912 } 926 }
913 } 927 }
@@ -924,23 +938,15 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
924 continue; 938 continue;
925 939
926 if (nr_m_spare > 0) { 940 if (nr_m_spare > 0) {
927 printk(KERN_INFO "*NEW* found "); 941 apic_printk(APIC_VERBOSE, "*NEW* found\n");
928 nr_m_spare--; 942 nr_m_spare--;
929 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 943 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
930 m_spare[nr_m_spare] = NULL; 944 m_spare[nr_m_spare] = NULL;
931 } else { 945 } else {
932 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 946 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
933 count += sizeof(struct mpc_intsrc); 947 count += sizeof(struct mpc_intsrc);
934 if (!mpc_new_phys) { 948 if (!check_slot(mpc_new_phys, mpc_new_length, count))
935 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); 949 goto out;
936 } else {
937 if (count <= mpc_new_length)
938 printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
939 else {
940 printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
941 goto out;
942 }
943 }
944 assign_to_mpc_intsrc(&mp_irqs[i], m); 950 assign_to_mpc_intsrc(&mp_irqs[i], m);
945 mpc->length = count; 951 mpc->length = count;
946 mpt += sizeof(struct mpc_intsrc); 952 mpt += sizeof(struct mpc_intsrc);
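
One detail worth spelling out from the mpparse.c hunks above: check_physptr() no longer goes through phys_to_virt(), presumably so the MP config table can be read before the direct mapping covers it. Instead it maps the table in two steps with early_ioremap(). The sketch below condenses get_mpc_size() and check_physptr() into one hypothetical helper (the name read_mp_table and the simplified error handling are illustrative):

	static int __init read_mp_table(unsigned long physptr, unsigned int early)
	{
		struct mpc_table *mpc;
		unsigned long size;

		mpc  = early_ioremap(physptr, PAGE_SIZE);	/* header is enough to learn the length */
		size = mpc->length;
		early_iounmap(mpc, PAGE_SIZE);

		mpc = early_ioremap(physptr, size);		/* now map the whole table */
		if (!smp_read_mpc(mpc, early)) {
			early_iounmap(mpc, size);
			return -1;
		}
		early_iounmap(mpc, size);
		return 0;
	}
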
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358d8ee1..8e45f4464880 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = {
470#if PAGETABLE_LEVELS >= 3 470#if PAGETABLE_LEVELS >= 3
471#ifdef CONFIG_X86_PAE 471#ifdef CONFIG_X86_PAE
472 .set_pte_atomic = native_set_pte_atomic, 472 .set_pte_atomic = native_set_pte_atomic,
473 .set_pte_present = native_set_pte_present,
474 .pte_clear = native_pte_clear, 473 .pte_clear = native_pte_clear,
475 .pmd_clear = native_pmd_clear, 474 .pmd_clear = native_pmd_clear,
476#endif 475#endif
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index c70ab5a5d4c8..8b02a3936d42 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -1,14 +1,14 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This 1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */ 2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h>
6#include <linux/init.h>
7#include <linux/pci.h>
8#include <linux/mm.h>
9 9
10#include <asm/iommu.h>
11#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/iommu.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
14static int 14static int
@@ -79,11 +79,11 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
79} 79}
80 80
81struct dma_mapping_ops nommu_dma_ops = { 81struct dma_mapping_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent, 82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent, 83 .free_coherent = nommu_free_coherent,
84 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
85 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
86 .is_phys = 1, 86 .is_phys = 1,
87}; 87};
88 88
89void __init no_iommu_init(void) 89void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6afa5232dbb7..156f87582c6c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -65,11 +65,11 @@ void exit_thread(void)
65{ 65{
66 struct task_struct *me = current; 66 struct task_struct *me = current;
67 struct thread_struct *t = &me->thread; 67 struct thread_struct *t = &me->thread;
68 unsigned long *bp = t->io_bitmap_ptr;
68 69
69 if (me->thread.io_bitmap_ptr) { 70 if (bp) {
70 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 71 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
71 72
72 kfree(t->io_bitmap_ptr);
73 t->io_bitmap_ptr = NULL; 73 t->io_bitmap_ptr = NULL;
74 clear_thread_flag(TIF_IO_BITMAP); 74 clear_thread_flag(TIF_IO_BITMAP);
75 /* 75 /*
@@ -78,6 +78,7 @@ void exit_thread(void)
78 memset(tss->io_bitmap, 0xff, t->io_bitmap_max); 78 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
79 t->io_bitmap_max = 0; 79 t->io_bitmap_max = 0;
80 put_cpu(); 80 put_cpu();
81 kfree(bp);
81 } 82 }
82 83
83 ds_exit_thread(current); 84 ds_exit_thread(current);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3d9672e59c16..19378715f415 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child,
685 if (!cfg.signal) 685 if (!cfg.signal)
686 return -EINVAL; 686 return -EINVAL;
687 687
688 return -EOPNOTSUPP;
689
690 child->thread.bts_ovfl_signal = cfg.signal; 688 child->thread.bts_ovfl_signal = cfg.signal;
689 return -EOPNOTSUPP;
691 } 690 }
692 691
693 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && 692 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 697d1b78cfbf..e95022e4f5d5 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
74 if (!force_hpet_address) 74 if (!force_hpet_address)
75 return; 75 return;
76 76
77 if (rcba_base == NULL) 77 BUG_ON(rcba_base == NULL);
78 BUG();
79 78
80 /* read the Function Disable register, dword mode only */ 79 /* read the Function Disable register, dword mode only */
81 val = readl(rcba_base + 0x3404); 80 val = readl(rcba_base + 0x3404);
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d28..41235531b11c 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
17 17
18#define PTR(x) (x << 2) 18#define PTR(x) (x << 2)
19 19
20/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE 20/*
21 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
21 * ~ control_page + PAGE_SIZE are used as data storage and stack for 22 * ~ control_page + PAGE_SIZE are used as data storage and stack for
22 * jumping back 23 * jumping back
23 */ 24 */
@@ -76,8 +77,10 @@ relocate_kernel:
76 movl %eax, CP_PA_SWAP_PAGE(%edi) 77 movl %eax, CP_PA_SWAP_PAGE(%edi)
77 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) 78 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
78 79
79 /* get physical address of control page now */ 80 /*
80 /* this is impossible after page table switch */ 81 * get physical address of control page now
82 * this is impossible after page table switch
83 */
81 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 84 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
82 85
83 /* switch to new set of page tables */ 86 /* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
97 /* store the start address on the stack */ 100 /* store the start address on the stack */
98 pushl %edx 101 pushl %edx
99 102
100 /* Set cr0 to a known state: 103 /*
104 * Set cr0 to a known state:
101 * - Paging disabled 105 * - Paging disabled
102 * - Alignment check disabled 106 * - Alignment check disabled
103 * - Write protect disabled 107 * - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
113 /* clear cr4 if applicable */ 117 /* clear cr4 if applicable */
114 testl %ecx, %ecx 118 testl %ecx, %ecx
115 jz 1f 119 jz 1f
116 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
117 * Setting everything to zero seems safe. 122 * Setting everything to zero seems safe.
118 */ 123 */
119 xorl %eax, %eax 124 xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
132 call swap_pages 137 call swap_pages
133 addl $8, %esp 138 addl $8, %esp
134 139
135 /* To be certain of avoiding problems with self-modifying code 140 /*
141 * To be certain of avoiding problems with self-modifying code
136 * I need to execute a serializing instruction here. 142 * I need to execute a serializing instruction here.
137 * So I flush the TLB, it's handy, and not processor dependent. 143 * So I flush the TLB, it's handy, and not processor dependent.
138 */ 144 */
139 xorl %eax, %eax 145 xorl %eax, %eax
140 movl %eax, %cr3 146 movl %eax, %cr3
141 147
142 /* set all of the registers to known values */ 148 /*
143 /* leave %esp alone */ 149 * set all of the registers to known values
150 * leave %esp alone
151 */
144 152
145 testl %esi, %esi 153 testl %esi, %esi
146 jnz 1f 154 jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a479..4de8f5b3d476 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,29 +19,77 @@
19#define PTR(x) (x << 3) 19#define PTR(x) (x << 3)
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21 21
22/*
23 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define RSP DATA(0x0)
31#define CR0 DATA(0x8)
32#define CR3 DATA(0x10)
33#define CR4 DATA(0x18)
34
35/* other data */
36#define CP_PA_TABLE_PAGE DATA(0x20)
37#define CP_PA_SWAP_PAGE DATA(0x28)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
39
22 .text 40 .text
23 .align PAGE_SIZE 41 .align PAGE_SIZE
24 .code64 42 .code64
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 /* %rdi indirection_page 45 /*
46 * %rdi indirection_page
28 * %rsi page_list 47 * %rsi page_list
29 * %rdx start address 48 * %rdx start address
49 * %rcx preserve_context
30 */ 50 */
31 51
52 /* Save the CPU context, used for jumping back */
53 pushq %rbx
54 pushq %rbp
55 pushq %r12
56 pushq %r13
57 pushq %r14
58 pushq %r15
59 pushf
60
61 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
62 movq %rsp, RSP(%r11)
63 movq %cr0, %rax
64 movq %rax, CR0(%r11)
65 movq %cr3, %rax
66 movq %rax, CR3(%r11)
67 movq %cr4, %rax
68 movq %rax, CR4(%r11)
69
32 /* zero out flags, and disable interrupts */ 70 /* zero out flags, and disable interrupts */
33 pushq $0 71 pushq $0
34 popfq 72 popfq
35 73
36 /* get physical address of control page now */ 74 /*
37 /* this is impossible after page table switch */ 75 * get physical address of control page now
76 * this is impossible after page table switch
77 */
38 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 78 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
39 79
40 /* get physical address of page table now too */ 80 /* get physical address of page table now too */
41 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx 81 movq PTR(PA_TABLE_PAGE)(%rsi), %r9
82
83 /* get physical address of swap page now */
84 movq PTR(PA_SWAP_PAGE)(%rsi), %r10
85
86 /* save some information for jumping back */
87 movq %r9, CP_PA_TABLE_PAGE(%r11)
88 movq %r10, CP_PA_SWAP_PAGE(%r11)
89 movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
42 90
43 /* Switch to the identity mapped page tables */ 91 /* Switch to the identity mapped page tables */
44 movq %rcx, %cr3 92 movq %r9, %cr3
45 93
46 /* setup a new stack at the end of the physical control page */ 94 /* setup a new stack at the end of the physical control page */
47 lea PAGE_SIZE(%r8), %rsp 95 lea PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
55 /* store the start address on the stack */ 103 /* store the start address on the stack */
56 pushq %rdx 104 pushq %rdx
57 105
58 /* Set cr0 to a known state: 106 /*
107 * Set cr0 to a known state:
59 * - Paging enabled 108 * - Paging enabled
60 * - Alignment check disabled 109 * - Alignment check disabled
61 * - Write protect disabled 110 * - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
68 orl $(X86_CR0_PG | X86_CR0_PE), %eax 117 orl $(X86_CR0_PG | X86_CR0_PE), %eax
69 movq %rax, %cr0 118 movq %rax, %cr0
70 119
71 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
72 * - physical address extension enabled 122 * - physical address extension enabled
73 */ 123 */
74 movq $X86_CR4_PAE, %rax 124 movq $X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
781: 1281:
79 129
80 /* Flush the TLB (needed?) */ 130 /* Flush the TLB (needed?) */
81 movq %rcx, %cr3 131 movq %r9, %cr3
132
133 movq %rcx, %r11
134 call swap_pages
135
136 /*
137 * To be certain of avoiding problems with self-modifying code
138 * I need to execute a serializing instruction here.
139 * So I flush the TLB by reloading %cr3 here, it's handy,
140 * and not processor dependent.
141 */
142 movq %cr3, %rax
143 movq %rax, %cr3
144
145 /*
146 * set all of the registers to known values
147 * leave %rsp alone
148 */
149
150 testq %r11, %r11
151 jnz 1f
152 xorq %rax, %rax
153 xorq %rbx, %rbx
154 xorq %rcx, %rcx
155 xorq %rdx, %rdx
156 xorq %rsi, %rsi
157 xorq %rdi, %rdi
158 xorq %rbp, %rbp
159 xorq %r8, %r8
160 xorq %r9, %r9
161 xorq %r10, %r9
162 xorq %r11, %r11
163 xorq %r12, %r12
164 xorq %r13, %r13
165 xorq %r14, %r14
166 xorq %r15, %r15
167
168 ret
169
1701:
171 popq %rdx
172 leaq PAGE_SIZE(%r10), %rsp
173 call *%rdx
174
175 /* get the re-entry point of the peer system */
176 movq 0(%rsp), %rbp
177 call 1f
1781:
179 popq %r8
180 subq $(1b - relocate_kernel), %r8
181 movq CP_PA_SWAP_PAGE(%r8), %r10
182 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
183 movq CP_PA_TABLE_PAGE(%r8), %rax
184 movq %rax, %cr3
185 lea PAGE_SIZE(%r8), %rsp
186 call swap_pages
187 movq $virtual_mapped, %rax
188 pushq %rax
189 ret
190
191virtual_mapped:
192 movq RSP(%r8), %rsp
193 movq CR4(%r8), %rax
194 movq %rax, %cr4
195 movq CR3(%r8), %rax
196 movq CR0(%r8), %r8
197 movq %rax, %cr3
198 movq %r8, %cr0
199 movq %rbp, %rax
200
201 popf
202 popq %r15
203 popq %r14
204 popq %r13
205 popq %r12
206 popq %rbp
207 popq %rbx
208 ret
82 209
83 /* Do the copies */ 210 /* Do the copies */
211swap_pages:
84 movq %rdi, %rcx /* Put the page_list in %rcx */ 212 movq %rdi, %rcx /* Put the page_list in %rcx */
85 xorq %rdi, %rdi 213 xorq %rdi, %rdi
86 xorq %rsi, %rsi 214 xorq %rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
112 movq %rcx, %rsi /* For ever source page do a copy */ 240 movq %rcx, %rsi /* For ever source page do a copy */
113 andq $0xfffffffffffff000, %rsi 241 andq $0xfffffffffffff000, %rsi
114 242
243 movq %rdi, %rdx
244 movq %rsi, %rax
245
246 movq %r10, %rdi
115 movq $512, %rcx 247 movq $512, %rcx
116 rep ; movsq 248 rep ; movsq
117 jmp 0b
1183:
119
120 /* To be certain of avoiding problems with self-modifying code
121 * I need to execute a serializing instruction here.
122 * So I flush the TLB by reloading %cr3 here, it's handy,
123 * and not processor dependent.
124 */
125 movq %cr3, %rax
126 movq %rax, %cr3
127 249
128 /* set all of the registers to known values */ 250 movq %rax, %rdi
129 /* leave %rsp alone */ 251 movq %rdx, %rsi
252 movq $512, %rcx
253 rep ; movsq
130 254
131 xorq %rax, %rax 255 movq %rdx, %rdi
132 xorq %rbx, %rbx 256 movq %r10, %rsi
133 xorq %rcx, %rcx 257 movq $512, %rcx
134 xorq %rdx, %rdx 258 rep ; movsq
135 xorq %rsi, %rsi
136 xorq %rdi, %rdi
137 xorq %rbp, %rbp
138 xorq %r8, %r8
139 xorq %r9, %r9
140 xorq %r10, %r9
141 xorq %r11, %r11
142 xorq %r12, %r12
143 xorq %r13, %r13
144 xorq %r14, %r14
145 xorq %r15, %r15
146 259
260 lea PAGE_SIZE(%rax), %rsi
261 jmp 0b
2623:
147 ret 263 ret
264
265 .globl kexec_control_code_size
266.set kexec_control_code_size, . - relocate_kernel
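
The reworked swap_pages above does a three-way exchange through the swap page held in %r10 so that, with preserve_context, the overwritten pages of the running kernel can be restored later. In C terms the three "rep ; movsq" blocks (512 quadwords, i.e. one 4 KiB page each) amount to the sketch below; swap, src and dst are illustrative names for the swap, source and destination pages:

	/* one destination page per pass; 512 * 8 bytes == PAGE_SIZE */
	memcpy(swap, src, PAGE_SIZE);	/* 1st copy: stash the incoming page */
	memcpy(src, dst, PAGE_SIZE);	/* 2nd: park the destination's old contents in the source slot */
	memcpy(dst, swap, PAGE_SIZE);	/* 3rd: install the incoming page at its destination */
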
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index dd6f2b71561b..5d465b207e72 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,14 +1,14 @@
1/* 1/*
2 * RTC related functions 2 * RTC related functions
3 */ 3 */
4#include <linux/platform_device.h>
5#include <linux/mc146818rtc.h>
4#include <linux/acpi.h> 6#include <linux/acpi.h>
5#include <linux/bcd.h> 7#include <linux/bcd.h>
6#include <linux/mc146818rtc.h>
7#include <linux/platform_device.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9 9
10#include <asm/time.h>
11#include <asm/vsyscall.h> 10#include <asm/vsyscall.h>
11#include <asm/time.h>
12 12
13#ifdef CONFIG_X86_32 13#ifdef CONFIG_X86_32
14/* 14/*
@@ -16,9 +16,9 @@
16 * register we are working with. It is required for NMI access to the 16 * register we are working with. It is required for NMI access to the
17 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. 17 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
18 */ 18 */
19volatile unsigned long cmos_lock = 0; 19volatile unsigned long cmos_lock;
20EXPORT_SYMBOL(cmos_lock); 20EXPORT_SYMBOL(cmos_lock);
21#endif 21#endif /* CONFIG_X86_32 */
22 22
23/* For two digit years assume time is always after that */ 23/* For two digit years assume time is always after that */
24#define CMOS_YEARS_OFFS 2000 24#define CMOS_YEARS_OFFS 2000
@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
38 */ 38 */
39int mach_set_rtc_mmss(unsigned long nowtime) 39int mach_set_rtc_mmss(unsigned long nowtime)
40{ 40{
41 int retval = 0;
42 int real_seconds, real_minutes, cmos_minutes; 41 int real_seconds, real_minutes, cmos_minutes;
43 unsigned char save_control, save_freq_select; 42 unsigned char save_control, save_freq_select;
43 int retval = 0;
44 44
45 /* tell the clock it's being set */ 45 /* tell the clock it's being set */
46 save_control = CMOS_READ(RTC_CONTROL); 46 save_control = CMOS_READ(RTC_CONTROL);
@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
72 real_seconds = bin2bcd(real_seconds); 72 real_seconds = bin2bcd(real_seconds);
73 real_minutes = bin2bcd(real_minutes); 73 real_minutes = bin2bcd(real_minutes);
74 } 74 }
75 CMOS_WRITE(real_seconds,RTC_SECONDS); 75 CMOS_WRITE(real_seconds, RTC_SECONDS);
76 CMOS_WRITE(real_minutes,RTC_MINUTES); 76 CMOS_WRITE(real_minutes, RTC_MINUTES);
77 } else { 77 } else {
78 printk(KERN_WARNING 78 printk(KERN_WARNING
79 "set_rtc_mmss: can't update from %d to %d\n", 79 "set_rtc_mmss: can't update from %d to %d\n",
@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr)
151 outb(addr, RTC_PORT(0)); 151 outb(addr, RTC_PORT(0));
152 val = inb(RTC_PORT(1)); 152 val = inb(RTC_PORT(1));
153 lock_cmos_suffix(addr); 153 lock_cmos_suffix(addr);
154
154 return val; 155 return val;
155} 156}
156EXPORT_SYMBOL(rtc_cmos_read); 157EXPORT_SYMBOL(rtc_cmos_read);
@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write);
166 167
167static int set_rtc_mmss(unsigned long nowtime) 168static int set_rtc_mmss(unsigned long nowtime)
168{ 169{
169 int retval;
170 unsigned long flags; 170 unsigned long flags;
171 int retval;
171 172
172 spin_lock_irqsave(&rtc_lock, flags); 173 spin_lock_irqsave(&rtc_lock, flags);
173 retval = set_wallclock(nowtime); 174 retval = set_wallclock(nowtime);
@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void)
242 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
243 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n"); 245 "registered platform RTC device (no PNP device found)\n");
246
245 return 0; 247 return 0;
246} 248}
247device_initcall(add_rtc_cmos); 249device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b746deb9ebc6..a0d26237d7cf 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,8 +112,13 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115RESERVE_BRK(dmi_alloc, 65536);
116
115unsigned int boot_cpu_id __read_mostly; 117unsigned int boot_cpu_id __read_mostly;
116 118
119static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
120unsigned long _brk_end = (unsigned long)__brk_base;
121
117#ifdef CONFIG_X86_64 122#ifdef CONFIG_X86_64
118int default_cpu_present_to_apicid(int mps_cpu) 123int default_cpu_present_to_apicid(int mps_cpu)
119{ 124{
@@ -158,12 +163,6 @@ static struct resource bss_resource = {
158 163
159 164
160#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
161/* This value is set up by the early boot code to point to the value
162 immediately after the boot time page tables. It contains a *physical*
163 address, and must not be in the .bss segment! */
164unsigned long init_pg_tables_start __initdata = ~0UL;
165unsigned long init_pg_tables_end __initdata = ~0UL;
166
167static struct resource video_ram_resource = { 166static struct resource video_ram_resource = {
168 .name = "Video RAM area", 167 .name = "Video RAM area",
169 .start = 0xa0000, 168 .start = 0xa0000,
@@ -202,7 +201,9 @@ struct ist_info ist_info;
202#endif 201#endif
203 202
204#else 203#else
205struct cpuinfo_x86 boot_cpu_data __read_mostly; 204struct cpuinfo_x86 boot_cpu_data __read_mostly = {
205 .x86_phys_bits = MAX_PHYSMEM_BITS,
206};
206EXPORT_SYMBOL(boot_cpu_data); 207EXPORT_SYMBOL(boot_cpu_data);
207#endif 208#endif
208 209
@@ -217,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
217int bootloader_type; 218int bootloader_type;
218 219
219/* 220/*
220 * Early DMI memory
221 */
222int dmi_alloc_index;
223char dmi_alloc_data[DMI_MAX_DATA];
224
225/*
226 * Setup options 221 * Setup options
227 */ 222 */
228struct screen_info screen_info; 223struct screen_info screen_info;
@@ -267,6 +262,35 @@ static inline void copy_edd(void)
267} 262}
268#endif 263#endif
269 264
265void * __init extend_brk(size_t size, size_t align)
266{
267 size_t mask = align - 1;
268 void *ret;
269
270 BUG_ON(_brk_start == 0);
271 BUG_ON(align & mask);
272
273 _brk_end = (_brk_end + mask) & ~mask;
274 BUG_ON((char *)(_brk_end + size) > __brk_limit);
275
276 ret = (void *)_brk_end;
277 _brk_end += size;
278
279 memset(ret, 0, size);
280
281 return ret;
282}
283
284static void __init reserve_brk(void)
285{
286 if (_brk_end > _brk_start)
287 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
288
289 /* Mark brk area as locked down and no longer taking any
290 new allocations */
291 _brk_start = 0;
292}
293
270#ifdef CONFIG_BLK_DEV_INITRD 294#ifdef CONFIG_BLK_DEV_INITRD
271 295
272#ifdef CONFIG_X86_32 296#ifdef CONFIG_X86_32
@@ -715,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
715 init_mm.start_code = (unsigned long) _text; 739 init_mm.start_code = (unsigned long) _text;
716 init_mm.end_code = (unsigned long) _etext; 740 init_mm.end_code = (unsigned long) _etext;
717 init_mm.end_data = (unsigned long) _edata; 741 init_mm.end_data = (unsigned long) _edata;
718#ifdef CONFIG_X86_32 742 init_mm.brk = _brk_end;
719 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
720#else
721 init_mm.brk = (unsigned long) &_end;
722#endif
723 743
724 code_resource.start = virt_to_phys(_text); 744 code_resource.start = virt_to_phys(_text);
725 code_resource.end = virt_to_phys(_etext)-1; 745 code_resource.end = virt_to_phys(_etext)-1;
@@ -840,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
840 setup_bios_corruption_check(); 860 setup_bios_corruption_check();
841#endif 861#endif
842 862
863 reserve_brk();
864
843 /* max_pfn_mapped is updated here */ 865 /* max_pfn_mapped is updated here */
844 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 866 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
845 max_pfn_mapped = max_low_pfn_mapped; 867 max_pfn_mapped = max_low_pfn_mapped;
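
The setup.c hunks above replace the old init_pg_tables_* and dmi_alloc_* early allocations with a small brk allocator: RESERVE_BRK() grows the .brk section at build time, extend_brk() hands out pieces of it during early boot, and reserve_brk() closes the area (after which extend_brk() trips the BUG_ON(_brk_start == 0)). A hypothetical user might look like the sketch below; the names my_early_area and early_tbl_alloc are made up for illustration:

	/* set aside worst-case space at link time, like RESERVE_BRK(dmi_alloc, ...) above */
	RESERVE_BRK(my_early_area, 16 * PAGE_SIZE);

	static void * __init early_tbl_alloc(size_t bytes)
	{
		/* must run before reserve_brk(); alignment must be a power of two */
		return extend_brk(bytes, PAGE_SIZE);
	}
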
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d2cc6428c587..dfcc74ab0ab6 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -211,31 +211,27 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
211{ 211{
212 /* Default to using normal stack */ 212 /* Default to using normal stack */
213 unsigned long sp = regs->sp; 213 unsigned long sp = regs->sp;
214 int onsigstack = on_sig_stack(sp);
214 215
215#ifdef CONFIG_X86_64 216#ifdef CONFIG_X86_64
216 /* redzone */ 217 /* redzone */
217 sp -= 128; 218 sp -= 128;
218#endif /* CONFIG_X86_64 */ 219#endif /* CONFIG_X86_64 */
219 220
220 /* 221 if (!onsigstack) {
221 * If we are on the alternate signal stack and would overflow it, don't. 222 /* This is the X/Open sanctioned signal stack switching. */
222 * Return an always-bogus address instead so we will die with SIGSEGV. 223 if (ka->sa.sa_flags & SA_ONSTACK) {
223 */ 224 if (sas_ss_flags(sp) == 0)
224 if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) 225 sp = current->sas_ss_sp + current->sas_ss_size;
225 return (void __user *) -1L; 226 } else {
226
227 /* This is the X/Open sanctioned signal stack switching. */
228 if (ka->sa.sa_flags & SA_ONSTACK) {
229 if (sas_ss_flags(sp) == 0)
230 sp = current->sas_ss_sp + current->sas_ss_size;
231 } else {
232#ifdef CONFIG_X86_32 227#ifdef CONFIG_X86_32
233 /* This is the legacy signal stack switching. */ 228 /* This is the legacy signal stack switching. */
234 if ((regs->ss & 0xffff) != __USER_DS && 229 if ((regs->ss & 0xffff) != __USER_DS &&
235 !(ka->sa.sa_flags & SA_RESTORER) && 230 !(ka->sa.sa_flags & SA_RESTORER) &&
236 ka->sa.sa_restorer) 231 ka->sa.sa_restorer)
237 sp = (unsigned long) ka->sa.sa_restorer; 232 sp = (unsigned long) ka->sa.sa_restorer;
238#endif /* CONFIG_X86_32 */ 233#endif /* CONFIG_X86_32 */
234 }
239 } 235 }
240 236
241 if (used_math()) { 237 if (used_math()) {
@@ -244,12 +240,22 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
244 sp = round_down(sp, 64); 240 sp = round_down(sp, 64);
245#endif /* CONFIG_X86_64 */ 241#endif /* CONFIG_X86_64 */
246 *fpstate = (void __user *)sp; 242 *fpstate = (void __user *)sp;
247
248 if (save_i387_xstate(*fpstate) < 0)
249 return (void __user *)-1L;
250 } 243 }
251 244
252 return (void __user *)align_sigframe(sp - frame_size); 245 sp = align_sigframe(sp - frame_size);
246
247 /*
248 * If we are on the alternate signal stack and would overflow it, don't.
249 * Return an always-bogus address instead so we will die with SIGSEGV.
250 */
251 if (onsigstack && !likely(on_sig_stack(sp)))
252 return (void __user *)-1L;
253
254 /* save i387 state */
255 if (used_math() && save_i387_xstate(*fpstate) < 0)
256 return (void __user *)-1L;
257
258 return (void __user *)sp;
253} 259}
254 260
255#ifdef CONFIG_X86_32 261#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f5080a..ef7d10170c30 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
114 114
115atomic_t init_deasserted; 115atomic_t init_deasserted;
116 116
117
118/* Set if we find a B stepping CPU */
119static int __cpuinitdata smp_b_stepping;
120
121#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 117#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
122 118
123/* which logical CPUs are on which nodes */ 119/* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
271 cpumask_set_cpu(cpuid, cpu_callin_mask); 267 cpumask_set_cpu(cpuid, cpu_callin_mask);
272} 268}
273 269
274static int __cpuinitdata unsafe_smp;
275
276/* 270/*
277 * Activate a secondary processor. 271 * Activate a secondary processor.
278 */ 272 */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
340 cpu_idle(); 334 cpu_idle();
341} 335}
342 336
343static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
344{
345 /*
346 * Mask B, Pentium, but not Pentium MMX
347 */
348 if (c->x86_vendor == X86_VENDOR_INTEL &&
349 c->x86 == 5 &&
350 c->x86_mask >= 1 && c->x86_mask <= 4 &&
351 c->x86_model <= 3)
352 /*
353 * Remember we have B step Pentia with bugs
354 */
355 smp_b_stepping = 1;
356
357 /*
358 * Certain Athlons might work (for various values of 'work') in SMP
359 * but they are not certified as MP capable.
360 */
361 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
362
363 if (num_possible_cpus() == 1)
364 goto valid_k7;
365
366 /* Athlon 660/661 is valid. */
367 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
368 (c->x86_mask == 1)))
369 goto valid_k7;
370
371 /* Duron 670 is valid */
372 if ((c->x86_model == 7) && (c->x86_mask == 0))
373 goto valid_k7;
374
375 /*
376 * Athlon 662, Duron 671, and Athlon >model 7 have capability
377 * bit. It's worth noting that the A5 stepping (662) of some
378 * Athlon XP's have the MP bit set.
379 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
380 * more.
381 */
382 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
383 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
384 (c->x86_model > 7))
385 if (cpu_has_mp)
386 goto valid_k7;
387
388 /* If we get here, not a certified SMP capable AMD system. */
389 unsafe_smp = 1;
390 }
391
392valid_k7:
393 ;
394}
395
396static void __cpuinit smp_checks(void)
397{
398 if (smp_b_stepping)
399 printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
400 "with B stepping processors.\n");
401
402 /*
403 * Don't taint if we are running SMP kernel on a single non-MP
404 * approved Athlon
405 */
406 if (unsafe_smp && num_online_cpus() > 1) {
407 printk(KERN_INFO "WARNING: This combination of AMD"
408 "processors is not suitable for SMP.\n");
409 add_taint(TAINT_UNSAFE_SMP);
410 }
411}
412
413/* 337/*
414 * The bootstrap kernel entry code has set these up. Save them for 338 * The bootstrap kernel entry code has set these up. Save them for
415 * a given CPU 339 * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
423 c->cpu_index = id; 347 c->cpu_index = id;
424 if (id != 0) 348 if (id != 0)
425 identify_secondary_cpu(c); 349 identify_secondary_cpu(c);
426 smp_apply_quirks(c);
427} 350}
428 351
429 352
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1193 pr_debug("Boot done.\n"); 1116 pr_debug("Boot done.\n");
1194 1117
1195 impress_friends(); 1118 impress_friends();
1196 smp_checks();
1197#ifdef CONFIG_X86_IO_APIC 1119#ifdef CONFIG_X86_IO_APIC
1198 setup_ioapic_dest(); 1120 setup_ioapic_dest();
1199#endif 1121#endif
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d038b9c45cf8..79c073247284 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -750,7 +750,7 @@ static int __init uv_bau_init(void)
750 int node; 750 int node;
751 int nblades; 751 int nblades;
752 int last_blade; 752 int last_blade;
753 int cur_cpu = 0; 753 int cur_cpu;
754 754
755 if (!is_uv_system()) 755 if (!is_uv_system())
756 return 0; 756 return 0;
@@ -760,6 +760,7 @@ static int __init uv_bau_init(void)
760 uv_mmask = (1UL << uv_hub_info->n_val) - 1; 760 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
761 nblades = 0; 761 nblades = 0;
762 last_blade = -1; 762 last_blade = -1;
763 cur_cpu = 0;
763 for_each_online_node(node) { 764 for_each_online_node(node) {
764 blade = uv_node_to_blade_id(node); 765 blade = uv_node_to_blade_id(node);
765 if (blade == last_blade) 766 if (blade == last_blade)
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0fcc95a354f7..7e4515957a1c 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,10 +25,10 @@
25 * 25 *
26 * Send feedback to <colpatch@us.ibm.com> 26 * Send feedback to <colpatch@us.ibm.com>
27 */ 27 */
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/nodemask.h> 28#include <linux/nodemask.h>
31#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <asm/cpu.h> 32#include <asm/cpu.h>
33 33
34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); 34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num)
47 */ 47 */
48 if (num) 48 if (num)
49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1; 49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
50
50 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 51 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
51} 52}
52EXPORT_SYMBOL(arch_register_cpu); 53EXPORT_SYMBOL(arch_register_cpu);
@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num)
56 unregister_cpu(&per_cpu(cpu_devices, num).cpu); 57 unregister_cpu(&per_cpu(cpu_devices, num).cpu);
57} 58}
58EXPORT_SYMBOL(arch_unregister_cpu); 59EXPORT_SYMBOL(arch_unregister_cpu);
59#else 60#else /* CONFIG_HOTPLUG_CPU */
61
60static int __init arch_register_cpu(int num) 62static int __init arch_register_cpu(int num)
61{ 63{
62 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 64 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
63} 65}
64#endif /*CONFIG_HOTPLUG_CPU*/ 66#endif /* CONFIG_HOTPLUG_CPU */
65 67
66static int __init topology_init(void) 68static int __init topology_init(void)
67{ 69{
@@ -70,11 +72,11 @@ static int __init topology_init(void)
70#ifdef CONFIG_NUMA 72#ifdef CONFIG_NUMA
71 for_each_online_node(i) 73 for_each_online_node(i)
72 register_one_node(i); 74 register_one_node(i);
73#endif /* CONFIG_NUMA */ 75#endif
74 76
75 for_each_present_cpu(i) 77 for_each_present_cpu(i)
76 arch_register_cpu(i); 78 arch_register_cpu(i);
79
77 return 0; 80 return 0;
78} 81}
79
80subsys_initcall(topology_init); 82subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 000000000000..2ffb6c53326e
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
1/*
2 * SGI RTC clock/timer routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Dimitri Sivanich
20 */
21#include <linux/clockchips.h>
22
23#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv.h>
27#include <asm/apic.h>
28#include <asm/cpu.h>
29
30#define RTC_NAME "sgi_rtc"
31
32static cycle_t uv_read_rtc(void);
33static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
34static void uv_rtc_timer_setup(enum clock_event_mode,
35 struct clock_event_device *);
36
37static struct clocksource clocksource_uv = {
38 .name = RTC_NAME,
39 .rating = 400,
40 .read = uv_read_rtc,
41 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
42 .shift = 10,
43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
44};
45
46static struct clock_event_device clock_event_device_uv = {
47 .name = RTC_NAME,
48 .features = CLOCK_EVT_FEAT_ONESHOT,
49 .shift = 20,
50 .rating = 400,
51 .irq = -1,
52 .set_next_event = uv_rtc_next_event,
53 .set_mode = uv_rtc_timer_setup,
54 .event_handler = NULL,
55};
56
57static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
58
59/* There is one of these allocated per node */
60struct uv_rtc_timer_head {
61 spinlock_t lock;
62 /* next cpu waiting for timer, local node relative: */
63 int next_cpu;
64 /* number of cpus on this node: */
65 int ncpus;
66 struct {
67 int lcpu; /* systemwide logical cpu number */
68 u64 expires; /* next timer expiration for this cpu */
69 } cpu[1];
70};
71
72/*
73 * Access to uv_rtc_timer_head via blade id.
74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly;
76
77static int uv_rtc_enable;
78
79/*
80 * Hardware interface routines
81 */
82
83/* Send IPIs to another node */
84static void uv_rtc_send_IPI(int cpu)
85{
86 unsigned long apicid, val;
87 int pnode;
88
89 apicid = cpu_physical_id(cpu);
90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96}
97
98/* Check for an RTC interrupt pending */
99static int uv_intr_pending(int pnode)
100{
101 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
102 UVH_EVENT_OCCURRED0_RTC1_MASK;
103}
104
105/* Setup interrupt and return non-zero if early expiration occurred. */
106static int uv_setup_intr(int cpu, u64 expires)
107{
108 u64 val;
109 int pnode = uv_cpu_to_pnode(cpu);
110
111 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
112 UVH_RTC1_INT_CONFIG_M_MASK);
113 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
114
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120
121 /* Set configuration */
122 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125
126 return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
127}
128
129/*
130 * Per-cpu timer tracking routines
131 */
132
133static __init void uv_rtc_deallocate_timers(void)
134{
135 int bid;
136
137 for_each_possible_blade(bid) {
138 kfree(blade_info[bid]);
139 }
140 kfree(blade_info);
141}
142
143/* Allocate per-node list of cpu timer expiration times. */
144static __init int uv_rtc_allocate_timers(void)
145{
146 int cpu;
147
148 blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
149 if (!blade_info)
150 return -ENOMEM;
151 memset(blade_info, 0, uv_possible_blades * sizeof(void *));
152
153 for_each_present_cpu(cpu) {
154 int nid = cpu_to_node(cpu);
155 int bid = uv_cpu_to_blade_id(cpu);
156 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
157 struct uv_rtc_timer_head *head = blade_info[bid];
158
159 if (!head) {
160 head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
161 (uv_blade_nr_possible_cpus(bid) *
162 2 * sizeof(u64)),
163 GFP_KERNEL, nid);
164 if (!head) {
165 uv_rtc_deallocate_timers();
166 return -ENOMEM;
167 }
168 spin_lock_init(&head->lock);
169 head->ncpus = uv_blade_nr_possible_cpus(bid);
170 head->next_cpu = -1;
171 blade_info[bid] = head;
172 }
173
174 head->cpu[bcpu].lcpu = cpu;
175 head->cpu[bcpu].expires = ULLONG_MAX;
176 }
177
178 return 0;
179}
180
181/* Find and set the next expiring timer. */
182static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
183{
184 u64 lowest = ULLONG_MAX;
185 int c, bcpu = -1;
186
187 head->next_cpu = -1;
188 for (c = 0; c < head->ncpus; c++) {
189 u64 exp = head->cpu[c].expires;
190 if (exp < lowest) {
191 bcpu = c;
192 lowest = exp;
193 }
194 }
195 if (bcpu >= 0) {
196 head->next_cpu = bcpu;
197 c = head->cpu[bcpu].lcpu;
198 if (uv_setup_intr(c, lowest))
199 /* If we didn't set it up in time, trigger */
200 uv_rtc_send_IPI(c);
201 } else {
202 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
203 UVH_RTC1_INT_CONFIG_M_MASK);
204 }
205}
206
207/*
208 * Set expiration time for current cpu.
209 *
210 * Returns 1 if we missed the expiration time.
211 */
212static int uv_rtc_set_timer(int cpu, u64 expires)
213{
214 int pnode = uv_cpu_to_pnode(cpu);
215 int bid = uv_cpu_to_blade_id(cpu);
216 struct uv_rtc_timer_head *head = blade_info[bid];
217 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
218 u64 *t = &head->cpu[bcpu].expires;
219 unsigned long flags;
220 int next_cpu;
221
222 spin_lock_irqsave(&head->lock, flags);
223
224 next_cpu = head->next_cpu;
225 *t = expires;
226 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) {
229 head->next_cpu = bcpu;
230 if (uv_setup_intr(cpu, expires)) {
231 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags);
234 return 1;
235 }
236 }
237
238 spin_unlock_irqrestore(&head->lock, flags);
239 return 0;
240}
241
242/*
243 * Unset expiration time for current cpu.
244 *
245 * Returns 1 if this timer was pending.
246 */
247static int uv_rtc_unset_timer(int cpu)
248{
249 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu);
251 struct uv_rtc_timer_head *head = blade_info[bid];
252 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
253 u64 *t = &head->cpu[bcpu].expires;
254 unsigned long flags;
255 int rc = 0;
256
257 spin_lock_irqsave(&head->lock, flags);
258
259 if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
260 rc = 1;
261
262 *t = ULLONG_MAX;
263
264 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode);
267
268 spin_unlock_irqrestore(&head->lock, flags);
269
270 return rc;
271}
272
273
274/*
275 * Kernel interface routines.
276 */
277
278/*
279 * Read the RTC.
280 */
281static cycle_t uv_read_rtc(void)
282{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC);
284}
285
286/*
287 * Program the next event, relative to now
288 */
289static int uv_rtc_next_event(unsigned long delta,
290 struct clock_event_device *ced)
291{
292 int ced_cpu = cpumask_first(ced->cpumask);
293
294 return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
295}
296
297/*
298 * Setup the RTC timer in oneshot mode
299 */
300static void uv_rtc_timer_setup(enum clock_event_mode mode,
301 struct clock_event_device *evt)
302{
303 int ced_cpu = cpumask_first(evt->cpumask);
304
305 switch (mode) {
306 case CLOCK_EVT_MODE_PERIODIC:
307 case CLOCK_EVT_MODE_ONESHOT:
308 case CLOCK_EVT_MODE_RESUME:
309 /* Nothing to do here yet */
310 break;
311 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu);
314 break;
315 }
316}
317
318static void uv_rtc_interrupt(void)
319{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id();
322
323 if (!ced || !ced->event_handler)
324 return;
325
326 if (uv_rtc_unset_timer(cpu) != 1)
327 return;
328
329 ced->event_handler(ced);
330}
331
332static int __init uv_enable_rtc(char *str)
333{
334 uv_rtc_enable = 1;
335
336 return 1;
337}
338__setup("uvrtc", uv_enable_rtc);
339
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{
342 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
343
344 *ced = clock_event_device_uv;
345 ced->cpumask = cpumask_of(smp_processor_id());
346 clockevents_register_device(ced);
347}
348
349static __init int uv_rtc_setup_clock(void)
350{
351 int rc;
352
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
354 return -ENODEV;
355
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift);
360
361 rc = clocksource_register(&clocksource_uv);
362 if (rc) {
363 generic_interrupt_extension = NULL;
364 return rc;
365 }
366
367 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers();
369 if (rc) {
370 clocksource_unregister(&clocksource_uv);
371 generic_interrupt_extension = NULL;
372 return rc;
373 }
374
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift);
377
378 clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
379 sn_rtc_cycles_per_second;
380
381 clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
382 (NSEC_PER_SEC / sn_rtc_cycles_per_second);
383
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) {
386 clocksource_unregister(&clocksource_uv);
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers();
389 }
390
391 return rc;
392}
393arch_initcall(uv_rtc_setup_clock);
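
The mult/shift values programmed in uv_rtc_setup_clock() are easier to see with numbers. Below is a minimal user-space sketch of the nanosecond-to-cycle conversion the clockevents core performs before calling uv_rtc_next_event(); the 25 MHz RTC rate is an assumption chosen only for illustration.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t freq  = 25000000;           /* assumed RTC rate in Hz        */
	uint32_t shift = 20;                 /* plays the role of ced->shift  */
	uint64_t mult  = (freq << shift) / NSEC_PER_SEC;   /* as div_sc()     */

	uint64_t delta_ns = 1000000;         /* ask for an event 1 ms away    */
	uint64_t cycles   = (delta_ns * mult) >> shift;    /* core conversion */

	printf("mult=%llu  1 ms -> %llu RTC cycles\n",
	       (unsigned long long)mult, (unsigned long long)cycles);
	return 0;
}

For a 25 MHz clock this prints roughly 25000 cycles per millisecond, which is the delta that ends up being added to uv_read_rtc() above.
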
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 191a876e9e87..31ffc24eec4d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
578static irqreturn_t piix4_master_intr(int irq, void *dev_id) 578static irqreturn_t piix4_master_intr(int irq, void *dev_id)
579{ 579{
580 int realirq; 580 int realirq;
581 irq_desc_t *desc; 581 struct irq_desc *desc;
582 unsigned long flags; 582 unsigned long flags;
583 583
584 spin_lock_irqsave(&i8259A_lock, flags); 584 spin_lock_irqsave(&i8259A_lock, flags);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90e2cb3..95deb9f2211e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
395 vmi_ops.update_pte(ptep, VMI_PAGE_PT); 395 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
396} 396}
397 397
398static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
399{
400 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
401}
402
403static void vmi_set_pud(pud_t *pudp, pud_t pudval) 398static void vmi_set_pud(pud_t *pudp, pud_t pudval)
404{ 399{
405 /* Um, eww */ 400 /* Um, eww */
@@ -750,7 +745,6 @@ static inline int __init activate_vmi(void)
750 pv_mmu_ops.set_pmd = vmi_set_pmd; 745 pv_mmu_ops.set_pmd = vmi_set_pmd;
751#ifdef CONFIG_X86_PAE 746#ifdef CONFIG_X86_PAE
752 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; 747 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
753 pv_mmu_ops.set_pte_present = vmi_set_pte_present;
754 pv_mmu_ops.set_pud = vmi_set_pud; 748 pv_mmu_ops.set_pud = vmi_set_pud;
755 pv_mmu_ops.pte_clear = vmi_pte_clear; 749 pv_mmu_ops.pte_clear = vmi_pte_clear;
756 pv_mmu_ops.pmd_clear = vmi_pmd_clear; 750 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d860963f268..62ad500d55f3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,15 +189,24 @@ SECTIONS
189 *(.bss) 189 *(.bss)
190 . = ALIGN(4); 190 . = ALIGN(4);
191 __bss_stop = .; 191 __bss_stop = .;
192 _end = . ; 192 }
193 /* This is where the kernel creates the early boot page tables */ 193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
194 . = ALIGN(PAGE_SIZE); 195 . = ALIGN(PAGE_SIZE);
195 pg0 = . ; 196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
196 } 204 }
197 205
198 /* Sections to be discarded */ 206 /* Sections to be discarded */
199 /DISCARD/ : { 207 /DISCARD/ : {
200 *(.exitcall.exit) 208 *(.exitcall.exit)
209 *(.discard)
201 } 210 }
202 211
203 STABS_DEBUG 212 STABS_DEBUG
@@ -205,6 +214,12 @@ SECTIONS
205 DWARF_DEBUG 214 DWARF_DEBUG
206} 215}
207 216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
208#ifdef CONFIG_KEXEC 223#ifdef CONFIG_KEXEC
209/* Link time checks */ 224/* Link time checks */
210#include <asm/kexec.h> 225#include <asm/kexec.h>
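
The .brk region added to the 32-bit linker script above is consumed through the early brk allocator. Below is a hedged sketch of how a boot-time caller might reserve and use part of it; RESERVE_BRK() and extend_brk() are taken to come from the accompanying <asm/setup.h> changes and should be read as assumptions, not as the definitive interface.

#include <linux/init.h>
#include <linux/string.h>
#include <asm/page.h>
#include <asm/setup.h>

/* Grow __brk_limit by one extra page at link time (the name is made up). */
RESERVE_BRK(example_area, PAGE_SIZE);

static void __init brk_example(void)
{
	/* Carve a page-aligned chunk out of [__brk_base, __brk_limit). */
	void *p = extend_brk(PAGE_SIZE, PAGE_SIZE);

	memset(p, 0, PAGE_SIZE);
}
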
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f6800..c8742507b030 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -29,8 +29,8 @@ SECTIONS
29{ 29{
30 . = __START_KERNEL; 30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET; 31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 _text = .; /* Text and read-only data */
33 .text : AT(ADDR(.text) - LOAD_OFFSET) { 32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */ 34 /* First the code that has to be first for bootstrapping */
35 *(.text.head) 35 *(.text.head)
36 _stext = .; 36 _stext = .;
@@ -61,13 +61,13 @@ SECTIONS
61 .data : AT(ADDR(.data) - LOAD_OFFSET) { 61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA 62 DATA_DATA
63 CONSTRUCTORS 63 CONSTRUCTORS
64 _edata = .; /* End of data section */
64 } :data 65 } :data
65 66
66 _edata = .; /* End of data section */
67 67
68 . = ALIGN(PAGE_SIZE);
69 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
70 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { 68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned) 71 *(.data.cacheline_aligned)
72 } 72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); 73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
125#undef VVIRT_OFFSET 125#undef VVIRT_OFFSET
126#undef VVIRT 126#undef VVIRT
127 127
128 . = ALIGN(THREAD_SIZE); /* init_task */
129 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task) 130 *(.data.init_task)
131 }:data.init 131 }:data.init
132 132
133 . = ALIGN(PAGE_SIZE);
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned) 135 *(.data.page_aligned)
136 } 136 }
137 137
138 /* might get freed after init */
139 . = ALIGN(PAGE_SIZE);
140 __smp_alt_begin = .;
141 __smp_locks = .;
142 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks) 143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
144 } 147 }
145 __smp_locks_end = .;
146 . = ALIGN(PAGE_SIZE);
147 __smp_alt_end = .;
148 148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */ 149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; 150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .; 152 _sinittext = .;
153 INIT_TEXT 153 INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
159 __initdata_end = .; 159 __initdata_end = .;
160 } 160 }
161 161
162 . = ALIGN(16); 162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 __setup_start = .; 163 . = ALIGN(16);
164 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 164 __setup_start = .;
165 __setup_end = .; 165 *(.init.setup)
166 __initcall_start = .; 166 __setup_end = .;
167 }
167 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
168 INITCALLS 170 INITCALLS
171 __initcall_end = .;
169 } 172 }
170 __initcall_end = .;
171 __con_initcall_start = .;
172 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
173 *(.con_initcall.init) 175 *(.con_initcall.init)
176 __con_initcall_end = .;
174 } 177 }
175 __con_initcall_end = .;
176 __x86_cpu_dev_start = .;
177 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
178 *(.x86_cpu_dev.init) 180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
179 } 182 }
180 __x86_cpu_dev_end = .;
181 SECURITY_INIT 183 SECURITY_INIT
182 184
183 . = ALIGN(8); 185 . = ALIGN(8);
184 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
185 __parainstructions = .; 187 __parainstructions = .;
186 *(.parainstructions) 188 *(.parainstructions)
187 __parainstructions_end = .; 189 __parainstructions_end = .;
188 } 190 }
189 191
190 . = ALIGN(8);
191 __alt_instructions = .;
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
193 *(.altinstructions) 195 *(.altinstructions)
196 __alt_instructions_end = .;
194 } 197 }
195 __alt_instructions_end = .;
196 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
197 *(.altinstr_replacement) 199 *(.altinstr_replacement)
198 } 200 }
@@ -207,9 +209,11 @@ SECTIONS
207 209
208#ifdef CONFIG_BLK_DEV_INITRD 210#ifdef CONFIG_BLK_DEV_INITRD
209 . = ALIGN(PAGE_SIZE); 211 . = ALIGN(PAGE_SIZE);
210 __initramfs_start = .; 212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
211 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 213 __initramfs_start = .;
212 __initramfs_end = .; 214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
213#endif 217#endif
214 218
215#ifdef CONFIG_SMP 219#ifdef CONFIG_SMP
@@ -229,20 +233,29 @@ SECTIONS
229 . = ALIGN(PAGE_SIZE); 233 . = ALIGN(PAGE_SIZE);
230 __init_end = .; 234 __init_end = .;
231 235
232 . = ALIGN(PAGE_SIZE);
233 __nosave_begin = .;
234 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { 236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
235 *(.data.nosave) 237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
236 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ 242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
237 . = ALIGN(PAGE_SIZE);
238 __nosave_end = .;
239 243
240 __bss_start = .; /* BSS */
241 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
242 *(.bss.page_aligned) 247 *(.bss.page_aligned)
243 *(.bss) 248 *(.bss)
244 } 249 __bss_stop = .;
245 __bss_stop = .; 250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
246 259
247 _end = . ; 260 _end = . ;
248 261
@@ -250,6 +263,7 @@ SECTIONS
250 /DISCARD/ : { 263 /DISCARD/ : {
251 *(.exitcall.exit) 264 *(.exitcall.exit)
252 *(.eh_frame) 265 *(.eh_frame)
266 *(.discard)
253 } 267 }
254 268
255 STABS_DEBUG 269 STABS_DEBUG
@@ -275,3 +289,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
275ASSERT((per_cpu__irq_stack_union == 0), 289ASSERT((per_cpu__irq_stack_union == 0),
276 "irq_stack_union is not at start of per-cpu area"); 290 "irq_stack_union is not at start of per-cpu area");
277#endif 291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 74de562812cc..a1d804bcd483 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -22,7 +22,7 @@
22#include <asm/paravirt.h> 22#include <asm/paravirt.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24 24
25#ifdef CONFIG_PARAVIRT 25#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
26/* 26/*
27 * Interrupt control on vSMPowered systems: 27 * Interrupt control on vSMPowered systems:
28 * ~AC is a shadow of IF. If IF is 'on' AC should be 'off' 28 * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
@@ -114,6 +114,7 @@ static void __init set_vsmp_pv_ops(void)
114} 114}
115#endif 115#endif
116 116
117#ifdef CONFIG_PCI
117static int is_vsmp = -1; 118static int is_vsmp = -1;
118 119
119static void __init detect_vsmp_box(void) 120static void __init detect_vsmp_box(void)
@@ -139,6 +140,15 @@ int is_vsmp_box(void)
139 } 140 }
140} 141}
141 142
143#else
144static void __init detect_vsmp_box(void)
145{
146}
147int is_vsmp_box(void)
148{
149 return 0;
150}
151#endif
142void __init vsmp_init(void) 152void __init vsmp_init(void)
143{ 153{
144 detect_vsmp_box(); 154 detect_vsmp_box();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9fe4ddaa8f6f..90e44a10e68a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1058,14 +1058,6 @@ __init void lguest_init(void)
1058 * lguest_init() where the rest of the fairly chaotic boot setup 1058 * lguest_init() where the rest of the fairly chaotic boot setup
1059 * occurs. */ 1059 * occurs. */
1060 1060
1061 /* The native boot code sets up initial page tables immediately after
1062 * the kernel itself, and sets init_pg_tables_end so they're not
1063 * clobbered. The Launcher places our initial pagetables somewhere at
1064 * the top of our physical memory, so we don't need extra space: set
1065 * init_pg_tables_end to the end of the kernel. */
1066 init_pg_tables_start = __pa(pg0);
1067 init_pg_tables_end = __pa(pg0);
1068
1069 /* As described in head_32.S, we map the first 128M of memory. */ 1061 /* As described in head_32.S, we map the first 128M of memory. */
1070 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1062 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1071 1063
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3a..ad5441ed1b57 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/dwarf2.h>
6 7
7/* 8/*
8 * memcpy - Copy a memory block. 9 * memcpy - Copy a memory block.
9 * 10 *
10 * Input: 11 * Input:
11 * rdi destination 12 * rdi destination
12 * rsi source 13 * rsi source
13 * rdx count 14 * rdx count
14 * 15 *
15 * Output: 16 * Output:
16 * rax original destination 17 * rax original destination
17 */ 18 */
18 19
20/*
21 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
22 *
23 * Calls to this get patched into the kernel image via the
24 * alternative instructions framework:
25 */
19 ALIGN 26 ALIGN
20memcpy_c: 27memcpy_c:
21 CFI_STARTPROC 28 CFI_STARTPROC
22 movq %rdi,%rax 29 movq %rdi, %rax
23 movl %edx,%ecx 30
24 shrl $3,%ecx 31 movl %edx, %ecx
25 andl $7,%edx 32 shrl $3, %ecx
33 andl $7, %edx
26 rep movsq 34 rep movsq
27 movl %edx,%ecx 35 movl %edx, %ecx
28 rep movsb 36 rep movsb
29 ret 37 ret
30 CFI_ENDPROC 38 CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
33ENTRY(__memcpy) 41ENTRY(__memcpy)
34ENTRY(memcpy) 42ENTRY(memcpy)
35 CFI_STARTPROC 43 CFI_STARTPROC
36 pushq %rbx
37 CFI_ADJUST_CFA_OFFSET 8
38 CFI_REL_OFFSET rbx, 0
39 movq %rdi,%rax
40 44
41 movl %edx,%ecx 45 /*
42 shrl $6,%ecx 46 * Put the number of full 64-byte blocks into %ecx.
47 * Tail portion is handled at the end:
48 */
49 movq %rdi, %rax
50 movl %edx, %ecx
51 shrl $6, %ecx
43 jz .Lhandle_tail 52 jz .Lhandle_tail
44 53
45 .p2align 4 54 .p2align 4
46.Lloop_64: 55.Lloop_64:
56 /*
57 * We decrement the loop index here - and the zero-flag is
58 * checked at the end of the loop (instructions inbetween do
59 * not change the zero flag):
60 */
47 decl %ecx 61 decl %ecx
48 62
49 movq (%rsi),%r11 63 /*
50 movq 8(%rsi),%r8 64 * Move in blocks of 4x16 bytes:
65 */
66 movq 0*8(%rsi), %r11
67 movq 1*8(%rsi), %r8
68 movq %r11, 0*8(%rdi)
69 movq %r8, 1*8(%rdi)
51 70
52 movq %r11,(%rdi) 71 movq 2*8(%rsi), %r9
53 movq %r8,1*8(%rdi) 72 movq 3*8(%rsi), %r10
73 movq %r9, 2*8(%rdi)
74 movq %r10, 3*8(%rdi)
54 75
55 movq 2*8(%rsi),%r9 76 movq 4*8(%rsi), %r11
56 movq 3*8(%rsi),%r10 77 movq 5*8(%rsi), %r8
78 movq %r11, 4*8(%rdi)
79 movq %r8, 5*8(%rdi)
57 80
58 movq %r9,2*8(%rdi) 81 movq 6*8(%rsi), %r9
59 movq %r10,3*8(%rdi) 82 movq 7*8(%rsi), %r10
83 movq %r9, 6*8(%rdi)
84 movq %r10, 7*8(%rdi)
60 85
61 movq 4*8(%rsi),%r11 86 leaq 64(%rsi), %rsi
62 movq 5*8(%rsi),%r8 87 leaq 64(%rdi), %rdi
63 88
64 movq %r11,4*8(%rdi)
65 movq %r8,5*8(%rdi)
66
67 movq 6*8(%rsi),%r9
68 movq 7*8(%rsi),%r10
69
70 movq %r9,6*8(%rdi)
71 movq %r10,7*8(%rdi)
72
73 leaq 64(%rsi),%rsi
74 leaq 64(%rdi),%rdi
75 jnz .Lloop_64 89 jnz .Lloop_64
76 90
77.Lhandle_tail: 91.Lhandle_tail:
78 movl %edx,%ecx 92 movl %edx, %ecx
79 andl $63,%ecx 93 andl $63, %ecx
80 shrl $3,%ecx 94 shrl $3, %ecx
81 jz .Lhandle_7 95 jz .Lhandle_7
96
82 .p2align 4 97 .p2align 4
83.Lloop_8: 98.Lloop_8:
84 decl %ecx 99 decl %ecx
85 movq (%rsi),%r8 100 movq (%rsi), %r8
86 movq %r8,(%rdi) 101 movq %r8, (%rdi)
87 leaq 8(%rdi),%rdi 102 leaq 8(%rdi), %rdi
88 leaq 8(%rsi),%rsi 103 leaq 8(%rsi), %rsi
89 jnz .Lloop_8 104 jnz .Lloop_8
90 105
91.Lhandle_7: 106.Lhandle_7:
92 movl %edx,%ecx 107 movl %edx, %ecx
93 andl $7,%ecx 108 andl $7, %ecx
94 jz .Lende 109 jz .Lend
110
95 .p2align 4 111 .p2align 4
96.Lloop_1: 112.Lloop_1:
97 movb (%rsi),%r8b 113 movb (%rsi), %r8b
98 movb %r8b,(%rdi) 114 movb %r8b, (%rdi)
99 incq %rdi 115 incq %rdi
100 incq %rsi 116 incq %rsi
101 decl %ecx 117 decl %ecx
102 jnz .Lloop_1 118 jnz .Lloop_1
103 119
104.Lende: 120.Lend:
105 popq %rbx
106 CFI_ADJUST_CFA_OFFSET -8
107 CFI_RESTORE rbx
108 ret 121 ret
109.Lfinal:
110 CFI_ENDPROC 122 CFI_ENDPROC
111ENDPROC(memcpy) 123ENDPROC(memcpy)
112ENDPROC(__memcpy) 124ENDPROC(__memcpy)
113 125
114 /* Some CPUs run faster using the string copy instructions. 126 /*
115 It is also a lot simpler. Use this when possible */ 127 * Some CPUs run faster using the string copy instructions.
128 * It is also a lot simpler. Use this when possible:
129 */
116 130
117 .section .altinstr_replacement,"ax" 131 .section .altinstr_replacement, "ax"
1181: .byte 0xeb /* jmp <disp8> */ 1321: .byte 0xeb /* jmp <disp8> */
119 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ 133 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1202: 1342:
121 .previous 135 .previous
122 .section .altinstructions,"a" 136
137 .section .altinstructions, "a"
123 .align 8 138 .align 8
124 .quad memcpy 139 .quad memcpy
125 .quad 1b 140 .quad 1b
126 .byte X86_FEATURE_REP_GOOD 141 .byte X86_FEATURE_REP_GOOD
127 /* Replace only beginning, memcpy is used to apply alternatives, so it 142
128 * is silly to overwrite itself with nops - reboot is only outcome... */ 143 /*
144 * Replace only beginning, memcpy is used to apply alternatives,
145 * so it is silly to overwrite itself with nops - reboot is the
146 * only outcome...
147 */
129 .byte 2b - 1b 148 .byte 2b - 1b
130 .byte 2b - 1b 149 .byte 2b - 1b
131 .previous 150 .previous
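
For readers who find the assembly dense, here is a rough C model of the same blocking strategy: 64-byte blocks, then 8-byte words, then single bytes. It is illustrative only; it ignores alignment and the REP_GOOD alternative patching shown above.

#include <stddef.h>
#include <stdint.h>

/* Illustrative C model of the blocked copy in memcpy_64.S (not the
 * kernel implementation). */
static void *memcpy_model(void *dst, const void *src, size_t len)
{
	uint64_t *d = dst;
	const uint64_t *s = src;
	size_t blocks = len >> 6;          /* shrl $6, %ecx               */
	size_t words, bytes;

	while (blocks--) {                 /* .Lloop_64: 4x16-byte moves  */
		d[0] = s[0]; d[1] = s[1];
		d[2] = s[2]; d[3] = s[3];
		d[4] = s[4]; d[5] = s[5];
		d[6] = s[6]; d[7] = s[7];
		d += 8; s += 8;            /* leaq 64(%rsi), 64(%rdi)     */
	}

	words = (len & 63) >> 3;           /* .Lloop_8: 8-byte words      */
	while (words--)
		*d++ = *s++;

	bytes = len & 7;                   /* .Lloop_1: trailing bytes    */
	{
		uint8_t *db = (uint8_t *)d;
		const uint8_t *sb = (const uint8_t *)s;

		while (bytes--)
			*db++ = *sb++;
	}

	return dst;                        /* rax = original destination  */
}
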
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0e..522db5e3d0bf 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -121,22 +121,13 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
121 pagefault_enable(); 121 pagefault_enable();
122} 122}
123 123
124/* This is the same as kmap_atomic() but can map memory that doesn't 124/*
125 * This is the same as kmap_atomic() but can map memory that doesn't
125 * have a struct page associated with it. 126 * have a struct page associated with it.
126 */ 127 */
127void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) 128void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
128{ 129{
129 enum fixed_addresses idx; 130 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
130 unsigned long vaddr;
131
132 pagefault_disable();
133
134 idx = type + KM_TYPE_NR*smp_processor_id();
135 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
136 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
137 arch_flush_lazy_mmu_mode();
138
139 return (void*) vaddr;
140} 131}
141EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ 132EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
142 133
@@ -158,7 +149,6 @@ EXPORT_SYMBOL(kunmap);
158EXPORT_SYMBOL(kmap_atomic); 149EXPORT_SYMBOL(kmap_atomic);
159EXPORT_SYMBOL(kunmap_atomic); 150EXPORT_SYMBOL(kunmap_atomic);
160 151
161#ifdef CONFIG_NUMA
162void __init set_highmem_pages_init(void) 152void __init set_highmem_pages_init(void)
163{ 153{
164 struct zone *zone; 154 struct zone *zone;
@@ -182,11 +172,3 @@ void __init set_highmem_pages_init(void)
182 } 172 }
183 totalram_pages += totalhigh_pages; 173 totalram_pages += totalhigh_pages;
184} 174}
185#else
186void __init set_highmem_pages_init(void)
187{
188 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
189
190 totalram_pages += totalhigh_pages;
191}
192#endif /* CONFIG_NUMA */
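
Since kmap_atomic_pfn() is now a thin wrapper around kmap_atomic_prot_pfn(), callers keep using it unchanged. A hedged usage sketch follows; the pfn and the KM_USER0 slot are illustrative choices, not taken from the patch.

#include <linux/types.h>
#include <linux/highmem.h>

/* Illustrative only: read the first word of the page at 'pfn' through
 * the fixmap-based atomic mapping shown above. */
static u32 peek_pfn_word(unsigned long pfn)
{
	void *vaddr = kmap_atomic_pfn(pfn, KM_USER0);
	u32 val = *(u32 *)vaddr;

	kunmap_atomic(vaddr, KM_USER0);
	return val;
}
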
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722587d8..fd3da1dda1c9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,8 +1,345 @@
1#include <linux/ioport.h>
1#include <linux/swap.h> 2#include <linux/swap.h>
3
2#include <asm/cacheflush.h> 4#include <asm/cacheflush.h>
5#include <asm/e820.h>
6#include <asm/init.h>
3#include <asm/page.h> 7#include <asm/page.h>
8#include <asm/page_types.h>
4#include <asm/sections.h> 9#include <asm/sections.h>
5#include <asm/system.h> 10#include <asm/system.h>
11#include <asm/tlbflush.h>
12
13unsigned long __initdata e820_table_start;
14unsigned long __meminitdata e820_table_end;
15unsigned long __meminitdata e820_table_top;
16
17int after_bootmem;
18
19int direct_gbpages
20#ifdef CONFIG_DIRECT_GBPAGES
21 = 1
22#endif
23;
24
25static void __init find_early_table_space(unsigned long end, int use_pse,
26 int use_gbpages)
27{
28 unsigned long puds, pmds, ptes, tables, start;
29
30 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
31 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
32
33 if (use_gbpages) {
34 unsigned long extra;
35
36 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
37 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
38 } else
39 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
40
41 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
42
43 if (use_pse) {
44 unsigned long extra;
45
46 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
47#ifdef CONFIG_X86_32
48 extra += PMD_SIZE;
49#endif
50 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
51 } else
52 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
53
54 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
55
56#ifdef CONFIG_X86_32
57 /* for fixmap */
58 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
59#endif
60
61 /*
62 * RED-PEN putting page tables only on node 0 could
63 * cause a hotspot and fill up ZONE_DMA. The page tables
64 * need roughly 0.5KB per GB.
65 */
66#ifdef CONFIG_X86_32
67 start = 0x7000;
68 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
69 tables, PAGE_SIZE);
70#else /* CONFIG_X86_64 */
71 start = 0x8000;
72 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
73#endif
74 if (e820_table_start == -1UL)
75 panic("Cannot find space for the kernel page tables");
76
77 e820_table_start >>= PAGE_SHIFT;
78 e820_table_end = e820_table_start;
79 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
80
81 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
82 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
83}
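
The worst-case sizing above is easier to check with a concrete figure. Below is a small user-space mirror of the rounding logic for a 4 GB direct map with 2 MB pages; the 32-bit fixmap and first-2M extras are deliberately left out, and the numbers are for illustration only.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PUD_SHIFT	30
#define PUD_SIZE	(1UL << PUD_SHIFT)

static unsigned long roundup_page(unsigned long x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long end  = 4UL << 30;		/* map the first 4 GB        */
	unsigned long puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	/* end is 2 MB aligned and PSE is used, so no extra PTE pages here */
	unsigned long tables = roundup_page(puds * 8) + roundup_page(pmds * 8);

	printf("worst-case page-table space for 4 GB: %lu KB\n", tables >> 10);
	return 0;
}

With these inputs the estimate comes to 20 KB: one page of PUDs plus four pages of PMDs, which is the space find_early_table_space() would reserve before mapping begins.
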
84
85struct map_range {
86 unsigned long start;
87 unsigned long end;
88 unsigned page_size_mask;
89};
90
91#ifdef CONFIG_X86_32
92#define NR_RANGE_MR 3
93#else /* CONFIG_X86_64 */
94#define NR_RANGE_MR 5
95#endif
96
97static int __meminit save_mr(struct map_range *mr, int nr_range,
98 unsigned long start_pfn, unsigned long end_pfn,
99 unsigned long page_size_mask)
100{
101 if (start_pfn < end_pfn) {
102 if (nr_range >= NR_RANGE_MR)
103 panic("run out of range for init_memory_mapping\n");
104 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
105 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
106 mr[nr_range].page_size_mask = page_size_mask;
107 nr_range++;
108 }
109
110 return nr_range;
111}
112
113#ifdef CONFIG_X86_64
114static void __init init_gbpages(void)
115{
116 if (direct_gbpages && cpu_has_gbpages)
117 printk(KERN_INFO "Using GB pages for direct mapping\n");
118 else
119 direct_gbpages = 0;
120}
121#else
122static inline void init_gbpages(void)
123{
124}
125#endif
126
127/*
128 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
129 * This runs before bootmem is initialized and gets pages directly from
130 * the physical memory. To access them they are temporarily mapped.
131 */
132unsigned long __init_refok init_memory_mapping(unsigned long start,
133 unsigned long end)
134{
135 unsigned long page_size_mask = 0;
136 unsigned long start_pfn, end_pfn;
137 unsigned long ret = 0;
138 unsigned long pos;
139
140 struct map_range mr[NR_RANGE_MR];
141 int nr_range, i;
142 int use_pse, use_gbpages;
143
144 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
145
146 if (!after_bootmem)
147 init_gbpages();
148
149#ifdef CONFIG_DEBUG_PAGEALLOC
150 /*
151 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
152 * This will simplify cpa(), which otherwise needs to support splitting
153 * large pages into small in interrupt context, etc.
154 */
155 use_pse = use_gbpages = 0;
156#else
157 use_pse = cpu_has_pse;
158 use_gbpages = direct_gbpages;
159#endif
160
161#ifdef CONFIG_X86_32
162#ifdef CONFIG_X86_PAE
163 set_nx();
164 if (nx_enabled)
165 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
166#endif
167
168 /* Enable PSE if available */
169 if (cpu_has_pse)
170 set_in_cr4(X86_CR4_PSE);
171
172 /* Enable PGE if available */
173 if (cpu_has_pge) {
174 set_in_cr4(X86_CR4_PGE);
175 __supported_pte_mask |= _PAGE_GLOBAL;
176 }
177#endif
178
179 if (use_gbpages)
180 page_size_mask |= 1 << PG_LEVEL_1G;
181 if (use_pse)
182 page_size_mask |= 1 << PG_LEVEL_2M;
183
184 memset(mr, 0, sizeof(mr));
185 nr_range = 0;
186
187 /* head if not big page alignment ? */
188 start_pfn = start >> PAGE_SHIFT;
189 pos = start_pfn << PAGE_SHIFT;
190#ifdef CONFIG_X86_32
191 /*
192 * Don't use a large page for the first 2/4MB of memory
193 * because there are often fixed size MTRRs in there
194 * and overlapping MTRRs into large pages can cause
195 * slowdowns.
196 */
197 if (pos == 0)
198 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
199 else
200 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
201 << (PMD_SHIFT - PAGE_SHIFT);
202#else /* CONFIG_X86_64 */
203 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
204 << (PMD_SHIFT - PAGE_SHIFT);
205#endif
206 if (end_pfn > (end >> PAGE_SHIFT))
207 end_pfn = end >> PAGE_SHIFT;
208 if (start_pfn < end_pfn) {
209 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
210 pos = end_pfn << PAGE_SHIFT;
211 }
212
213 /* big page (2M) range */
214 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
215 << (PMD_SHIFT - PAGE_SHIFT);
216#ifdef CONFIG_X86_32
217 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
218#else /* CONFIG_X86_64 */
219 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
220 << (PUD_SHIFT - PAGE_SHIFT);
221 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
222 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
223#endif
224
225 if (start_pfn < end_pfn) {
226 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
227 page_size_mask & (1<<PG_LEVEL_2M));
228 pos = end_pfn << PAGE_SHIFT;
229 }
230
231#ifdef CONFIG_X86_64
232 /* big page (1G) range */
233 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
234 << (PUD_SHIFT - PAGE_SHIFT);
235 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
236 if (start_pfn < end_pfn) {
237 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
238 page_size_mask &
239 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
240 pos = end_pfn << PAGE_SHIFT;
241 }
242
243 /* tail is not big page (1G) alignment */
244 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
245 << (PMD_SHIFT - PAGE_SHIFT);
246 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
247 if (start_pfn < end_pfn) {
248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
249 page_size_mask & (1<<PG_LEVEL_2M));
250 pos = end_pfn << PAGE_SHIFT;
251 }
252#endif
253
254 /* tail is not big page (2M) alignment */
255 start_pfn = pos>>PAGE_SHIFT;
256 end_pfn = end>>PAGE_SHIFT;
257 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
258
259 /* try to merge same page size and continuous */
260 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
261 unsigned long old_start;
262 if (mr[i].end != mr[i+1].start ||
263 mr[i].page_size_mask != mr[i+1].page_size_mask)
264 continue;
265 /* move it */
266 old_start = mr[i].start;
267 memmove(&mr[i], &mr[i+1],
268 (nr_range - 1 - i) * sizeof(struct map_range));
269 mr[i--].start = old_start;
270 nr_range--;
271 }
272
273 for (i = 0; i < nr_range; i++)
274 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
275 mr[i].start, mr[i].end,
276 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
277 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
278
279 /*
280 * Find space for the kernel direct mapping tables.
281 *
282 * Later we should allocate these tables in the local node of the
283 * memory mapped. Unfortunately this is done currently before the
284 * nodes are discovered.
285 */
286 if (!after_bootmem)
287 find_early_table_space(end, use_pse, use_gbpages);
288
289#ifdef CONFIG_X86_32
290 for (i = 0; i < nr_range; i++)
291 kernel_physical_mapping_init(mr[i].start, mr[i].end,
292 mr[i].page_size_mask);
293 ret = end;
294#else /* CONFIG_X86_64 */
295 for (i = 0; i < nr_range; i++)
296 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
297 mr[i].page_size_mask);
298#endif
299
300#ifdef CONFIG_X86_32
301 early_ioremap_page_table_range_init();
302
303 load_cr3(swapper_pg_dir);
304#endif
305
306#ifdef CONFIG_X86_64
307 if (!after_bootmem)
308 mmu_cr4_features = read_cr4();
309#endif
310 __flush_tlb_all();
311
312 if (!after_bootmem && e820_table_end > e820_table_start)
313 reserve_early(e820_table_start << PAGE_SHIFT,
314 e820_table_end << PAGE_SHIFT, "PGTABLE");
315
316 if (!after_bootmem)
317 early_memtest(start, end);
318
319 return ret >> PAGE_SHIFT;
320}
321
322
323/*
324 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
325 * is valid. The argument is a physical page number.
326 *
327 *
328 * On x86, access has to be given to the first megabyte of ram because that area
329 * contains bios code and data regions used by X and dosemu and similar apps.
330 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
331 * mmio resources as well as potential bios/acpi data regions.
332 */
333int devmem_is_allowed(unsigned long pagenr)
334{
335 if (pagenr <= 256)
336 return 1;
337 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
338 return 0;
339 if (!page_is_ram(pagenr))
340 return 1;
341 return 0;
342}
6 343
7void free_init_pages(char *what, unsigned long begin, unsigned long end) 344void free_init_pages(char *what, unsigned long begin, unsigned long end)
8{ 345{
@@ -47,3 +384,10 @@ void free_initmem(void)
47 (unsigned long)(&__init_begin), 384 (unsigned long)(&__init_begin),
48 (unsigned long)(&__init_end)); 385 (unsigned long)(&__init_end));
49} 386}
387
388#ifdef CONFIG_BLK_DEV_INITRD
389void free_initrd_mem(unsigned long start, unsigned long end)
390{
391 free_init_pages("initrd memory", start, end);
392}
393#endif
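
The head/2M-body/tail split performed by init_memory_mapping() is easiest to see on a deliberately misaligned range. Here is a standalone sketch that reproduces only the splitting arithmetic; the 1 GB tier and the x86_32 special cases are omitted, and the addresses are made up.

#include <stdio.h>

#define PMD_SIZE (2UL << 20)	/* 2 MB */

int main(void)
{
	unsigned long start = 0x00301000;	/* deliberately unaligned */
	unsigned long end   = 0x1f500000;

	unsigned long head_end   = (start + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
	unsigned long tail_start = end & ~(PMD_SIZE - 1);

	if (head_end > end)
		head_end = end;
	if (tail_start < head_end)
		tail_start = head_end;

	printf("4k head : %08lx - %08lx\n", start, head_end);
	printf("2M body : %08lx - %08lx\n", head_end, tail_start);
	printf("4k tail : %08lx - %08lx\n", tail_start, end);
	return 0;
}

The three printed ranges correspond to the map_range entries save_mr() would record, with the middle one carrying the PG_LEVEL_2M bit in its page_size_mask.
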
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 47df0e1bbeb9..db81e9a8556b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/init.h>
52 53
53unsigned long max_low_pfn_mapped; 54unsigned long max_low_pfn_mapped;
54unsigned long max_pfn_mapped; 55unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
58 59
59static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
60 61
61 62bool __read_mostly __vmalloc_start_set = false;
62static unsigned long __initdata table_start;
63static unsigned long __meminitdata table_end;
64static unsigned long __meminitdata table_top;
65
66static int __initdata after_init_bootmem;
67 63
68static __init void *alloc_low_page(void) 64static __init void *alloc_low_page(void)
69{ 65{
70 unsigned long pfn = table_end++; 66 unsigned long pfn = e820_table_end++;
71 void *adr; 67 void *adr;
72 68
73 if (pfn >= table_top) 69 if (pfn >= e820_table_top)
74 panic("alloc_low_page: ran out of memory"); 70 panic("alloc_low_page: ran out of memory");
75 71
76 adr = __va(pfn * PAGE_SIZE); 72 adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
90 86
91#ifdef CONFIG_X86_PAE 87#ifdef CONFIG_X86_PAE
92 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 88 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
93 if (after_init_bootmem) 89 if (after_bootmem)
94 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 90 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
95 else 91 else
96 pmd_table = (pmd_t *)alloc_low_page(); 92 pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
117 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 113 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
118 pte_t *page_table = NULL; 114 pte_t *page_table = NULL;
119 115
120 if (after_init_bootmem) { 116 if (after_bootmem) {
121#ifdef CONFIG_DEBUG_PAGEALLOC 117#ifdef CONFIG_DEBUG_PAGEALLOC
122 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
123#endif 119#endif
@@ -168,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
168 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 164 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
169 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 165 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
170 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 166 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
171 && ((__pa(pte) >> PAGE_SHIFT) < table_start 167 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
172 || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { 168 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
173 pte_t *newpte; 169 pte_t *newpte;
174 int i; 170 int i;
175 171
176 BUG_ON(after_init_bootmem); 172 BUG_ON(after_bootmem);
177 newpte = alloc_low_page(); 173 newpte = alloc_low_page();
178 for (i = 0; i < PTRS_PER_PTE; i++) 174 for (i = 0; i < PTRS_PER_PTE; i++)
179 set_pte(newpte + i, pte[i]); 175 set_pte(newpte + i, pte[i]);
@@ -242,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
242 * of max_low_pfn pages, by creating page tables starting from address 238 * of max_low_pfn pages, by creating page tables starting from address
243 * PAGE_OFFSET: 239 * PAGE_OFFSET:
244 */ 240 */
245static void __init kernel_physical_mapping_init(pgd_t *pgd_base, 241unsigned long __init
246 unsigned long start_pfn, 242kernel_physical_mapping_init(unsigned long start,
247 unsigned long end_pfn, 243 unsigned long end,
248 int use_pse) 244 unsigned long page_size_mask)
249{ 245{
246 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
247 unsigned long start_pfn, end_pfn;
248 pgd_t *pgd_base = swapper_pg_dir;
250 int pgd_idx, pmd_idx, pte_ofs; 249 int pgd_idx, pmd_idx, pte_ofs;
251 unsigned long pfn; 250 unsigned long pfn;
252 pgd_t *pgd; 251 pgd_t *pgd;
@@ -255,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
255 unsigned pages_2m, pages_4k; 254 unsigned pages_2m, pages_4k;
256 int mapping_iter; 255 int mapping_iter;
257 256
257 start_pfn = start >> PAGE_SHIFT;
258 end_pfn = end >> PAGE_SHIFT;
259
258 /* 260 /*
259 * First iteration will setup identity mapping using large/small pages 261 * First iteration will setup identity mapping using large/small pages
260 * based on use_pse, with other attributes same as set by 262 * based on use_pse, with other attributes same as set by
@@ -369,26 +371,6 @@ repeat:
369 mapping_iter = 2; 371 mapping_iter = 2;
370 goto repeat; 372 goto repeat;
371 } 373 }
372}
373
374/*
375 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
376 * is valid. The argument is a physical page number.
377 *
378 *
379 * On x86, access has to be given to the first megabyte of ram because that area
380 * contains bios code and data regions used by X and dosemu and similar apps.
381 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
382 * mmio resources as well as potential bios/acpi data regions.
383 */
384int devmem_is_allowed(unsigned long pagenr)
385{
386 if (pagenr <= 256)
387 return 1;
388 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
389 return 0;
390 if (!page_is_ram(pagenr))
391 return 1;
392 return 0; 374 return 0;
393} 375}
394 376
@@ -545,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
545 * be partially populated, and so it avoids stomping on any existing 527 * be partially populated, and so it avoids stomping on any existing
546 * mappings. 528 * mappings.
547 */ 529 */
548static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) 530void __init early_ioremap_page_table_range_init(void)
549{ 531{
532 pgd_t *pgd_base = swapper_pg_dir;
550 unsigned long vaddr, end; 533 unsigned long vaddr, end;
551 534
552 /* 535 /*
@@ -641,7 +624,7 @@ static int __init noexec_setup(char *str)
641} 624}
642early_param("noexec", noexec_setup); 625early_param("noexec", noexec_setup);
643 626
644static void __init set_nx(void) 627void __init set_nx(void)
645{ 628{
646 unsigned int v[4], l, h; 629 unsigned int v[4], l, h;
647 630
@@ -793,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
793#ifdef CONFIG_FLATMEM 776#ifdef CONFIG_FLATMEM
794 max_mapnr = num_physpages; 777 max_mapnr = num_physpages;
795#endif 778#endif
779 __vmalloc_start_set = true;
780
796 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 781 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
797 pages_to_mb(max_low_pfn)); 782 pages_to_mb(max_low_pfn));
798 783
@@ -814,176 +799,66 @@ static void __init zone_sizes_init(void)
814 free_area_init_nodes(max_zone_pfns); 799 free_area_init_nodes(max_zone_pfns);
815} 800}
816 801
802static unsigned long __init setup_node_bootmem(int nodeid,
803 unsigned long start_pfn,
804 unsigned long end_pfn,
805 unsigned long bootmap)
806{
807 unsigned long bootmap_size;
808
809 /* don't touch min_low_pfn */
810 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
811 bootmap >> PAGE_SHIFT,
812 start_pfn, end_pfn);
813 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
814 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
815 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
816 nodeid, bootmap, bootmap + bootmap_size);
817 free_bootmem_with_active_regions(nodeid, end_pfn);
818 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
819
820 return bootmap + bootmap_size;
821}
822
817void __init setup_bootmem_allocator(void) 823void __init setup_bootmem_allocator(void)
818{ 824{
819 int i; 825 int nodeid;
820 unsigned long bootmap_size, bootmap; 826 unsigned long bootmap_size, bootmap;
821 /* 827 /*
822 * Initialize the boot-time allocator (with low memory only): 828 * Initialize the boot-time allocator (with low memory only):
823 */ 829 */
824 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; 830 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
825 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, 831 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
826 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
827 PAGE_SIZE); 832 PAGE_SIZE);
828 if (bootmap == -1L) 833 if (bootmap == -1L)
829 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 834 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
830 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 835 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
831 836
832 /* don't touch min_low_pfn */
833 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
834 min_low_pfn, max_low_pfn);
835 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 837 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
836 max_pfn_mapped<<PAGE_SHIFT); 838 max_pfn_mapped<<PAGE_SHIFT);
837 printk(KERN_INFO " low ram: %08lx - %08lx\n", 839 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
838 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
839 printk(KERN_INFO " bootmap %08lx - %08lx\n",
840 bootmap, bootmap + bootmap_size);
841 for_each_online_node(i)
842 free_bootmem_with_active_regions(i, max_low_pfn);
843 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
844
845 after_init_bootmem = 1;
846}
847
848static void __init find_early_table_space(unsigned long end, int use_pse)
849{
850 unsigned long puds, pmds, ptes, tables, start;
851 840
852 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 841 for_each_online_node(nodeid) {
853 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 842 unsigned long start_pfn, end_pfn;
854 843
855 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 844#ifdef CONFIG_NEED_MULTIPLE_NODES
856 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); 845 start_pfn = node_start_pfn[nodeid];
857 846 end_pfn = node_end_pfn[nodeid];
858 if (use_pse) { 847 if (start_pfn > max_low_pfn)
859 unsigned long extra; 848 continue;
860 849 if (end_pfn > max_low_pfn)
861 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 850 end_pfn = max_low_pfn;
862 extra += PMD_SIZE;
863 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
864 } else
865 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
866
867 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
868
869 /* for fixmap */
870 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
871
872 /*
873 * RED-PEN putting page tables only on node 0 could
874 * cause a hotspot and fill up ZONE_DMA. The page tables
875 * need roughly 0.5KB per GB.
876 */
877 start = 0x7000;
878 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
879 tables, PAGE_SIZE);
880 if (table_start == -1UL)
881 panic("Cannot find space for the kernel page tables");
882
883 table_start >>= PAGE_SHIFT;
884 table_end = table_start;
885 table_top = table_start + (tables>>PAGE_SHIFT);
886
887 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
888 end, table_start << PAGE_SHIFT,
889 (table_start << PAGE_SHIFT) + tables);
890}
891
892unsigned long __init_refok init_memory_mapping(unsigned long start,
893 unsigned long end)
894{
895 pgd_t *pgd_base = swapper_pg_dir;
896 unsigned long start_pfn, end_pfn;
897 unsigned long big_page_start;
898#ifdef CONFIG_DEBUG_PAGEALLOC
899 /*
900 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
901 * This will simplify cpa(), which otherwise needs to support splitting
902 * large pages into small in interrupt context, etc.
903 */
904 int use_pse = 0;
905#else 851#else
906 int use_pse = cpu_has_pse; 852 start_pfn = 0;
853 end_pfn = max_low_pfn;
907#endif 854#endif
908 855 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
909 /* 856 bootmap);
910 * Find space for the kernel direct mapping tables.
911 */
912 if (!after_init_bootmem)
913 find_early_table_space(end, use_pse);
914
915#ifdef CONFIG_X86_PAE
916 set_nx();
917 if (nx_enabled)
918 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
919#endif
920
921 /* Enable PSE if available */
922 if (cpu_has_pse)
923 set_in_cr4(X86_CR4_PSE);
924
925 /* Enable PGE if available */
926 if (cpu_has_pge) {
927 set_in_cr4(X86_CR4_PGE);
928 __supported_pte_mask |= _PAGE_GLOBAL;
929 }
930
931 /*
932 * Don't use a large page for the first 2/4MB of memory
933 * because there are often fixed size MTRRs in there
934 * and overlapping MTRRs into large pages can cause
935 * slowdowns.
936 */
937 big_page_start = PMD_SIZE;
938
939 if (start < big_page_start) {
940 start_pfn = start >> PAGE_SHIFT;
941 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
942 } else {
943 /* head is not big page alignment ? */
944 start_pfn = start >> PAGE_SHIFT;
945 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
946 << (PMD_SHIFT - PAGE_SHIFT);
947 } 857 }
948 if (start_pfn < end_pfn)
949 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
950
951 /* big page range */
952 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
953 << (PMD_SHIFT - PAGE_SHIFT);
954 if (start_pfn < (big_page_start >> PAGE_SHIFT))
955 start_pfn = big_page_start >> PAGE_SHIFT;
956 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
957 if (start_pfn < end_pfn)
958 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
959 use_pse);
960
961 /* tail is not big page alignment ? */
962 start_pfn = end_pfn;
963 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
964 end_pfn = end >> PAGE_SHIFT;
965 if (start_pfn < end_pfn)
966 kernel_physical_mapping_init(pgd_base, start_pfn,
967 end_pfn, 0);
968 }
969
970 early_ioremap_page_table_range_init(pgd_base);
971 858
972 load_cr3(swapper_pg_dir); 859 after_bootmem = 1;
973
974 __flush_tlb_all();
975
976 if (!after_init_bootmem)
977 reserve_early(table_start << PAGE_SHIFT,
978 table_end << PAGE_SHIFT, "PGTABLE");
979
980 if (!after_init_bootmem)
981 early_memtest(start, end);
982
983 return end >> PAGE_SHIFT;
984} 860}
985 861
986
987/* 862/*
988 * paging_init() sets up the page tables - note that the first 8MB are 863 * paging_init() sets up the page tables - note that the first 8MB are
989 * already mapped by head.S. 864 * already mapped by head.S.
@@ -1217,13 +1092,6 @@ void mark_rodata_ro(void)
1217} 1092}
1218#endif 1093#endif
1219 1094
1220#ifdef CONFIG_BLK_DEV_INITRD
1221void free_initrd_mem(unsigned long start, unsigned long end)
1222{
1223 free_init_pages("initrd memory", start, end);
1224}
1225#endif
1226
1227int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 1095int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1228 int flags) 1096 int flags)
1229{ 1097{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 07f44d491df1..54efa57d1c03 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
48#include <asm/kdebug.h> 48#include <asm/kdebug.h>
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h>
51 52
52/* 53/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
61 62
62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
63 64
64int direct_gbpages
65#ifdef CONFIG_DIRECT_GBPAGES
66 = 1
67#endif
68;
69
70static int __init parse_direct_gbpages_off(char *arg) 65static int __init parse_direct_gbpages_off(char *arg)
71{ 66{
72 direct_gbpages = 0; 67 direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
87 * around without checking the pgd every time. 82 * around without checking the pgd every time.
88 */ 83 */
89 84
90int after_bootmem;
91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask); 86EXPORT_SYMBOL_GPL(__supported_pte_mask);
94 87
95static int do_not_nx __cpuinitdata; 88static int disable_nx __cpuinitdata;
96 89
97/* 90/*
98 * noexec=on|off 91 * noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
107 return -EINVAL; 100 return -EINVAL;
108 if (!strncmp(str, "on", 2)) { 101 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX; 102 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0; 103 disable_nx = 0;
111 } else if (!strncmp(str, "off", 3)) { 104 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1; 105 disable_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX; 106 __supported_pte_mask &= ~_PAGE_NX;
114 } 107 }
115 return 0; 108 return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
121 unsigned long efer; 114 unsigned long efer;
122 115
123 rdmsrl(MSR_EFER, efer); 116 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx) 117 if (!(efer & EFER_NX) || disable_nx)
125 __supported_pte_mask &= ~_PAGE_NX; 118 __supported_pte_mask &= ~_PAGE_NX;
126} 119}
127 120
@@ -325,13 +318,9 @@ void __init cleanup_highmap(void)
325 } 318 }
326} 319}
327 320
328static unsigned long __initdata table_start;
329static unsigned long __meminitdata table_end;
330static unsigned long __meminitdata table_top;
331
332static __ref void *alloc_low_page(unsigned long *phys) 321static __ref void *alloc_low_page(unsigned long *phys)
333{ 322{
334 unsigned long pfn = table_end++; 323 unsigned long pfn = e820_table_end++;
335 void *adr; 324 void *adr;
336 325
337 if (after_bootmem) { 326 if (after_bootmem) {
@@ -341,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
341 return adr; 330 return adr;
342 } 331 }
343 332
344 if (pfn >= table_top) 333 if (pfn >= e820_table_top)
345 panic("alloc_low_page: ran out of memory"); 334 panic("alloc_low_page: ran out of memory");
346 335
347 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 336 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -581,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
581 return phys_pud_init(pud, addr, end, page_size_mask); 570 return phys_pud_init(pud, addr, end, page_size_mask);
582} 571}
583 572
584static void __init find_early_table_space(unsigned long end, int use_pse, 573unsigned long __init
585 int use_gbpages) 574kernel_physical_mapping_init(unsigned long start,
586{ 575 unsigned long end,
587 unsigned long puds, pmds, ptes, tables, start; 576 unsigned long page_size_mask)
588
589 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
590 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
591 if (use_gbpages) {
592 unsigned long extra;
593 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
594 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
595 } else
596 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
597 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
598
599 if (use_pse) {
600 unsigned long extra;
601 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
602 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
603 } else
604 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
605 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
606
607 /*
608 * RED-PEN putting page tables only on node 0 could
609 * cause a hotspot and fill up ZONE_DMA. The page tables
610 * need roughly 0.5KB per GB.
611 */
612 start = 0x8000;
613 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
614 if (table_start == -1UL)
615 panic("Cannot find space for the kernel page tables");
616
617 table_start >>= PAGE_SHIFT;
618 table_end = table_start;
619 table_top = table_start + (tables >> PAGE_SHIFT);
620
621 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
622 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
623}
624
625static void __init init_gbpages(void)
626{
627 if (direct_gbpages && cpu_has_gbpages)
628 printk(KERN_INFO "Using GB pages for direct mapping\n");
629 else
630 direct_gbpages = 0;
631}
632
633static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
634 unsigned long end,
635 unsigned long page_size_mask)
636{ 577{
637 578
638 unsigned long next, last_map_addr = end; 579 unsigned long next, last_map_addr = end;
@@ -669,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
669 return last_map_addr; 610 return last_map_addr;
670} 611}
671 612
672struct map_range {
673 unsigned long start;
674 unsigned long end;
675 unsigned page_size_mask;
676};
677
678#define NR_RANGE_MR 5
679
680static int save_mr(struct map_range *mr, int nr_range,
681 unsigned long start_pfn, unsigned long end_pfn,
682 unsigned long page_size_mask)
683{
684
685 if (start_pfn < end_pfn) {
686 if (nr_range >= NR_RANGE_MR)
687 panic("run out of range for init_memory_mapping\n");
688 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
689 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
690 mr[nr_range].page_size_mask = page_size_mask;
691 nr_range++;
692 }
693
694 return nr_range;
695}
696
697/*
698 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
699 * This runs before bootmem is initialized and gets pages directly from
700 * the physical memory. To access them they are temporarily mapped.
701 */
702unsigned long __init_refok init_memory_mapping(unsigned long start,
703 unsigned long end)
704{
705 unsigned long last_map_addr = 0;
706 unsigned long page_size_mask = 0;
707 unsigned long start_pfn, end_pfn;
708 unsigned long pos;
709
710 struct map_range mr[NR_RANGE_MR];
711 int nr_range, i;
712 int use_pse, use_gbpages;
713
714 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
715
716 /*
717 * Find space for the kernel direct mapping tables.
718 *
719 * Later we should allocate these tables in the local node of the
720 * memory mapped. Unfortunately this is done currently before the
721 * nodes are discovered.
722 */
723 if (!after_bootmem)
724 init_gbpages();
725
726#ifdef CONFIG_DEBUG_PAGEALLOC
727 /*
728 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
729 * This will simplify cpa(), which otherwise needs to support splitting
730 * large pages into small in interrupt context, etc.
731 */
732 use_pse = use_gbpages = 0;
733#else
734 use_pse = cpu_has_pse;
735 use_gbpages = direct_gbpages;
736#endif
737
738 if (use_gbpages)
739 page_size_mask |= 1 << PG_LEVEL_1G;
740 if (use_pse)
741 page_size_mask |= 1 << PG_LEVEL_2M;
742
743 memset(mr, 0, sizeof(mr));
744 nr_range = 0;
745
746 /* head if not big page alignment ?*/
747 start_pfn = start >> PAGE_SHIFT;
748 pos = start_pfn << PAGE_SHIFT;
749 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
750 << (PMD_SHIFT - PAGE_SHIFT);
751 if (end_pfn > (end >> PAGE_SHIFT))
752 end_pfn = end >> PAGE_SHIFT;
753 if (start_pfn < end_pfn) {
754 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
755 pos = end_pfn << PAGE_SHIFT;
756 }
757
758 /* big page (2M) range*/
759 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
760 << (PMD_SHIFT - PAGE_SHIFT);
761 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
762 << (PUD_SHIFT - PAGE_SHIFT);
763 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
764 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
765 if (start_pfn < end_pfn) {
766 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
767 page_size_mask & (1<<PG_LEVEL_2M));
768 pos = end_pfn << PAGE_SHIFT;
769 }
770
771 /* big page (1G) range */
772 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
773 << (PUD_SHIFT - PAGE_SHIFT);
774 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
775 if (start_pfn < end_pfn) {
776 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
777 page_size_mask &
778 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
779 pos = end_pfn << PAGE_SHIFT;
780 }
781
782 /* tail is not big page (1G) alignment */
783 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
784 << (PMD_SHIFT - PAGE_SHIFT);
785 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
786 if (start_pfn < end_pfn) {
787 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
788 page_size_mask & (1<<PG_LEVEL_2M));
789 pos = end_pfn << PAGE_SHIFT;
790 }
791
792 /* tail is not big page (2M) alignment */
793 start_pfn = pos>>PAGE_SHIFT;
794 end_pfn = end>>PAGE_SHIFT;
795 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
796
797 /* try to merge same page size and continuous */
798 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
799 unsigned long old_start;
800 if (mr[i].end != mr[i+1].start ||
801 mr[i].page_size_mask != mr[i+1].page_size_mask)
802 continue;
803 /* move it */
804 old_start = mr[i].start;
805 memmove(&mr[i], &mr[i+1],
806 (nr_range - 1 - i) * sizeof (struct map_range));
807 mr[i--].start = old_start;
808 nr_range--;
809 }
810
811 for (i = 0; i < nr_range; i++)
812 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
813 mr[i].start, mr[i].end,
814 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
815 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
816
817 if (!after_bootmem)
818 find_early_table_space(end, use_pse, use_gbpages);
819
820 for (i = 0; i < nr_range; i++)
821 last_map_addr = kernel_physical_mapping_init(
822 mr[i].start, mr[i].end,
823 mr[i].page_size_mask);
824
825 if (!after_bootmem)
826 mmu_cr4_features = read_cr4();
827 __flush_tlb_all();
828
829 if (!after_bootmem && table_end > table_start)
830 reserve_early(table_start << PAGE_SHIFT,
831 table_end << PAGE_SHIFT, "PGTABLE");
832
833 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
834 last_map_addr, end);
835
836 if (!after_bootmem)
837 early_memtest(start, end);
838
839 return last_map_addr >> PAGE_SHIFT;
840}
841
842#ifndef CONFIG_NUMA 613#ifndef CONFIG_NUMA
843void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 614void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
844{ 615{
@@ -910,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
910 681
911#endif /* CONFIG_MEMORY_HOTPLUG */ 682#endif /* CONFIG_MEMORY_HOTPLUG */
912 683
913/*
914 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
915 * is valid. The argument is a physical page number.
916 *
917 *
918 * On x86, access has to be given to the first megabyte of ram because that area
919 * contains bios code and data regions used by X and dosemu and similar apps.
920 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
921 * mmio resources as well as potential bios/acpi data regions.
922 */
923int devmem_is_allowed(unsigned long pagenr)
924{
925 if (pagenr <= 256)
926 return 1;
927 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
928 return 0;
929 if (!page_is_ram(pagenr))
930 return 1;
931 return 0;
932}
933
934
935static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 684static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
936 kcore_modules, kcore_vsyscall; 685 kcore_modules, kcore_vsyscall;
937 686
@@ -1019,13 +768,6 @@ void mark_rodata_ro(void)
1019 768
1020#endif 769#endif
1021 770
1022#ifdef CONFIG_BLK_DEV_INITRD
1023void free_initrd_mem(unsigned long start, unsigned long end)
1024{
1025 free_init_pages("initrd memory", start, end);
1026}
1027#endif
1028
1029int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 771int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1030 int flags) 772 int flags)
1031{ 773{
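As a reading aid for the range-splitting code near the top of this file's diff (the head and tail that are not 2M aligned get 4K pages, the aligned middle gets large pages): a minimal user-space sketch of the same arithmetic, with the 1G step omitted for brevity. The constants mirror x86 and save_range() is an illustrative stand-in for save_mr(), not kernel API.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21			/* 2M pages */
#define PMD_SIZE   (1UL << PMD_SHIFT)

/* illustrative stand-in for save_mr(): print one sub-range and its page size */
static void save_range(unsigned long s_pfn, unsigned long e_pfn, const char *sz)
{
	if (s_pfn < e_pfn)
		printf(" %010lx - %010lx page %s\n",
		       s_pfn << PAGE_SHIFT, e_pfn << PAGE_SHIFT, sz);
}

int main(void)
{
	unsigned long start = 0x00100000UL, end = 0x80000000UL;	/* 1MB .. 2GB */
	unsigned long pos = start, start_pfn, end_pfn;

	/* head: up to the first 2M boundary, mapped with 4K pages */
	start_pfn = pos >> PAGE_SHIFT;
	end_pfn = ((pos + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (end_pfn > (end >> PAGE_SHIFT))
		end_pfn = end >> PAGE_SHIFT;
	if (start_pfn < end_pfn) {
		save_range(start_pfn, end_pfn, "4k");
		pos = end_pfn << PAGE_SHIFT;
	}

	/* body: whole 2M units up to the last 2M boundary below 'end' */
	start_pfn = ((pos + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn) {
		save_range(start_pfn, end_pfn, "2M");
		pos = end_pfn << PAGE_SHIFT;
	}

	/* tail: whatever is left after the last 2M boundary, 4K pages again */
	save_range(pos >> PAGE_SHIFT, end >> PAGE_SHIFT, "4k");
	return 0;
}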
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff42..699c9b2895ae 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -31,16 +31,27 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
31} 31}
32EXPORT_SYMBOL_GPL(is_io_mapping_possible); 32EXPORT_SYMBOL_GPL(is_io_mapping_possible);
33 33
34/* Map 'pfn' using fixed map 'type' and protections 'prot' 34void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
35 */
36void *
37iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
38{ 35{
39 enum fixed_addresses idx; 36 enum fixed_addresses idx;
40 unsigned long vaddr; 37 unsigned long vaddr;
41 38
42 pagefault_disable(); 39 pagefault_disable();
43 40
41 idx = type + KM_TYPE_NR * smp_processor_id();
42 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
43 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
44 arch_flush_lazy_mmu_mode();
45
46 return (void *)vaddr;
47}
48
49/*
50 * Map 'pfn' using fixed map 'type' and protections 'prot'
51 */
52void *
53iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
54{
44 /* 55 /*
45 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 56 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
46 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the 57 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +61,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
50 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 61 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
51 prot = PAGE_KERNEL_UC_MINUS; 62 prot = PAGE_KERNEL_UC_MINUS;
52 63
53 idx = type + KM_TYPE_NR*smp_processor_id(); 64 return kmap_atomic_prot_pfn(pfn, type, prot);
54 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
55 set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
56 arch_flush_lazy_mmu_mode();
57
58 return (void*) vaddr;
59} 65}
60EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 66EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
61 67
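The hunk above splits the fixmap PTE setup out of iomap_atomic_prot_pfn() into kmap_atomic_prot_pfn(), keeping only the PAT fix-up in the iomap wrapper. A minimal usage sketch, assuming iounmap_atomic() in the same file is the matching teardown helper; poke_one_pfn() and the register write are made up for illustration.

static void poke_one_pfn(unsigned long pfn)
{
	void *vaddr;

	/* write-combined mapping of one MMIO page in the KM_USER0 slot */
	vaddr = iomap_atomic_prot_pfn(pfn, KM_USER0, PAGE_KERNEL_WC);
	writel(0x1, (void __iomem *)vaddr);	/* example device register write */
	iounmap_atomic(vaddr, KM_USER0);	/* assumed counterpart, drops the fixmap PTE */
}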
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648a..0dfa09d69e80 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,13 +22,17 @@
22#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
23#include <asm/pat.h> 23#include <asm/pat.h>
24 24
25#ifdef CONFIG_X86_64 25static inline int phys_addr_valid(resource_size_t addr)
26
27static inline int phys_addr_valid(unsigned long addr)
28{ 26{
29 return addr < (1UL << boot_cpu_data.x86_phys_bits); 27#ifdef CONFIG_PHYS_ADDR_T_64BIT
28 return !(addr >> boot_cpu_data.x86_phys_bits);
29#else
30 return 1;
31#endif
30} 32}
31 33
34#ifdef CONFIG_X86_64
35
32unsigned long __phys_addr(unsigned long x) 36unsigned long __phys_addr(unsigned long x)
33{ 37{
34 if (x >= __START_KERNEL_map) { 38 if (x >= __START_KERNEL_map) {
@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long x)
38 } else { 42 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 43 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET; 44 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : 45 VIRTUAL_BUG_ON(!phys_addr_valid(x));
42 !phys_addr_valid(x));
43 } 46 }
44 return x; 47 return x;
45} 48}
@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x)
56 if (x < PAGE_OFFSET) 59 if (x < PAGE_OFFSET)
57 return false; 60 return false;
58 x -= PAGE_OFFSET; 61 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ? 62 if (!phys_addr_valid(x))
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false; 63 return false;
62 }
63 } 64 }
64 65
65 return pfn_valid(x >> PAGE_SHIFT); 66 return pfn_valid(x >> PAGE_SHIFT);
@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
68 69
69#else 70#else
70 71
71static inline int phys_addr_valid(unsigned long addr)
72{
73 return 1;
74}
75
76#ifdef CONFIG_DEBUG_VIRTUAL 72#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x) 73unsigned long __phys_addr(unsigned long x)
78{ 74{
79 /* VMALLOC_* aren't constants; not available at the boot time */ 75 /* VMALLOC_* aren't constants */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 76 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && 77 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET; 78 return x - PAGE_OFFSET;
84} 79}
85EXPORT_SYMBOL(__phys_addr); 80EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x)
89{ 84{
90 if (x < PAGE_OFFSET) 85 if (x < PAGE_OFFSET)
91 return false; 86 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) 87 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
88 return false;
89 if (x >= FIXADDR_START)
93 return false; 90 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); 91 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95} 92}
@@ -508,13 +505,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
508 return &bm_pte[pte_index(addr)]; 505 return &bm_pte[pte_index(addr)];
509} 506}
510 507
508static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
509
511void __init early_ioremap_init(void) 510void __init early_ioremap_init(void)
512{ 511{
513 pmd_t *pmd; 512 pmd_t *pmd;
513 int i;
514 514
515 if (early_ioremap_debug) 515 if (early_ioremap_debug)
516 printk(KERN_INFO "early_ioremap_init()\n"); 516 printk(KERN_INFO "early_ioremap_init()\n");
517 517
518 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
519 slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
520
518 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); 521 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
519 memset(bm_pte, 0, sizeof(bm_pte)); 522 memset(bm_pte, 0, sizeof(bm_pte));
520 pmd_populate_kernel(&init_mm, pmd, bm_pte); 523 pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -581,6 +584,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
581 584
582static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; 585static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
583static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; 586static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
587
584static int __init check_early_ioremap_leak(void) 588static int __init check_early_ioremap_leak(void)
585{ 589{
586 int count = 0; 590 int count = 0;
@@ -602,7 +606,8 @@ static int __init check_early_ioremap_leak(void)
602} 606}
603late_initcall(check_early_ioremap_leak); 607late_initcall(check_early_ioremap_leak);
604 608
605static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) 609static void __init __iomem *
610__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
606{ 611{
607 unsigned long offset, last_addr; 612 unsigned long offset, last_addr;
608 unsigned int nrpages; 613 unsigned int nrpages;
@@ -668,9 +673,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
668 --nrpages; 673 --nrpages;
669 } 674 }
670 if (early_ioremap_debug) 675 if (early_ioremap_debug)
671 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 676 printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
672 677
673 prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); 678 prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
674 return prev_map[slot]; 679 return prev_map[slot];
675} 680}
676 681
@@ -738,8 +743,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
738 } 743 }
739 prev_map[slot] = NULL; 744 prev_map[slot] = NULL;
740} 745}
741
742void __this_fixmap_does_not_exist(void)
743{
744 WARN_ON(1);
745}
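The ioremap.c hunks above replace the SYSTEM_BOOTING special case with a single phys_addr_valid() that only looks at boot_cpu_data.x86_phys_bits. A tiny standalone illustration of that check; the names below are made up, not kernel API.

/* an address is valid when no bits are set above the CPU's physical width */
static inline int phys_addr_valid_sketch(unsigned long long addr, int phys_bits)
{
	return !(addr >> phys_bits);
}

/*
 * With phys_bits == 36 (a common value), 0xfffffffffULL (64GB - 1) passes and
 * (1ULL << 36) is rejected; on !CONFIG_PHYS_ADDR_T_64BIT builds the kernel
 * version above simply returns 1.
 */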
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 6a518dd08a36..4f115e00486b 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); 310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
311 311
312 if (!ctx->active) { 312 if (!ctx->active) {
313 pr_warning("kmmio: spurious debug trap on CPU %d.\n", 313 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
314 smp_processor_id()); 314 smp_processor_id());
315 goto out; 315 goto out;
316 } 316 }
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd7883d036..605c8be06217 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
100{ 100{
101 if (arg) 101 if (arg)
102 memtest_pattern = simple_strtoul(arg, NULL, 0); 102 memtest_pattern = simple_strtoul(arg, NULL, 0);
103 else
104 memtest_pattern = ARRAY_SIZE(patterns);
105
103 return 0; 106 return 0;
104} 107}
105 108
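With the memtest.c change a bare "memtest" boot parameter selects every pattern, while "memtest=N" still limits the run. A one-line restatement of that rule; the helper name is illustrative.

/* bare "memtest" now means "run every pattern"; "memtest=N" limits it to N */
static unsigned long memtest_patterns_to_run(char *arg, unsigned long npatterns)
{
	return arg ? simple_strtoul(arg, NULL, 0) : npatterns;
}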
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95a0352..3daefa04ace5 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
416 for_each_online_node(nid) 416 for_each_online_node(nid)
417 propagate_e820_map_node(nid); 417 propagate_e820_map_node(nid);
418 418
419 for_each_online_node(nid) 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
422 }
421 423
422 NODE_DATA(0)->bdata = &bootmem_node_data[0];
423 setup_bootmem_allocator(); 424 setup_bootmem_allocator();
424} 425}
425 426
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 9c4294986af7..d71e1b636ce6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
16#include <asm/processor.h> 16#include <asm/processor.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/sections.h> 18#include <asm/sections.h>
19#include <asm/setup.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include <asm/pgalloc.h> 21#include <asm/pgalloc.h>
21#include <asm/proto.h> 22#include <asm/proto.h>
@@ -33,6 +34,7 @@ struct cpa_data {
33 unsigned long pfn; 34 unsigned long pfn;
34 unsigned force_split : 1; 35 unsigned force_split : 1;
35 int curpage; 36 int curpage;
37 struct page **pages;
36}; 38};
37 39
38/* 40/*
@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock);
45 47
46#define CPA_FLUSHTLB 1 48#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2 49#define CPA_ARRAY 2
50#define CPA_PAGES_ARRAY 4
48 51
49#ifdef CONFIG_PROC_FS 52#ifdef CONFIG_PROC_FS
50static unsigned long direct_pages_count[PG_LEVEL_NUM]; 53static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -95,7 +98,7 @@ static inline unsigned long highmap_start_pfn(void)
95 98
96static inline unsigned long highmap_end_pfn(void) 99static inline unsigned long highmap_end_pfn(void)
97{ 100{
98 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 101 return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
99} 102}
100 103
101#endif 104#endif
@@ -201,10 +204,10 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
201 } 204 }
202} 205}
203 206
204static void cpa_flush_array(unsigned long *start, int numpages, int cache) 207static void cpa_flush_array(unsigned long *start, int numpages, int cache,
208 int in_flags, struct page **pages)
205{ 209{
206 unsigned int i, level; 210 unsigned int i, level;
207 unsigned long *addr;
208 211
209 BUG_ON(irqs_disabled()); 212 BUG_ON(irqs_disabled());
210 213
@@ -225,14 +228,22 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache)
225 * will cause all other CPUs to flush the same 228 * will cause all other CPUs to flush the same
226 * cachelines: 229 * cachelines:
227 */ 230 */
228 for (i = 0, addr = start; i < numpages; i++, addr++) { 231 for (i = 0; i < numpages; i++) {
229 pte_t *pte = lookup_address(*addr, &level); 232 unsigned long addr;
233 pte_t *pte;
234
235 if (in_flags & CPA_PAGES_ARRAY)
236 addr = (unsigned long)page_address(pages[i]);
237 else
238 addr = start[i];
239
240 pte = lookup_address(addr, &level);
230 241
231 /* 242 /*
232 * Only flush present addresses: 243 * Only flush present addresses:
233 */ 244 */
234 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 245 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
235 clflush_cache_range((void *) *addr, PAGE_SIZE); 246 clflush_cache_range((void *)addr, PAGE_SIZE);
236 } 247 }
237} 248}
238 249
@@ -584,7 +595,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
584 unsigned int level; 595 unsigned int level;
585 pte_t *kpte, old_pte; 596 pte_t *kpte, old_pte;
586 597
587 if (cpa->flags & CPA_ARRAY) 598 if (cpa->flags & CPA_PAGES_ARRAY)
599 address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
600 else if (cpa->flags & CPA_ARRAY)
588 address = cpa->vaddr[cpa->curpage]; 601 address = cpa->vaddr[cpa->curpage];
589 else 602 else
590 address = *cpa->vaddr; 603 address = *cpa->vaddr;
@@ -687,7 +700,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
687 * No need to redo, when the primary call touched the direct 700 * No need to redo, when the primary call touched the direct
688 * mapping already: 701 * mapping already:
689 */ 702 */
690 if (cpa->flags & CPA_ARRAY) 703 if (cpa->flags & CPA_PAGES_ARRAY)
704 vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
705 else if (cpa->flags & CPA_ARRAY)
691 vaddr = cpa->vaddr[cpa->curpage]; 706 vaddr = cpa->vaddr[cpa->curpage];
692 else 707 else
693 vaddr = *cpa->vaddr; 708 vaddr = *cpa->vaddr;
@@ -698,7 +713,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
698 alias_cpa = *cpa; 713 alias_cpa = *cpa;
699 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 714 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
700 alias_cpa.vaddr = &temp_cpa_vaddr; 715 alias_cpa.vaddr = &temp_cpa_vaddr;
701 alias_cpa.flags &= ~CPA_ARRAY; 716 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
702 717
703 718
704 ret = __change_page_attr_set_clr(&alias_cpa, 0); 719 ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -711,7 +726,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
711 * No need to redo, when the primary call touched the high 726 * No need to redo, when the primary call touched the high
712 * mapping already: 727 * mapping already:
713 */ 728 */
714 if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) 729 if (within(vaddr, (unsigned long) _text, _brk_end))
715 return 0; 730 return 0;
716 731
717 /* 732 /*
@@ -724,7 +739,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
724 alias_cpa = *cpa; 739 alias_cpa = *cpa;
725 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 740 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
726 alias_cpa.vaddr = &temp_cpa_vaddr; 741 alias_cpa.vaddr = &temp_cpa_vaddr;
727 alias_cpa.flags &= ~CPA_ARRAY; 742 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
728 743
729 /* 744 /*
730 * The high mapping range is imprecise, so ignore the return value. 745 * The high mapping range is imprecise, so ignore the return value.
@@ -745,7 +760,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
745 */ 760 */
746 cpa->numpages = numpages; 761 cpa->numpages = numpages;
747 /* for array changes, we can't use large page */ 762 /* for array changes, we can't use large page */
748 if (cpa->flags & CPA_ARRAY) 763 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
749 cpa->numpages = 1; 764 cpa->numpages = 1;
750 765
751 if (!debug_pagealloc) 766 if (!debug_pagealloc)
@@ -769,7 +784,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
769 */ 784 */
770 BUG_ON(cpa->numpages > numpages); 785 BUG_ON(cpa->numpages > numpages);
771 numpages -= cpa->numpages; 786 numpages -= cpa->numpages;
772 if (cpa->flags & CPA_ARRAY) 787 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
773 cpa->curpage++; 788 cpa->curpage++;
774 else 789 else
775 *cpa->vaddr += cpa->numpages * PAGE_SIZE; 790 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
@@ -786,7 +801,8 @@ static inline int cache_attr(pgprot_t attr)
786 801
787static int change_page_attr_set_clr(unsigned long *addr, int numpages, 802static int change_page_attr_set_clr(unsigned long *addr, int numpages,
788 pgprot_t mask_set, pgprot_t mask_clr, 803 pgprot_t mask_set, pgprot_t mask_clr,
789 int force_split, int array) 804 int force_split, int in_flag,
805 struct page **pages)
790{ 806{
791 struct cpa_data cpa; 807 struct cpa_data cpa;
792 int ret, cache, checkalias; 808 int ret, cache, checkalias;
@@ -801,15 +817,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
801 return 0; 817 return 0;
802 818
803 /* Ensure we are PAGE_SIZE aligned */ 819 /* Ensure we are PAGE_SIZE aligned */
804 if (!array) { 820 if (in_flag & CPA_ARRAY) {
805 if (*addr & ~PAGE_MASK) {
806 *addr &= PAGE_MASK;
807 /*
808 * People should not be passing in unaligned addresses:
809 */
810 WARN_ON_ONCE(1);
811 }
812 } else {
813 int i; 821 int i;
814 for (i = 0; i < numpages; i++) { 822 for (i = 0; i < numpages; i++) {
815 if (addr[i] & ~PAGE_MASK) { 823 if (addr[i] & ~PAGE_MASK) {
@@ -817,6 +825,18 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
817 WARN_ON_ONCE(1); 825 WARN_ON_ONCE(1);
818 } 826 }
819 } 827 }
828 } else if (!(in_flag & CPA_PAGES_ARRAY)) {
829 /*
830 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
831 * No need to cehck in that case
832 */
833 if (*addr & ~PAGE_MASK) {
834 *addr &= PAGE_MASK;
835 /*
836 * People should not be passing in unaligned addresses:
837 */
838 WARN_ON_ONCE(1);
839 }
820 } 840 }
821 841
822 /* Must avoid aliasing mappings in the highmem code */ 842 /* Must avoid aliasing mappings in the highmem code */
@@ -832,6 +852,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
832 arch_flush_lazy_mmu_mode(); 852 arch_flush_lazy_mmu_mode();
833 853
834 cpa.vaddr = addr; 854 cpa.vaddr = addr;
855 cpa.pages = pages;
835 cpa.numpages = numpages; 856 cpa.numpages = numpages;
836 cpa.mask_set = mask_set; 857 cpa.mask_set = mask_set;
837 cpa.mask_clr = mask_clr; 858 cpa.mask_clr = mask_clr;
@@ -839,8 +860,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
839 cpa.curpage = 0; 860 cpa.curpage = 0;
840 cpa.force_split = force_split; 861 cpa.force_split = force_split;
841 862
842 if (array) 863 if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
843 cpa.flags |= CPA_ARRAY; 864 cpa.flags |= in_flag;
844 865
845 /* No alias checking for _NX bit modifications */ 866 /* No alias checking for _NX bit modifications */
846 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 867 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -866,9 +887,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
866	 * wbinvd): 887	 * wbinvd):
867 */ 888 */
868 if (!ret && cpu_has_clflush) { 889 if (!ret && cpu_has_clflush) {
869 if (cpa.flags & CPA_ARRAY) 890 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
870 cpa_flush_array(addr, numpages, cache); 891 cpa_flush_array(addr, numpages, cache,
871 else 892 cpa.flags, pages);
893 } else
872 cpa_flush_range(*addr, numpages, cache); 894 cpa_flush_range(*addr, numpages, cache);
873 } else 895 } else
874 cpa_flush_all(cache); 896 cpa_flush_all(cache);
@@ -888,14 +910,28 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages,
888 pgprot_t mask, int array) 910 pgprot_t mask, int array)
889{ 911{
890 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, 912 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
891 array); 913 (array ? CPA_ARRAY : 0), NULL);
892} 914}
893 915
894static inline int change_page_attr_clear(unsigned long *addr, int numpages, 916static inline int change_page_attr_clear(unsigned long *addr, int numpages,
895 pgprot_t mask, int array) 917 pgprot_t mask, int array)
896{ 918{
897 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, 919 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
898 array); 920 (array ? CPA_ARRAY : 0), NULL);
921}
922
923static inline int cpa_set_pages_array(struct page **pages, int numpages,
924 pgprot_t mask)
925{
926 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
927 CPA_PAGES_ARRAY, pages);
928}
929
930static inline int cpa_clear_pages_array(struct page **pages, int numpages,
931 pgprot_t mask)
932{
933 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
934 CPA_PAGES_ARRAY, pages);
899} 935}
900 936
901int _set_memory_uc(unsigned long addr, int numpages) 937int _set_memory_uc(unsigned long addr, int numpages)
@@ -1043,7 +1079,7 @@ int set_memory_np(unsigned long addr, int numpages)
1043int set_memory_4k(unsigned long addr, int numpages) 1079int set_memory_4k(unsigned long addr, int numpages)
1044{ 1080{
1045 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 1081 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1046 __pgprot(0), 1, 0); 1082 __pgprot(0), 1, 0, NULL);
1047} 1083}
1048 1084
1049int set_pages_uc(struct page *page, int numpages) 1085int set_pages_uc(struct page *page, int numpages)
@@ -1054,6 +1090,35 @@ int set_pages_uc(struct page *page, int numpages)
1054} 1090}
1055EXPORT_SYMBOL(set_pages_uc); 1091EXPORT_SYMBOL(set_pages_uc);
1056 1092
1093int set_pages_array_uc(struct page **pages, int addrinarray)
1094{
1095 unsigned long start;
1096 unsigned long end;
1097 int i;
1098 int free_idx;
1099
1100 for (i = 0; i < addrinarray; i++) {
1101 start = (unsigned long)page_address(pages[i]);
1102 end = start + PAGE_SIZE;
1103 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
1104 goto err_out;
1105 }
1106
1107 if (cpa_set_pages_array(pages, addrinarray,
1108 __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
1109 return 0; /* Success */
1110 }
1111err_out:
1112 free_idx = i;
1113 for (i = 0; i < free_idx; i++) {
1114 start = (unsigned long)page_address(pages[i]);
1115 end = start + PAGE_SIZE;
1116 free_memtype(start, end);
1117 }
1118 return -EINVAL;
1119}
1120EXPORT_SYMBOL(set_pages_array_uc);
1121
1057int set_pages_wb(struct page *page, int numpages) 1122int set_pages_wb(struct page *page, int numpages)
1058{ 1123{
1059 unsigned long addr = (unsigned long)page_address(page); 1124 unsigned long addr = (unsigned long)page_address(page);
@@ -1062,6 +1127,26 @@ int set_pages_wb(struct page *page, int numpages)
1062} 1127}
1063EXPORT_SYMBOL(set_pages_wb); 1128EXPORT_SYMBOL(set_pages_wb);
1064 1129
1130int set_pages_array_wb(struct page **pages, int addrinarray)
1131{
1132 int retval;
1133 unsigned long start;
1134 unsigned long end;
1135 int i;
1136
1137 retval = cpa_clear_pages_array(pages, addrinarray,
1138 __pgprot(_PAGE_CACHE_MASK));
1139
1140 for (i = 0; i < addrinarray; i++) {
1141 start = (unsigned long)page_address(pages[i]);
1142 end = start + PAGE_SIZE;
1143 free_memtype(start, end);
1144 }
1145
1146 return retval;
1147}
1148EXPORT_SYMBOL(set_pages_array_wb);
1149
1065int set_pages_x(struct page *page, int numpages) 1150int set_pages_x(struct page *page, int numpages)
1066{ 1151{
1067 unsigned long addr = (unsigned long)page_address(page); 1152 unsigned long addr = (unsigned long)page_address(page);
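set_pages_array_uc()/set_pages_array_wb() added above let a driver change the cache attribute of a whole page array in one CPA pass instead of per-page calls. A hedged usage sketch; the drv_* wrappers are placeholders, only the two set_pages_array_*() calls come from this patch.

/* sketch: flip a driver-owned page array to uncached-minus and back */
static int drv_buffer_make_uc(struct page **pages, int npages)
{
	/* reserves a UC- memtype for each page and does one CPA pass;
	 * on failure the helper has already rolled the reservations back */
	return set_pages_array_uc(pages, npages);
}

static void drv_buffer_make_wb(struct page **pages, int npages)
{
	/* clears the cache-attribute PTE bits and frees the memtype again */
	set_pages_array_wb(pages, npages);
}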
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2ed37158012d..640339ee4fb2 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -677,10 +677,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
677 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 677 is_ram = pat_pagerange_is_ram(paddr, paddr + size);
678 678
679 /* 679 /*
680 * reserve_pfn_range() doesn't support RAM pages. 680 * reserve_pfn_range() doesn't support RAM pages. Maintain the current
681 * behavior with RAM pages by returning success.
681 */ 682 */
682 if (is_ram != 0) 683 if (is_ram != 0)
683 return -EINVAL; 684 return 0;
684 685
685 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); 686 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
686 if (ret) 687 if (ret)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index f2e477c91c1b..46c8834aedc0 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -50,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
50 } 50 }
51 pte = pte_offset_kernel(pmd, vaddr); 51 pte = pte_offset_kernel(pmd, vaddr);
52 if (pte_val(pteval)) 52 if (pte_val(pteval))
53 set_pte_present(&init_mm, vaddr, pte, pteval); 53 set_pte_at(&init_mm, vaddr, pte, pteval);
54 else 54 else
55 pte_clear(&init_mm, vaddr, pte); 55 pte_clear(&init_mm, vaddr, pte);
56 56
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a654d59e4483..821e97017e95 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,11 +187,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
187 cpumask, cpumask_of(smp_processor_id())); 187 cpumask, cpumask_of(smp_processor_id()));
188 188
189 /* 189 /*
190 * Make the above memory operations globally visible before
191 * sending the IPI.
192 */
193 smp_mb();
194 /*
195 * We have to send the IPI only to 190 * We have to send the IPI only to
196 * CPUs affected. 191 * CPUs affected.
197 */ 192 */
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 82d22fc601ae..8c362b96b644 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
90 return 0; 90 return 0;
91} 91}
92 92
93static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = { 93static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
94/* 94/*
95 * Systems where PCI IO resource ISA alignment can be skipped 95 * Systems where PCI IO resource ISA alignment can be skipped
96 * when the ISA enable bit in the bridge control is not set 96 * when the ISA enable bit in the bridge control is not set
@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
183} 183}
184#endif 184#endif
185 185
186static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { 186static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
187#ifdef __i386__ 187#ifdef __i386__
188/* 188/*
189 * Laptops which need pci=assign-busses to see Cardbus cards 189 * Laptops which need pci=assign-busses to see Cardbus cards
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 7d388d5cf548..9c49919e4d1c 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
356DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); 356DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
357 357
358 358
359static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = { 359static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
360 { 360 {
361 .ident = "MSI-K8T-Neo2Fir", 361 .ident = "MSI-K8T-Neo2Fir",
362 .matches = { 362 .matches = {
@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
413 */ 413 */
414static u16 toshiba_line_size; 414static u16 toshiba_line_size;
415 415
416static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = { 416static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = {
417 { 417 {
418 .ident = "Toshiba PS5 based laptop", 418 .ident = "Toshiba PS5 based laptop",
419 .matches = { 419 .matches = {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5ead808dd70c..f234a37bd428 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -319,6 +319,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
319 return -EINVAL; 319 return -EINVAL;
320 } 320 }
321 flags = new_flags; 321 flags = new_flags;
322 vma->vm_page_prot = __pgprot(
323 (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK) |
324 flags);
322 } 325 }
323 326
324 if (((vma->vm_pgoff < max_low_pfn_mapped) || 327 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index cb6afa4ec95c..db3802fb7b84 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1723,9 +1723,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1723{ 1723{
1724 pmd_t *kernel_pmd; 1724 pmd_t *kernel_pmd;
1725 1725
1726 init_pg_tables_start = __pa(pgd); 1726 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1727 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1727 xen_start_info->nr_pt_frames * PAGE_SIZE +
1728 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); 1728 512*1024);
1729 1729
1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1870,7 +1870,6 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1870 1870
1871#ifdef CONFIG_X86_PAE 1871#ifdef CONFIG_X86_PAE
1872 .set_pte_atomic = xen_set_pte_atomic, 1872 .set_pte_atomic = xen_set_pte_atomic,
1873 .set_pte_present = xen_set_pte_at,
1874 .pte_clear = xen_pte_clear, 1873 .pte_clear = xen_pte_clear,
1875 .pmd_clear = xen_pmd_clear, 1874 .pmd_clear = xen_pmd_clear,
1876#endif /* CONFIG_X86_PAE */ 1875#endif /* CONFIG_X86_PAE */
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 5f333403c2ea..d313039e2fdf 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -31,6 +31,8 @@
31#include <linux/iova.h> 31#include <linux/iova.h>
32#include <linux/intel-iommu.h> 32#include <linux/intel-iommu.h>
33#include <linux/timer.h> 33#include <linux/timer.h>
34#include <linux/irq.h>
35#include <linux/interrupt.h>
34 36
35#undef PREFIX 37#undef PREFIX
36#define PREFIX "DMAR:" 38#define PREFIX "DMAR:"
@@ -509,6 +511,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
509 return -ENOMEM; 511 return -ENOMEM;
510 512
511 iommu->seq_id = iommu_allocated++; 513 iommu->seq_id = iommu_allocated++;
514	sprintf(iommu->name, "dmar%d", iommu->seq_id);
512 515
513 iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); 516 iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE);
514 if (!iommu->reg) { 517 if (!iommu->reg) {
@@ -751,6 +754,42 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
751} 754}
752 755
753/* 756/*
757 * Disable Queued Invalidation interface.
758 */
759void dmar_disable_qi(struct intel_iommu *iommu)
760{
761 unsigned long flags;
762 u32 sts;
763 cycles_t start_time = get_cycles();
764
765 if (!ecap_qis(iommu->ecap))
766 return;
767
768 spin_lock_irqsave(&iommu->register_lock, flags);
769
770 sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
771 if (!(sts & DMA_GSTS_QIES))
772 goto end;
773
774 /*
775	 * Give the hardware a chance to complete pending invalidation requests.
776 */
777 while ((readl(iommu->reg + DMAR_IQT_REG) !=
778 readl(iommu->reg + DMAR_IQH_REG)) &&
779 (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
780 cpu_relax();
781
782 iommu->gcmd &= ~DMA_GCMD_QIE;
783
784 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
785
786 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
787 !(sts & DMA_GSTS_QIES), sts);
788end:
789 spin_unlock_irqrestore(&iommu->register_lock, flags);
790}
791
792/*
754 * Enable Queued Invalidation interface. This is a must to support 793 * Enable Queued Invalidation interface. This is a must to support
755 * interrupt-remapping. Also used by DMA-remapping, which replaces 794 * interrupt-remapping. Also used by DMA-remapping, which replaces
756 * register based IOTLB invalidation. 795 * register based IOTLB invalidation.
@@ -770,20 +809,20 @@ int dmar_enable_qi(struct intel_iommu *iommu)
770 if (iommu->qi) 809 if (iommu->qi)
771 return 0; 810 return 0;
772 811
773 iommu->qi = kmalloc(sizeof(*qi), GFP_KERNEL); 812 iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
774 if (!iommu->qi) 813 if (!iommu->qi)
775 return -ENOMEM; 814 return -ENOMEM;
776 815
777 qi = iommu->qi; 816 qi = iommu->qi;
778 817
779 qi->desc = (void *)(get_zeroed_page(GFP_KERNEL)); 818 qi->desc = (void *)(get_zeroed_page(GFP_ATOMIC));
780 if (!qi->desc) { 819 if (!qi->desc) {
781 kfree(qi); 820 kfree(qi);
782 iommu->qi = 0; 821 iommu->qi = 0;
783 return -ENOMEM; 822 return -ENOMEM;
784 } 823 }
785 824
786 qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_KERNEL); 825 qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC);
787 if (!qi->desc_status) { 826 if (!qi->desc_status) {
788 free_page((unsigned long) qi->desc); 827 free_page((unsigned long) qi->desc);
789 kfree(qi); 828 kfree(qi);
@@ -812,3 +851,254 @@ int dmar_enable_qi(struct intel_iommu *iommu)
812 851
813 return 0; 852 return 0;
814} 853}
854
855/* iommu interrupt handling. Most of it is MSI-like. */
856
857enum faulttype {
858 DMA_REMAP,
859 INTR_REMAP,
860 UNKNOWN,
861};
862
863static const char *dma_remap_fault_reasons[] =
864{
865 "Software",
866 "Present bit in root entry is clear",
867 "Present bit in context entry is clear",
868 "Invalid context entry",
869 "Access beyond MGAW",
870 "PTE Write access is not set",
871 "PTE Read access is not set",
872 "Next page table ptr is invalid",
873 "Root table address invalid",
874 "Context table ptr is invalid",
875 "non-zero reserved fields in RTP",
876 "non-zero reserved fields in CTP",
877 "non-zero reserved fields in PTE",
878};
879
880static const char *intr_remap_fault_reasons[] =
881{
882 "Detected reserved fields in the decoded interrupt-remapped request",
883 "Interrupt index exceeded the interrupt-remapping table size",
884 "Present field in the IRTE entry is clear",
885 "Error accessing interrupt-remapping table pointed by IRTA_REG",
886 "Detected reserved fields in the IRTE entry",
887 "Blocked a compatibility format interrupt request",
888 "Blocked an interrupt request due to source-id verification failure",
889};
890
891#define MAX_FAULT_REASON_IDX 	(ARRAY_SIZE(dma_remap_fault_reasons) - 1)
892
893const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
894{
895	if (fault_reason >= 0x20 && (fault_reason < 0x20 +
896 ARRAY_SIZE(intr_remap_fault_reasons))) {
897 *fault_type = INTR_REMAP;
898 return intr_remap_fault_reasons[fault_reason - 0x20];
899 } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
900 *fault_type = DMA_REMAP;
901 return dma_remap_fault_reasons[fault_reason];
902 } else {
903 *fault_type = UNKNOWN;
904 return "Unknown";
905 }
906}
907
908void dmar_msi_unmask(unsigned int irq)
909{
910 struct intel_iommu *iommu = get_irq_data(irq);
911 unsigned long flag;
912
913 /* unmask it */
914 spin_lock_irqsave(&iommu->register_lock, flag);
915 writel(0, iommu->reg + DMAR_FECTL_REG);
916 /* Read a reg to force flush the post write */
917 readl(iommu->reg + DMAR_FECTL_REG);
918 spin_unlock_irqrestore(&iommu->register_lock, flag);
919}
920
921void dmar_msi_mask(unsigned int irq)
922{
923 unsigned long flag;
924 struct intel_iommu *iommu = get_irq_data(irq);
925
926 /* mask it */
927 spin_lock_irqsave(&iommu->register_lock, flag);
928 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
929 /* Read a reg to force flush the post write */
930 readl(iommu->reg + DMAR_FECTL_REG);
931 spin_unlock_irqrestore(&iommu->register_lock, flag);
932}
933
934void dmar_msi_write(int irq, struct msi_msg *msg)
935{
936 struct intel_iommu *iommu = get_irq_data(irq);
937 unsigned long flag;
938
939 spin_lock_irqsave(&iommu->register_lock, flag);
940 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
941 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
942 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
943 spin_unlock_irqrestore(&iommu->register_lock, flag);
944}
945
946void dmar_msi_read(int irq, struct msi_msg *msg)
947{
948 struct intel_iommu *iommu = get_irq_data(irq);
949 unsigned long flag;
950
951 spin_lock_irqsave(&iommu->register_lock, flag);
952 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
953 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
954 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
955 spin_unlock_irqrestore(&iommu->register_lock, flag);
956}
957
958static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
959 u8 fault_reason, u16 source_id, unsigned long long addr)
960{
961 const char *reason;
962 int fault_type;
963
964 reason = dmar_get_fault_reason(fault_reason, &fault_type);
965
966 if (fault_type == INTR_REMAP)
967		printk(KERN_ERR "INTR-REMAP: Request device [%02x:%02x.%d] "
968 "fault index %llx\n"
969 "INTR-REMAP:[fault reason %02d] %s\n",
970 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
971 PCI_FUNC(source_id & 0xFF), addr >> 48,
972 fault_reason, reason);
973 else
974 printk(KERN_ERR
975 "DMAR:[%s] Request device [%02x:%02x.%d] "
976 "fault addr %llx \n"
977 "DMAR:[fault reason %02d] %s\n",
978 (type ? "DMA Read" : "DMA Write"),
979 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
980 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
981 return 0;
982}
983
984#define PRIMARY_FAULT_REG_LEN (16)
985irqreturn_t dmar_fault(int irq, void *dev_id)
986{
987 struct intel_iommu *iommu = dev_id;
988 int reg, fault_index;
989 u32 fault_status;
990 unsigned long flag;
991
992 spin_lock_irqsave(&iommu->register_lock, flag);
993 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
994 if (fault_status)
995 printk(KERN_ERR "DRHD: handling fault status reg %x\n",
996 fault_status);
997
998 /* TBD: ignore advanced fault log currently */
999 if (!(fault_status & DMA_FSTS_PPF))
1000 goto clear_rest;
1001
1002 fault_index = dma_fsts_fault_record_index(fault_status);
1003 reg = cap_fault_reg_offset(iommu->cap);
1004 while (1) {
1005 u8 fault_reason;
1006 u16 source_id;
1007 u64 guest_addr;
1008 int type;
1009 u32 data;
1010
1011 /* highest 32 bits */
1012 data = readl(iommu->reg + reg +
1013 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1014 if (!(data & DMA_FRCD_F))
1015 break;
1016
1017 fault_reason = dma_frcd_fault_reason(data);
1018 type = dma_frcd_type(data);
1019
1020 data = readl(iommu->reg + reg +
1021 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1022 source_id = dma_frcd_source_id(data);
1023
1024 guest_addr = dmar_readq(iommu->reg + reg +
1025 fault_index * PRIMARY_FAULT_REG_LEN);
1026 guest_addr = dma_frcd_page_addr(guest_addr);
1027 /* clear the fault */
1028 writel(DMA_FRCD_F, iommu->reg + reg +
1029 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1030
1031 spin_unlock_irqrestore(&iommu->register_lock, flag);
1032
1033 dmar_fault_do_one(iommu, type, fault_reason,
1034 source_id, guest_addr);
1035
1036 fault_index++;
1037		if (fault_index >= cap_num_fault_regs(iommu->cap))
1038 fault_index = 0;
1039 spin_lock_irqsave(&iommu->register_lock, flag);
1040 }
1041clear_rest:
1042 /* clear all the other faults */
1043 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1044 writel(fault_status, iommu->reg + DMAR_FSTS_REG);
1045
1046 spin_unlock_irqrestore(&iommu->register_lock, flag);
1047 return IRQ_HANDLED;
1048}
1049
1050int dmar_set_interrupt(struct intel_iommu *iommu)
1051{
1052 int irq, ret;
1053
1054 /*
1055 * Check if the fault interrupt is already initialized.
1056 */
1057 if (iommu->irq)
1058 return 0;
1059
1060 irq = create_irq();
1061 if (!irq) {
1062 printk(KERN_ERR "IOMMU: no free vectors\n");
1063 return -EINVAL;
1064 }
1065
1066 set_irq_data(irq, iommu);
1067 iommu->irq = irq;
1068
1069 ret = arch_setup_dmar_msi(irq);
1070 if (ret) {
1071 set_irq_data(irq, NULL);
1072 iommu->irq = 0;
1073 destroy_irq(irq);
1074 return 0;
1075 }
1076
1077 ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu);
1078 if (ret)
1079 printk(KERN_ERR "IOMMU: can't request irq\n");
1080 return ret;
1081}
1082
1083int __init enable_drhd_fault_handling(void)
1084{
1085 struct dmar_drhd_unit *drhd;
1086
1087 /*
1088 * Enable fault control interrupt.
1089 */
1090 for_each_drhd_unit(drhd) {
1091 int ret;
1092 struct intel_iommu *iommu = drhd->iommu;
1093 ret = dmar_set_interrupt(iommu);
1094
1095 if (ret) {
1096			printk(KERN_ERR "DRHD %Lx: failed to enable fault "
1097			       "interrupt, ret %d\n",
1098 (unsigned long long)drhd->reg_base_addr, ret);
1099 return -1;
1100 }
1101 }
1102
1103 return 0;
1104}
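dmar_get_fault_reason() above splits the reason-code space in two: low codes index dma_remap_fault_reasons[], codes from 0x20 up index intr_remap_fault_reasons[]. A condensed sketch of that decode, using a strict upper bound so the index stays inside the interrupt-remapping table; decode_fault_reason() is an illustrative name.

static const char *decode_fault_reason(u8 reason, int *type)
{
	if (reason >= 0x20 &&
	    reason < 0x20 + ARRAY_SIZE(intr_remap_fault_reasons)) {
		*type = INTR_REMAP;
		return intr_remap_fault_reasons[reason - 0x20];
	}
	if (reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
		*type = DMA_REMAP;
		return dma_remap_fault_reasons[reason];
	}
	*type = UNKNOWN;
	return "Unknown";
}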
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index f3f686581a90..ef167b8b047d 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1004,194 +1004,6 @@ static int iommu_disable_translation(struct intel_iommu *iommu)
1004 return 0; 1004 return 0;
1005} 1005}
1006 1006
1007/* iommu interrupt handling. Most stuff are MSI-like. */
1008
1009static const char *fault_reason_strings[] =
1010{
1011 "Software",
1012 "Present bit in root entry is clear",
1013 "Present bit in context entry is clear",
1014 "Invalid context entry",
1015 "Access beyond MGAW",
1016 "PTE Write access is not set",
1017 "PTE Read access is not set",
1018 "Next page table ptr is invalid",
1019 "Root table address invalid",
1020 "Context table ptr is invalid",
1021 "non-zero reserved fields in RTP",
1022 "non-zero reserved fields in CTP",
1023 "non-zero reserved fields in PTE",
1024};
1025#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1026
1027const char *dmar_get_fault_reason(u8 fault_reason)
1028{
1029 if (fault_reason > MAX_FAULT_REASON_IDX)
1030 return "Unknown";
1031 else
1032 return fault_reason_strings[fault_reason];
1033}
1034
1035void dmar_msi_unmask(unsigned int irq)
1036{
1037 struct intel_iommu *iommu = get_irq_data(irq);
1038 unsigned long flag;
1039
1040 /* unmask it */
1041 spin_lock_irqsave(&iommu->register_lock, flag);
1042 writel(0, iommu->reg + DMAR_FECTL_REG);
1043 /* Read a reg to force flush the post write */
1044 readl(iommu->reg + DMAR_FECTL_REG);
1045 spin_unlock_irqrestore(&iommu->register_lock, flag);
1046}
1047
1048void dmar_msi_mask(unsigned int irq)
1049{
1050 unsigned long flag;
1051 struct intel_iommu *iommu = get_irq_data(irq);
1052
1053 /* mask it */
1054 spin_lock_irqsave(&iommu->register_lock, flag);
1055 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1056 /* Read a reg to force flush the post write */
1057 readl(iommu->reg + DMAR_FECTL_REG);
1058 spin_unlock_irqrestore(&iommu->register_lock, flag);
1059}
1060
1061void dmar_msi_write(int irq, struct msi_msg *msg)
1062{
1063 struct intel_iommu *iommu = get_irq_data(irq);
1064 unsigned long flag;
1065
1066 spin_lock_irqsave(&iommu->register_lock, flag);
1067 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1068 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1069 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1070 spin_unlock_irqrestore(&iommu->register_lock, flag);
1071}
1072
1073void dmar_msi_read(int irq, struct msi_msg *msg)
1074{
1075 struct intel_iommu *iommu = get_irq_data(irq);
1076 unsigned long flag;
1077
1078 spin_lock_irqsave(&iommu->register_lock, flag);
1079 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1080 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1081 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1082 spin_unlock_irqrestore(&iommu->register_lock, flag);
1083}
1084
1085static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1086 u8 fault_reason, u16 source_id, unsigned long long addr)
1087{
1088 const char *reason;
1089
1090 reason = dmar_get_fault_reason(fault_reason);
1091
1092 printk(KERN_ERR
1093 "DMAR:[%s] Request device [%02x:%02x.%d] "
1094 "fault addr %llx \n"
1095 "DMAR:[fault reason %02d] %s\n",
1096 (type ? "DMA Read" : "DMA Write"),
1097 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1098 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1099 return 0;
1100}
1101
1102#define PRIMARY_FAULT_REG_LEN (16)
1103static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1104{
1105 struct intel_iommu *iommu = dev_id;
1106 int reg, fault_index;
1107 u32 fault_status;
1108 unsigned long flag;
1109
1110 spin_lock_irqsave(&iommu->register_lock, flag);
1111 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1112
1113 /* TBD: ignore advanced fault log currently */
1114 if (!(fault_status & DMA_FSTS_PPF))
1115 goto clear_overflow;
1116
1117 fault_index = dma_fsts_fault_record_index(fault_status);
1118 reg = cap_fault_reg_offset(iommu->cap);
1119 while (1) {
1120 u8 fault_reason;
1121 u16 source_id;
1122 u64 guest_addr;
1123 int type;
1124 u32 data;
1125
1126 /* highest 32 bits */
1127 data = readl(iommu->reg + reg +
1128 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1129 if (!(data & DMA_FRCD_F))
1130 break;
1131
1132 fault_reason = dma_frcd_fault_reason(data);
1133 type = dma_frcd_type(data);
1134
1135 data = readl(iommu->reg + reg +
1136 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1137 source_id = dma_frcd_source_id(data);
1138
1139 guest_addr = dmar_readq(iommu->reg + reg +
1140 fault_index * PRIMARY_FAULT_REG_LEN);
1141 guest_addr = dma_frcd_page_addr(guest_addr);
1142 /* clear the fault */
1143 writel(DMA_FRCD_F, iommu->reg + reg +
1144 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1145
1146 spin_unlock_irqrestore(&iommu->register_lock, flag);
1147
1148 iommu_page_fault_do_one(iommu, type, fault_reason,
1149 source_id, guest_addr);
1150
1151 fault_index++;
1152 if (fault_index > cap_num_fault_regs(iommu->cap))
1153 fault_index = 0;
1154 spin_lock_irqsave(&iommu->register_lock, flag);
1155 }
1156clear_overflow:
1157 /* clear primary fault overflow */
1158 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1159 if (fault_status & DMA_FSTS_PFO)
1160 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1161
1162 spin_unlock_irqrestore(&iommu->register_lock, flag);
1163 return IRQ_HANDLED;
1164}
1165
1166int dmar_set_interrupt(struct intel_iommu *iommu)
1167{
1168 int irq, ret;
1169
1170 irq = create_irq();
1171 if (!irq) {
1172 printk(KERN_ERR "IOMMU: no free vectors\n");
1173 return -EINVAL;
1174 }
1175
1176 set_irq_data(irq, iommu);
1177 iommu->irq = irq;
1178
1179 ret = arch_setup_dmar_msi(irq);
1180 if (ret) {
1181 set_irq_data(irq, NULL);
1182 iommu->irq = 0;
1183 destroy_irq(irq);
1184 return 0;
1185 }
1186
1187 /* Force fault register is cleared */
1188 iommu_page_fault(irq, iommu);
1189
1190 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1191 if (ret)
1192 printk(KERN_ERR "IOMMU: can't request irq\n");
1193 return ret;
1194}
1195 1007
1196static int iommu_init_domains(struct intel_iommu *iommu) 1008static int iommu_init_domains(struct intel_iommu *iommu)
1197{ 1009{
@@ -1987,7 +1799,7 @@ static int __init init_dmars(void)
1987 struct dmar_rmrr_unit *rmrr; 1799 struct dmar_rmrr_unit *rmrr;
1988 struct pci_dev *pdev; 1800 struct pci_dev *pdev;
1989 struct intel_iommu *iommu; 1801 struct intel_iommu *iommu;
1990 int i, ret, unit = 0; 1802 int i, ret;
1991 1803
1992 /* 1804 /*
1993 * for each drhd 1805 * for each drhd
@@ -2043,11 +1855,40 @@ static int __init init_dmars(void)
2043 } 1855 }
2044 } 1856 }
2045 1857
1858 /*
1859 * Start from the sane iommu hardware state.
1860 */
1861 for_each_drhd_unit(drhd) {
1862 if (drhd->ignored)
1863 continue;
1864
1865 iommu = drhd->iommu;
1866
1867 /*
1868 * If the queued invalidation is already initialized by us
1869 * (for example, while enabling interrupt-remapping) then
1870 * we got the things already rolling from a sane state.
1871 */
1872 if (iommu->qi)
1873 continue;
1874
1875 /*
1876 * Clear any previous faults.
1877 */
1878 dmar_fault(-1, iommu);
1879 /*
1880 * Disable queued invalidation if supported and already enabled
1881 * before OS handover.
1882 */
1883 dmar_disable_qi(iommu);
1884 }
1885
2046 for_each_drhd_unit(drhd) { 1886 for_each_drhd_unit(drhd) {
2047 if (drhd->ignored) 1887 if (drhd->ignored)
2048 continue; 1888 continue;
2049 1889
2050 iommu = drhd->iommu; 1890 iommu = drhd->iommu;
1891
2051 if (dmar_enable_qi(iommu)) { 1892 if (dmar_enable_qi(iommu)) {
2052 /* 1893 /*
2053 * Queued Invalidate not enabled, use Register Based 1894 * Queued Invalidate not enabled, use Register Based
@@ -2109,7 +1950,6 @@ static int __init init_dmars(void)
2109 if (drhd->ignored) 1950 if (drhd->ignored)
2110 continue; 1951 continue;
2111 iommu = drhd->iommu; 1952 iommu = drhd->iommu;
2112 sprintf (iommu->name, "dmar%d", unit++);
2113 1953
2114 iommu_flush_write_buffer(iommu); 1954 iommu_flush_write_buffer(iommu);
2115 1955
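The new "sane state" pass added to init_dmars() above reduces to three steps per DRHD unit. A condensed sketch; quiesce_drhd() is an illustrative name, dmar_fault() and dmar_disable_qi() are the helpers added in dmar.c earlier in this patch.

static void quiesce_drhd(struct intel_iommu *iommu)
{
	/* QI already set up by us (e.g. for interrupt remapping): nothing to do */
	if (iommu->qi)
		return;
	/* ack and clear faults left behind by firmware or a kexec handover */
	dmar_fault(-1, iommu);
	/* turn off queued invalidation if the previous owner left it enabled */
	dmar_disable_qi(iommu);
}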
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 9d07a05d26f1..b041a409f4a7 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -117,21 +117,22 @@ int get_irte(int irq, struct irte *entry)
117{ 117{
118 int index; 118 int index;
119 struct irq_2_iommu *irq_iommu; 119 struct irq_2_iommu *irq_iommu;
120 unsigned long flags;
120 121
121 if (!entry) 122 if (!entry)
122 return -1; 123 return -1;
123 124
124 spin_lock(&irq_2_ir_lock); 125 spin_lock_irqsave(&irq_2_ir_lock, flags);
125 irq_iommu = valid_irq_2_iommu(irq); 126 irq_iommu = valid_irq_2_iommu(irq);
126 if (!irq_iommu) { 127 if (!irq_iommu) {
127 spin_unlock(&irq_2_ir_lock); 128 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
128 return -1; 129 return -1;
129 } 130 }
130 131
131 index = irq_iommu->irte_index + irq_iommu->sub_handle; 132 index = irq_iommu->irte_index + irq_iommu->sub_handle;
132 *entry = *(irq_iommu->iommu->ir_table->base + index); 133 *entry = *(irq_iommu->iommu->ir_table->base + index);
133 134
134 spin_unlock(&irq_2_ir_lock); 135 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
135 return 0; 136 return 0;
136} 137}
137 138
@@ -141,6 +142,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
141 struct irq_2_iommu *irq_iommu; 142 struct irq_2_iommu *irq_iommu;
142 u16 index, start_index; 143 u16 index, start_index;
143 unsigned int mask = 0; 144 unsigned int mask = 0;
145 unsigned long flags;
144 int i; 146 int i;
145 147
146 if (!count) 148 if (!count)
@@ -170,7 +172,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
170 return -1; 172 return -1;
171 } 173 }
172 174
173 spin_lock(&irq_2_ir_lock); 175 spin_lock_irqsave(&irq_2_ir_lock, flags);
174 do { 176 do {
175 for (i = index; i < index + count; i++) 177 for (i = index; i < index + count; i++)
176 if (table->base[i].present) 178 if (table->base[i].present)
@@ -182,7 +184,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
182 index = (index + count) % INTR_REMAP_TABLE_ENTRIES; 184 index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
183 185
184 if (index == start_index) { 186 if (index == start_index) {
185 spin_unlock(&irq_2_ir_lock); 187 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
186 printk(KERN_ERR "can't allocate an IRTE\n"); 188 printk(KERN_ERR "can't allocate an IRTE\n");
187 return -1; 189 return -1;
188 } 190 }
@@ -193,7 +195,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
193 195
194 irq_iommu = irq_2_iommu_alloc(irq); 196 irq_iommu = irq_2_iommu_alloc(irq);
195 if (!irq_iommu) { 197 if (!irq_iommu) {
196 spin_unlock(&irq_2_ir_lock); 198 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
197 printk(KERN_ERR "can't allocate irq_2_iommu\n"); 199 printk(KERN_ERR "can't allocate irq_2_iommu\n");
198 return -1; 200 return -1;
199 } 201 }
@@ -203,7 +205,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
203 irq_iommu->sub_handle = 0; 205 irq_iommu->sub_handle = 0;
204 irq_iommu->irte_mask = mask; 206 irq_iommu->irte_mask = mask;
205 207
206 spin_unlock(&irq_2_ir_lock); 208 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
207 209
208 return index; 210 return index;
209} 211}
@@ -223,30 +225,32 @@ int map_irq_to_irte_handle(int irq, u16 *sub_handle)
223{ 225{
224 int index; 226 int index;
225 struct irq_2_iommu *irq_iommu; 227 struct irq_2_iommu *irq_iommu;
228 unsigned long flags;
226 229
227 spin_lock(&irq_2_ir_lock); 230 spin_lock_irqsave(&irq_2_ir_lock, flags);
228 irq_iommu = valid_irq_2_iommu(irq); 231 irq_iommu = valid_irq_2_iommu(irq);
229 if (!irq_iommu) { 232 if (!irq_iommu) {
230 spin_unlock(&irq_2_ir_lock); 233 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
231 return -1; 234 return -1;
232 } 235 }
233 236
234 *sub_handle = irq_iommu->sub_handle; 237 *sub_handle = irq_iommu->sub_handle;
235 index = irq_iommu->irte_index; 238 index = irq_iommu->irte_index;
236 spin_unlock(&irq_2_ir_lock); 239 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
237 return index; 240 return index;
238} 241}
239 242
240int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) 243int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
241{ 244{
242 struct irq_2_iommu *irq_iommu; 245 struct irq_2_iommu *irq_iommu;
246 unsigned long flags;
243 247
244 spin_lock(&irq_2_ir_lock); 248 spin_lock_irqsave(&irq_2_ir_lock, flags);
245 249
246 irq_iommu = irq_2_iommu_alloc(irq); 250 irq_iommu = irq_2_iommu_alloc(irq);
247 251
248 if (!irq_iommu) { 252 if (!irq_iommu) {
249 spin_unlock(&irq_2_ir_lock); 253 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
250 printk(KERN_ERR "can't allocate irq_2_iommu\n"); 254 printk(KERN_ERR "can't allocate irq_2_iommu\n");
251 return -1; 255 return -1;
252 } 256 }
@@ -256,7 +260,7 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
256 irq_iommu->sub_handle = subhandle; 260 irq_iommu->sub_handle = subhandle;
257 irq_iommu->irte_mask = 0; 261 irq_iommu->irte_mask = 0;
258 262
259 spin_unlock(&irq_2_ir_lock); 263 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
260 264
261 return 0; 265 return 0;
262} 266}
@@ -264,11 +268,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
264int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index) 268int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
265{ 269{
266 struct irq_2_iommu *irq_iommu; 270 struct irq_2_iommu *irq_iommu;
271 unsigned long flags;
267 272
268 spin_lock(&irq_2_ir_lock); 273 spin_lock_irqsave(&irq_2_ir_lock, flags);
269 irq_iommu = valid_irq_2_iommu(irq); 274 irq_iommu = valid_irq_2_iommu(irq);
270 if (!irq_iommu) { 275 if (!irq_iommu) {
271 spin_unlock(&irq_2_ir_lock); 276 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
272 return -1; 277 return -1;
273 } 278 }
274 279
@@ -277,7 +282,7 @@ int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
277 irq_iommu->sub_handle = 0; 282 irq_iommu->sub_handle = 0;
278 irq_2_iommu(irq)->irte_mask = 0; 283 irq_2_iommu(irq)->irte_mask = 0;
279 284
280 spin_unlock(&irq_2_ir_lock); 285 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
281 286
282 return 0; 287 return 0;
283} 288}
@@ -289,11 +294,12 @@ int modify_irte(int irq, struct irte *irte_modified)
289 struct irte *irte; 294 struct irte *irte;
290 struct intel_iommu *iommu; 295 struct intel_iommu *iommu;
291 struct irq_2_iommu *irq_iommu; 296 struct irq_2_iommu *irq_iommu;
297 unsigned long flags;
292 298
293 spin_lock(&irq_2_ir_lock); 299 spin_lock_irqsave(&irq_2_ir_lock, flags);
294 irq_iommu = valid_irq_2_iommu(irq); 300 irq_iommu = valid_irq_2_iommu(irq);
295 if (!irq_iommu) { 301 if (!irq_iommu) {
296 spin_unlock(&irq_2_ir_lock); 302 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
297 return -1; 303 return -1;
298 } 304 }
299 305
@@ -302,11 +308,11 @@ int modify_irte(int irq, struct irte *irte_modified)
302 index = irq_iommu->irte_index + irq_iommu->sub_handle; 308 index = irq_iommu->irte_index + irq_iommu->sub_handle;
303 irte = &iommu->ir_table->base[index]; 309 irte = &iommu->ir_table->base[index];
304 310
305 set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1)); 311 set_64bit((unsigned long *)irte, irte_modified->low);
306 __iommu_flush_cache(iommu, irte, sizeof(*irte)); 312 __iommu_flush_cache(iommu, irte, sizeof(*irte));
307 313
308 rc = qi_flush_iec(iommu, index, 0); 314 rc = qi_flush_iec(iommu, index, 0);
309 spin_unlock(&irq_2_ir_lock); 315 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
310 316
311 return rc; 317 return rc;
312} 318}
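Beyond the locking, the one functional change in modify_irte() is that bit 1 is no longer OR-ed into the low word before it is written. In the VT-d IRTE layout bit 1 is FPD (fault processing disable), so on that reading the patch stops silently suppressing fault reporting for modified entries and writes exactly what the caller passed in:

	/* before: bit 1 forced on behind the caller's back */
	set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1));

	/* after: the caller's low word is written verbatim; fault processing
	 * stays enabled unless the caller sets the bit itself */
	set_64bit((unsigned long *)irte, irte_modified->low);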
@@ -317,11 +323,12 @@ int flush_irte(int irq)
317 int index; 323 int index;
318 struct intel_iommu *iommu; 324 struct intel_iommu *iommu;
319 struct irq_2_iommu *irq_iommu; 325 struct irq_2_iommu *irq_iommu;
326 unsigned long flags;
320 327
321 spin_lock(&irq_2_ir_lock); 328 spin_lock_irqsave(&irq_2_ir_lock, flags);
322 irq_iommu = valid_irq_2_iommu(irq); 329 irq_iommu = valid_irq_2_iommu(irq);
323 if (!irq_iommu) { 330 if (!irq_iommu) {
324 spin_unlock(&irq_2_ir_lock); 331 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
325 return -1; 332 return -1;
326 } 333 }
327 334
@@ -330,7 +337,7 @@ int flush_irte(int irq)
330 index = irq_iommu->irte_index + irq_iommu->sub_handle; 337 index = irq_iommu->irte_index + irq_iommu->sub_handle;
331 338
332 rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask); 339 rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
333 spin_unlock(&irq_2_ir_lock); 340 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
334 341
335 return rc; 342 return rc;
336} 343}
@@ -363,11 +370,12 @@ int free_irte(int irq)
363 struct irte *irte; 370 struct irte *irte;
364 struct intel_iommu *iommu; 371 struct intel_iommu *iommu;
365 struct irq_2_iommu *irq_iommu; 372 struct irq_2_iommu *irq_iommu;
373 unsigned long flags;
366 374
367 spin_lock(&irq_2_ir_lock); 375 spin_lock_irqsave(&irq_2_ir_lock, flags);
368 irq_iommu = valid_irq_2_iommu(irq); 376 irq_iommu = valid_irq_2_iommu(irq);
369 if (!irq_iommu) { 377 if (!irq_iommu) {
370 spin_unlock(&irq_2_ir_lock); 378 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
371 return -1; 379 return -1;
372 } 380 }
373 381
@@ -378,7 +386,7 @@ int free_irte(int irq)
378 386
379 if (!irq_iommu->sub_handle) { 387 if (!irq_iommu->sub_handle) {
380 for (i = 0; i < (1 << irq_iommu->irte_mask); i++) 388 for (i = 0; i < (1 << irq_iommu->irte_mask); i++)
381 set_64bit((unsigned long *)irte, 0); 389 set_64bit((unsigned long *)(irte + i), 0);
382 rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask); 390 rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
383 } 391 }
384 392
@@ -387,7 +395,7 @@ int free_irte(int irq)
387 irq_iommu->sub_handle = 0; 395 irq_iommu->sub_handle = 0;
388 irq_iommu->irte_mask = 0; 396 irq_iommu->irte_mask = 0;
389 397
390 spin_unlock(&irq_2_ir_lock); 398 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
391 399
392 return rc; 400 return rc;
393} 401}
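The other substantive fix in free_irte() is the loop that clears the table entries: the old body zeroed the same first IRTE (1 << irte_mask) times, so in a multi-entry allocation everything past the first entry kept stale contents after the irq was freed. Assuming irte_mask holds the order of the block (as the 1 << irte_mask bound suggests), a four-entry allocation illustrates the difference:

	/* irte_mask == 2: four consecutive IRTEs starting at base[index] */
	for (i = 0; i < (1 << irq_iommu->irte_mask); i++)
		set_64bit((unsigned long *)(irte + i), 0);	/* clears index..index+3 */
	/* the old body, set_64bit((unsigned long *)irte, 0),
	 * only ever touched base[index] */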
@@ -439,12 +447,12 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
439 struct page *pages; 447 struct page *pages;
440 448
441 ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table), 449 ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
442 GFP_KERNEL); 450 GFP_ATOMIC);
443 451
444 if (!iommu->ir_table) 452 if (!iommu->ir_table)
445 return -ENOMEM; 453 return -ENOMEM;
446 454
447 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, INTR_REMAP_PAGE_ORDER); 455 pages = alloc_pages(GFP_ATOMIC | __GFP_ZERO, INTR_REMAP_PAGE_ORDER);
448 456
449 if (!pages) { 457 if (!pages) {
450 printk(KERN_ERR "failed to allocate pages of order %d\n", 458 printk(KERN_ERR "failed to allocate pages of order %d\n",
@@ -459,11 +467,55 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
459 return 0; 467 return 0;
460} 468}
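Both allocations in setup_intr_remapping() switch from GFP_KERNEL to GFP_ATOMIC, presumably because this path is now reached with interrupts disabled during early bring-up and therefore must not sleep. GFP_ATOMIC allocations fail rather than block under memory pressure, so the existing NULL checks carry the error handling; the shape of the calls is simply:

	ir_table = kzalloc(sizeof(struct ir_table), GFP_ATOMIC);
	if (!ir_table)
		return -ENOMEM;

	pages = alloc_pages(GFP_ATOMIC | __GFP_ZERO, INTR_REMAP_PAGE_ORDER);
	if (!pages)
		return -ENOMEM;	/* setup_intr_remapping() also logs the failed order */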
461 469
470/*
471 * Disable Interrupt Remapping.
472 */
473static void disable_intr_remapping(struct intel_iommu *iommu)
474{
475 unsigned long flags;
476 u32 sts;
477
478 if (!ecap_ir_support(iommu->ecap))
479 return;
480
481 spin_lock_irqsave(&iommu->register_lock, flags);
482
483 sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
484 if (!(sts & DMA_GSTS_IRES))
485 goto end;
486
487 iommu->gcmd &= ~DMA_GCMD_IRE;
488 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
489
490 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
491 readl, !(sts & DMA_GSTS_IRES), sts);
492
493end:
494 spin_unlock_irqrestore(&iommu->register_lock, flags);
495}
496
462int __init enable_intr_remapping(int eim) 497int __init enable_intr_remapping(int eim)
463{ 498{
464 struct dmar_drhd_unit *drhd; 499 struct dmar_drhd_unit *drhd;
465 int setup = 0; 500 int setup = 0;
466 501
502 for_each_drhd_unit(drhd) {
503 struct intel_iommu *iommu = drhd->iommu;
504
505 /*
506 * Clear previous faults.
507 */
508 dmar_fault(-1, iommu);
509
510 /*
511 * Disable intr remapping and queued invalidation, if already
512 * enabled prior to OS handover.
513 */
514 disable_intr_remapping(iommu);
515
516 dmar_disable_qi(iommu);
517 }
518
467 /* 519 /*
468 * check for the Interrupt-remapping support 520 * check for the Interrupt-remapping support
469 */ 521 */
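The loop added at the top of enable_intr_remapping() makes the bring-up safe across a handover (for example kexec/kdump, or firmware that left the units programmed): faults already latched are drained via dmar_fault() — now declared unconditionally in dmar.h, which is why linux/irqreturn.h gets included there — and any interrupt remapping or queued invalidation left enabled is switched off before the kernel programs its own tables. Annotated restatement of the sequence:

	for_each_drhd_unit(drhd) {
		struct intel_iommu *iommu = drhd->iommu;

		dmar_fault(-1, iommu);		/* report/clear faults latched before handover */
		disable_intr_remapping(iommu);	/* drop DMA_GCMD_IRE, wait for IRES to clear */
		dmar_disable_qi(iommu);		/* stop any queued invalidation still running */
	}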
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index f28440784cf0..2f3427468956 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -24,10 +24,10 @@
24#include <linux/acpi.h> 24#include <linux/acpi.h>
25#include <linux/types.h> 25#include <linux/types.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <linux/irqreturn.h>
27 28
28#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
29struct intel_iommu; 29struct intel_iommu;
30 30#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
31struct dmar_drhd_unit { 31struct dmar_drhd_unit {
32 struct list_head list; /* list of drhd units */ 32 struct list_head list; /* list of drhd units */
33 struct acpi_dmar_header *hdr; /* ACPI header */ 33 struct acpi_dmar_header *hdr; /* ACPI header */
@@ -49,7 +49,7 @@ extern int dmar_dev_scope_init(void);
49 49
50/* Intel IOMMU detection */ 50/* Intel IOMMU detection */
51extern void detect_intel_iommu(void); 51extern void detect_intel_iommu(void);
52 52extern int enable_drhd_fault_handling(void);
53 53
54extern int parse_ioapics_under_ir(void); 54extern int parse_ioapics_under_ir(void);
55extern int alloc_iommu(struct dmar_drhd_unit *); 55extern int alloc_iommu(struct dmar_drhd_unit *);
@@ -63,12 +63,12 @@ static inline int dmar_table_init(void)
63{ 63{
64 return -ENODEV; 64 return -ENODEV;
65} 65}
66static inline int enable_drhd_fault_handling(void)
67{
68 return -1;
69}
66#endif /* !CONFIG_DMAR && !CONFIG_INTR_REMAP */ 70#endif /* !CONFIG_DMAR && !CONFIG_INTR_REMAP */
67 71
68#ifdef CONFIG_INTR_REMAP
69extern int intr_remapping_enabled;
70extern int enable_intr_remapping(int);
71
72struct irte { 72struct irte {
73 union { 73 union {
74 struct { 74 struct {
@@ -97,6 +97,10 @@ struct irte {
97 __u64 high; 97 __u64 high;
98 }; 98 };
99}; 99};
100#ifdef CONFIG_INTR_REMAP
101extern int intr_remapping_enabled;
102extern int enable_intr_remapping(int);
103
100extern int get_irte(int irq, struct irte *entry); 104extern int get_irte(int irq, struct irte *entry);
101extern int modify_irte(int irq, struct irte *irte_modified); 105extern int modify_irte(int irq, struct irte *irte_modified);
102extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count); 106extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
@@ -111,14 +115,40 @@ extern int irq_remapped(int irq);
111extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); 115extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
112extern struct intel_iommu *map_ioapic_to_ir(int apic); 116extern struct intel_iommu *map_ioapic_to_ir(int apic);
113#else 117#else
118static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
119{
120 return -1;
121}
122static inline int modify_irte(int irq, struct irte *irte_modified)
123{
124 return -1;
125}
126static inline int free_irte(int irq)
127{
128 return -1;
129}
130static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle)
131{
132 return -1;
133}
134static inline int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
135 u16 sub_handle)
136{
137 return -1;
138}
139static inline struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
140{
141 return NULL;
142}
143static inline struct intel_iommu *map_ioapic_to_ir(int apic)
144{
145 return NULL;
146}
114#define irq_remapped(irq) (0) 147#define irq_remapped(irq) (0)
115#define enable_intr_remapping(mode) (-1) 148#define enable_intr_remapping(mode) (-1)
116#define intr_remapping_enabled (0) 149#define intr_remapping_enabled (0)
117#endif 150#endif
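The header is also reorganised so that struct irte is defined ahead of the CONFIG_INTR_REMAP conditional, and the #else branch now provides static inline stubs returning -1 or NULL instead of nothing. Callers such as the IO-APIC and MSI setup code can therefore call these helpers unconditionally and let the compiler discard the dead branches when interrupt remapping is not configured. A hypothetical caller sketch, following the "negative means unavailable or failed" convention the stubs establish:

	int index;

	/* iommu and irq as already set up by the caller */
	index = alloc_irte(iommu, irq, 1);
	if (index < 0) {
		/* remapping not configured or no free IRTE:
		 * fall back to the non-remapped setup path */
	}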
118 151
119#ifdef CONFIG_DMAR
120extern const char *dmar_get_fault_reason(u8 fault_reason);
121
122/* Can't use the common MSI interrupt functions 152/* Can't use the common MSI interrupt functions
123 * since DMAR is not a pci device 153 * since DMAR is not a pci device
124 */ 154 */
@@ -127,8 +157,10 @@ extern void dmar_msi_mask(unsigned int irq);
127extern void dmar_msi_read(int irq, struct msi_msg *msg); 157extern void dmar_msi_read(int irq, struct msi_msg *msg);
128extern void dmar_msi_write(int irq, struct msi_msg *msg); 158extern void dmar_msi_write(int irq, struct msi_msg *msg);
129extern int dmar_set_interrupt(struct intel_iommu *iommu); 159extern int dmar_set_interrupt(struct intel_iommu *iommu);
160extern irqreturn_t dmar_fault(int irq, void *dev_id);
130extern int arch_setup_dmar_msi(unsigned int irq); 161extern int arch_setup_dmar_msi(unsigned int irq);
131 162
163#ifdef CONFIG_DMAR
132extern int iommu_detected, no_iommu; 164extern int iommu_detected, no_iommu;
133extern struct list_head dmar_rmrr_units; 165extern struct list_head dmar_rmrr_units;
134struct dmar_rmrr_unit { 166struct dmar_rmrr_unit {
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d2e3cbfba14f..78c1262e8704 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -292,6 +292,8 @@ struct intel_iommu {
292 spinlock_t register_lock; /* protect register handling */ 292 spinlock_t register_lock; /* protect register handling */
293 int seq_id; /* sequence id of the iommu */ 293 int seq_id; /* sequence id of the iommu */
294 int agaw; /* agaw of this iommu */ 294 int agaw; /* agaw of this iommu */
295 unsigned int irq;
296 unsigned char name[13]; /* Device Name */
295 297
296#ifdef CONFIG_DMAR 298#ifdef CONFIG_DMAR
297 unsigned long *domain_ids; /* bitmap of domains */ 299 unsigned long *domain_ids; /* bitmap of domains */
@@ -299,8 +301,6 @@ struct intel_iommu {
299 spinlock_t lock; /* protect context, domain ids */ 301 spinlock_t lock; /* protect context, domain ids */
300 struct root_entry *root_entry; /* virtual address */ 302 struct root_entry *root_entry; /* virtual address */
301 303
302 unsigned int irq;
303 unsigned char name[7]; /* Device Name */
304 struct iommu_flush flush; 304 struct iommu_flush flush;
305#endif 305#endif
306 struct q_inval *qi; /* Queued invalidation info */ 306 struct q_inval *qi; /* Queued invalidation info */
@@ -321,6 +321,7 @@ extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
321extern int alloc_iommu(struct dmar_drhd_unit *drhd); 321extern int alloc_iommu(struct dmar_drhd_unit *drhd);
322extern void free_iommu(struct intel_iommu *iommu); 322extern void free_iommu(struct intel_iommu *iommu);
323extern int dmar_enable_qi(struct intel_iommu *iommu); 323extern int dmar_enable_qi(struct intel_iommu *iommu);
324extern void dmar_disable_qi(struct intel_iommu *iommu);
324extern void qi_global_iec(struct intel_iommu *iommu); 325extern void qi_global_iec(struct intel_iommu *iommu);
325 326
326extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, 327extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
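In intel-iommu.h the irq and name fields move out of the CONFIG_DMAR block so that a CONFIG_INTR_REMAP-only build can still wire up the DMAR fault interrupt (dmar_set_interrupt() and the dmar_msi helpers operate on iommu->irq) and print a per-unit name; dmar_disable_qi() is exported for the handover path above. The name buffer also grows from 7 to 13 bytes, presumably to leave headroom for larger sequence ids. A hypothetical sketch of how the field might be filled (the actual naming code lives in dmar.c and is not part of this hunk):

	snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id);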
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 065cdf8c09fb..b1ea37fc7a24 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -104,6 +104,7 @@ extern unsigned int kobjsize(const void *objp);
104#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ 104#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */
105#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ 105#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
106#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ 106#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
107#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
107 108
108#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ 109#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
109#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS 110#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -145,7 +146,7 @@ extern pgprot_t protection_map[16];
145 */ 146 */
146static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) 147static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
147{ 148{
148 return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); 149 return (vma->vm_flags & VM_PFN_AT_MMAP);
149} 150}
150 151
151static inline int is_pfn_mapping(struct vm_area_struct *vma) 152static inline int is_pfn_mapping(struct vm_area_struct *vma)
diff --git a/mm/memory.c b/mm/memory.c
index baa999e87cd2..2032ad2fc34b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1665,9 +1665,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1665 * behaviour that some programs depend on. We mark the "original" 1665 * behaviour that some programs depend on. We mark the "original"
1666 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 1666 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1667 */ 1667 */
1668 if (addr == vma->vm_start && end == vma->vm_end) 1668 if (addr == vma->vm_start && end == vma->vm_end) {
1669 vma->vm_pgoff = pfn; 1669 vma->vm_pgoff = pfn;
1670 else if (is_cow_mapping(vma->vm_flags)) 1670 vma->vm_flags |= VM_PFN_AT_MMAP;
1671 } else if (is_cow_mapping(vma->vm_flags))
1671 return -EINVAL; 1672 return -EINVAL;
1672 1673
1673 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1674 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
@@ -1679,6 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1679 * needed from higher level routine calling unmap_vmas 1680 * needed from higher level routine calling unmap_vmas
1680 */ 1681 */
1681 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); 1682 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1683 vma->vm_flags &= ~VM_PFN_AT_MMAP;
1682 return -EINVAL; 1684 return -EINVAL;
1683 } 1685 }
1684 1686
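Taken together, the mm changes replace the old heuristic "PFNMAP vma with a non-zero vm_pgoff" with an explicit VM_PFN_AT_MMAP flag: remap_pfn_range() sets it only when the whole vma is remapped at mmap time (and clears it again on the error path), and is_linear_pfn_mapping() tests the flag directly. That avoids misclassifying a full mapping that happens to start at pfn 0 (vm_pgoff would stay 0) and removes the reliance on vm_pgoff, which can be non-zero for unrelated reasons. A hypothetical driver ->mmap handler that ends up with the flag set (mydrv_base_pfn is an assumed driver-private base frame number):

	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;

		/* whole-vma remap at mmap time => VM_PFNMAP | VM_PFN_AT_MMAP */
		return remap_pfn_range(vma, vma->vm_start, mydrv_base_pfn,
				       size, vma->vm_page_prot);
	}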