Diffstat (limited to 'arch')

 arch/x86/Kconfig                     | 108
 arch/x86/Kconfig.debug               |   2
 arch/x86/Makefile                    |  19
 arch/x86/boot/compressed/misc.c      |   4
 arch/x86/boot/memory.c               |   3
 arch/x86/kernel/Makefile             |   4
 arch/x86/kernel/acpi/boot.c          | 332
 arch/x86/kernel/apic_32.c            |  12
 arch/x86/kernel/apic_64.c            |   6
 arch/x86/kernel/cpu/mtrr/generic.c   |  38
 arch/x86/kernel/cpu/mtrr/main.c      | 901
 arch/x86/kernel/cpu/mtrr/mtrr.h      |   3
 arch/x86/kernel/e820.c               | 896
 arch/x86/kernel/e820_32.c            | 536
 arch/x86/kernel/e820_64.c            | 787
 arch/x86/kernel/efi.c                |  59
 arch/x86/kernel/efi_64.c             |   8
 arch/x86/kernel/genapic_64.c         |   2
 arch/x86/kernel/head.c               |  73
 arch/x86/kernel/head32.c             |  27
 arch/x86/kernel/head64.c             |  68
 arch/x86/kernel/head_32.S            |   6
 arch/x86/kernel/io_apic_32.c         | 127
 arch/x86/kernel/io_apic_64.c         |  82
 arch/x86/kernel/mpparse.c            | 803
 arch/x86/kernel/numaq_32.c           |  25
 arch/x86/kernel/setup.c              |  23
 arch/x86/kernel/setup_32.c           | 269
 arch/x86/kernel/setup_64.c           |  42
 arch/x86/kernel/smpboot.c            |  17
 arch/x86/kernel/srat_32.c            |  43
 arch/x86/kernel/summit_32.c          |   2
 arch/x86/kernel/trampoline.c         |   2
 arch/x86/lguest/boot.c               |   5
 arch/x86/mach-default/setup.c        |   7
 arch/x86/mach-es7000/Makefile        |   1
 arch/x86/mach-es7000/es7000plat.c    |  49
 arch/x86/mach-generic/Makefile       |  10
 arch/x86/mach-generic/bigsmp.c       |   4
 arch/x86/mach-generic/numaq.c        |  41
 arch/x86/mach-generic/probe.c        |  15
 arch/x86/mach-visws/mpparse.c        |   7
 arch/x86/mach-voyager/setup.c        |   7
 arch/x86/mach-voyager/voyager_smp.c  |  14
 arch/x86/mm/discontig_32.c           | 156
 arch/x86/mm/init_32.c                |   3
 arch/x86/mm/numa_64.c                |   2
 arch/x86/pci/Makefile_32             |   5
 arch/x86/pci/k8-bus_64.c             |   4
 arch/x86/pci/numa.c                  |  29
 arch/x86/xen/enlighten.c             |   3
 51 files changed, 3394 insertions(+), 2297 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bf07b6f50fa1..07276ac01c20 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -261,36 +261,6 @@ config X86_VOYAGER
 	  If you do not specifically know you have a Voyager based machine,
 	  say N here, otherwise the kernel you build will not be bootable.
 
-config X86_NUMAQ
-	bool "NUMAQ (IBM/Sequent)"
-	depends on SMP && X86_32
-	select NUMA
-	help
-	  This option is used for getting Linux to run on a (IBM/Sequent) NUMA
-	  multiquad box. This changes the way that processors are bootstrapped,
-	  and uses Clustered Logical APIC addressing mode instead of Flat Logical.
-	  You will need a new lynxer.elf file to flash your firmware with - send
-	  email to <Martin.Bligh@us.ibm.com>.
-
-config X86_SUMMIT
-	bool "Summit/EXA (IBM x440)"
-	depends on X86_32 && SMP
-	help
-	  This option is needed for IBM systems that use the Summit/EXA chipset.
-	  In particular, it is needed for the x440.
-
-	  If you don't have one of these computers, you should say N here.
-	  If you want to build a NUMA kernel, you must select ACPI.
-
-config X86_BIGSMP
-	bool "Support for other sub-arch SMP systems with more than 8 CPUs"
-	depends on X86_32 && SMP
-	help
-	  This option is needed for the systems that have more than 8 CPUs
-	  and if the system is not of any sub-arch type above.
-
-	  If you don't have such a system, you should say N here.
-
 config X86_VISWS
 	bool "SGI 320/540 (Visual Workstation)"
 	depends on X86_32
@@ -304,12 +274,33 @@ config X86_VISWS
 	  and vice versa. See <file:Documentation/sgi-visws.txt> for details.
 
 config X86_GENERICARCH
-	bool "Generic architecture (Summit, bigsmp, ES7000, default)"
+	bool "Generic architecture"
 	depends on X86_32
 	help
-	  This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
-	  It is intended for a generic binary kernel.
-	  If you want a NUMA kernel, select ACPI. We need SRAT for NUMA.
+	  This option compiles in the NUMAQ, Summit, bigsmp, ES7000 and
+	  default subarchitectures. It is intended for a generic binary
+	  kernel. If you select them all, the kernel will probe them one
+	  by one and will fall back to the default.
+
+if X86_GENERICARCH
+
+config X86_NUMAQ
+	bool "NUMAQ (IBM/Sequent)"
+	depends on SMP && X86_32
+	select NUMA
+	help
+	  This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
+	  NUMA multiquad box. This changes the way that processors are
+	  bootstrapped, and uses Clustered Logical APIC addressing mode instead
+	  of Flat Logical. You will need a new lynxer.elf file to flash your
+	  firmware with - send email to <Martin.Bligh@us.ibm.com>.
+
+config X86_SUMMIT
+	bool "Summit/EXA (IBM x440)"
+	depends on X86_32 && SMP
+	help
+	  This option is needed for IBM systems that use the Summit/EXA chipset.
+	  In particular, it is needed for the x440.
 
 config X86_ES7000
 	bool "Support for Unisys ES7000 IA32 series"
@@ -317,8 +308,15 @@ config X86_ES7000
 	help
 	  Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
 	  supposed to run on an IA32-based Unisys ES7000 system.
-	  Only choose this option if you have such a system, otherwise you
-	  should say N here.
+
+config X86_BIGSMP
+	bool "Support for big SMP systems with more than 8 CPUs"
+	depends on X86_32 && SMP
+	help
+	  This option is needed for the systems that have more than 8 CPUs
+	  and if the system is not of any sub-arch type above.
+
+endif
 
 config X86_RDC321X
 	bool "RDC R-321x SoC"
@@ -911,9 +909,9 @@ config X86_PAE
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
 	depends on SMP
-	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL)
+	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || X86_SUMMIT && ACPI) && EXPERIMENTAL)
 	default n if X86_PC
-	default y if (X86_NUMAQ || X86_SUMMIT)
+	default y if (X86_NUMAQ || X86_SUMMIT || X86_GENERICARCH)
 	help
 	  Enable NUMA (Non Uniform Memory Access) support.
 	  The kernel will try to allocate memory used by a CPU on the
@@ -1090,6 +1088,40 @@ config MTRR
 
 	  See <file:Documentation/mtrr.txt> for more information.
 
+config MTRR_SANITIZER
+	def_bool y
+	prompt "MTRR cleanup support"
+	depends on MTRR
+	help
+	  Convert the MTRR layout from continuous to discrete, so that the
+	  X driver can add writeback (WB) entries.
+
+	  Say N here if you see bootup problems (boot crash, boot hang,
+	  spontaneous reboots).
+
+	  The cleanup can be disabled at boot time with disable_mtrr_cleanup.
+	  The mtrr_chunk_size parameter sets the largest MTRR entry size for
+	  a continuous block that may contain holes (i.e. UC entries).
+
+	  If unsure, say Y.
+
+config MTRR_SANITIZER_ENABLE_DEFAULT
+	int "MTRR cleanup enable value (0-1)"
+	range 0 1
+	default "0"
+	depends on MTRR_SANITIZER
+	help
+	  Default value for the MTRR cleanup enable flag.
+
+config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
+	int "MTRR cleanup spare reg num (0-7)"
+	range 0 7
+	default "1"
+	depends on MTRR_SANITIZER
+	help
+	  Default number of spare MTRR entries to leave free after cleanup;
+	  it can be changed at boot time via mtrr_spare_reg_nr=N.
+
 config X86_PAT
 	bool
 	prompt "x86 PAT support"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 18363374d51a..253e7a5706d3 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -131,7 +131,7 @@ config 4KSTACKS
 
 config X86_FIND_SMP_CONFIG
 	def_bool y
-	depends on X86_LOCAL_APIC || X86_VOYAGER
+	depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS
 	depends on X86_32
 
 config X86_MPPARSE
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3cff3c894cf3..d6650131659e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -117,29 +117,11 @@ mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
 mflags-$(CONFIG_X86_VISWS)	:= -Iinclude/asm-x86/mach-visws
 mcore-$(CONFIG_X86_VISWS)	:= arch/x86/mach-visws/
 
-# NUMAQ subarch support
-mflags-$(CONFIG_X86_NUMAQ)	:= -Iinclude/asm-x86/mach-numaq
-mcore-$(CONFIG_X86_NUMAQ)	:= arch/x86/mach-default/
-
-# BIGSMP subarch support
-mflags-$(CONFIG_X86_BIGSMP)	:= -Iinclude/asm-x86/mach-bigsmp
-mcore-$(CONFIG_X86_BIGSMP)	:= arch/x86/mach-default/
-
-#Summit subarch support
-mflags-$(CONFIG_X86_SUMMIT)	:= -Iinclude/asm-x86/mach-summit
-mcore-$(CONFIG_X86_SUMMIT)	:= arch/x86/mach-default/
-
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
 fcore-$(CONFIG_X86_GENERICARCH)	+= arch/x86/mach-generic/
 mcore-$(CONFIG_X86_GENERICARCH)	:= arch/x86/mach-default/
 
-
-# ES7000 subarch support
-mflags-$(CONFIG_X86_ES7000)	:= -Iinclude/asm-x86/mach-es7000
-fcore-$(CONFIG_X86_ES7000)	:= arch/x86/mach-es7000/
-mcore-$(CONFIG_X86_ES7000)	:= arch/x86/mach-default/
-
 # RDC R-321x subarch support
 mflags-$(CONFIG_X86_RDC321X)	:= -Iinclude/asm-x86/mach-rdc321x
 mcore-$(CONFIG_X86_RDC321X)	:= arch/x86/mach-default/
@@ -160,6 +142,7 @@ KBUILD_AFLAGS += $(mflags-y)
 
 head-y := arch/x86/kernel/head_$(BITS).o
 head-y += arch/x86/kernel/head$(BITS).o
+head-y += arch/x86/kernel/head.o
 head-y += arch/x86/kernel/init_task.o
 
 libs-y  += arch/x86/lib/
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 90456cee47c3..ba0be6a25ff7 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -221,10 +221,6 @@ static char *vidmem;
 static int vidport;
 static int lines, cols;
 
-#ifdef CONFIG_X86_NUMAQ
-void *xquad_portio;
-#endif
-
 #include "../../../../lib/inflate.c"
 
 static void *malloc(int size)
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index acad32eb4290..53165c97336b 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -13,6 +13,7 @@
  */
 
 #include "boot.h"
+#include <linux/kernel.h>
 
 #define SMAP	0x534d4150	/* ASCII "SMAP" */
 
@@ -53,7 +54,7 @@ static int detect_memory_e820(void)
 
 		count++;
 		desc++;
-	} while (next && count < E820MAX);
+	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
 }
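
Aside, not part of the patch: ARRAY_SIZE() from <linux/kernel.h> (newly included above) derives the element count of a statically sized array at compile time, so the loop bound now tracks boot_params.e820_map itself instead of a separately maintained E820MAX constant. A minimal standalone C sketch of the idiom, with a made-up 128-entry map:

#include <stdio.h>

/* simplified form of the kernel's ARRAY_SIZE(); the kernel version also
 * rejects plain pointers at compile time */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

struct e820entry { unsigned long long addr, size; unsigned type; };

int main(void)
{
	struct e820entry map[128];	/* stand-in for boot_params.e820_map */

	/* evaluates to 128 at compile time; nothing to keep in sync */
	printf("%zu entries\n", ARRAY_SIZE(map));
	return 0;
}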
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 77807d4769c9..dc3c636d113e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
@@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
-obj-y			+= bootflag.o e820_$(BITS).o
+obj-y			+= bootflag.o e820_$(BITS).o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 33c5216fd3e1..caf4ed7ca069 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -338,8 +338,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 
 #ifdef CONFIG_X86_IO_APIC
 
-struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
@@ -860,6 +858,336 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #endif				/* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS		0
+
+#ifdef CONFIG_X86_ES7000
+extern int es7000_plat;
+#endif
+
+static struct {
+	int apic_id;
+	int gsi_base;
+	int gsi_end;
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+	int i = 0;
+
+	/* Find the IOAPIC that manages this GSI. */
+	for (i = 0; i < nr_ioapics; i++) {
+		if ((gsi >= mp_ioapic_routing[i].gsi_base)
+		    && (gsi <= mp_ioapic_routing[i].gsi_end))
+			return i;
+	}
+
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return io_apic_get_unique_id(nr_ioapics, id);
+	else
+		return id;
+#else
+	int i;
+	DECLARE_BITMAP(used, 256);
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mp_config_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->mp_apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);
+#endif
+}
+
+static int bad_ioapic(unsigned long address)
+{
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+	}
+	if (!address) {
+		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+		       " found in table, skipping!\n");
+		return 1;
+	}
+	return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+	int idx = 0;
+
+	if (bad_ioapic(address))
+		return;
+
+	idx = nr_ioapics;
+
+	mp_ioapics[idx].mp_type = MP_IOAPIC;
+	mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].mp_apicaddr = address;
+
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+	mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+#else
+	mp_ioapics[idx].mp_apicver = 0;
+#endif
+	/*
+	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+	 */
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+	mp_ioapic_routing[idx].gsi_base = gsi_base;
+	mp_ioapic_routing[idx].gsi_end = gsi_base +
+	    io_apic_get_redir_entries(idx);
+
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
+	       mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+	nr_ioapics++;
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+	int ioapic = -1;
+	int pin = -1;
+
+	/*
+	 * Convert 'gsi' to 'ioapic.pin'.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return;
+	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	/*
+	 * TBD: This check is for faulty timer entries, where the override
+	 *      erroneously sets the trigger to level, resulting in a HUGE
+	 *      increase of timer interrupts!
+	 */
+	if ((bus_irq == 0) && (trigger == 3))
+		trigger = 1;
+
+	mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
+	mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
+	mp_irqs[mp_irq_entries].mp_irqflag = (trigger << 2) | polarity;
+	mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
+	mp_irqs[mp_irq_entries].mp_srcbusirq = bus_irq;	/* IRQ */
+	mp_irqs[mp_irq_entries].mp_dstapic =
+			mp_ioapics[ioapic].mp_apicid;	/* APIC ID */
+	mp_irqs[mp_irq_entries].mp_dstirq = pin;	/* INTIN# */
+
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!!\n");
+
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+	int i = 0;
+	int ioapic = -1;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+	/*
+	 * Fabricate the legacy ISA bus (bus #31).
+	 */
+	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+	set_bit(MP_ISA_BUS, mp_bus_not_pci);
+	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#ifdef CONFIG_X86_ES7000
+	/*
+	 * Older generations of ES7000 have no legacy identity mappings
+	 */
+	if (es7000_plat == 1)
+		return;
+#endif
+
+	/*
+	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
+	 */
+	ioapic = mp_find_ioapic(0);
+	if (ioapic < 0)
+		return;
+
+	/*
+	 * Use the default configuration for the IRQs 0-15, unless
+	 * overridden by (MADT) interrupt source override entries.
+	 */
+	for (i = 0; i < 16; i++) {
+		int idx;
+
+		mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
+		mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
+		mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
+		mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid;
+
+		for (idx = 0; idx < mp_irq_entries; idx++) {
+			struct mp_config_intsrc *irq = mp_irqs + idx;
+
+			/* Do we already have a mapping for this ISA IRQ? */
+			if (irq->mp_srcbus == MP_ISA_BUS
+			    && irq->mp_srcbusirq == i)
+				break;
+
+			/* Do we already have a mapping for this IOAPIC pin? */
+			if ((irq->mp_dstapic ==
+			     mp_irqs[mp_irq_entries].mp_dstapic) &&
+			    (irq->mp_dstirq == i))
+				break;
+		}
+
+		if (idx != mp_irq_entries) {
+			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+			continue;	/* IRQ already used */
+		}
+
+		mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
+		mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */
+		mp_irqs[mp_irq_entries].mp_dstirq = i;
+
+		if (++mp_irq_entries == MAX_IRQ_SOURCES)
+			panic("Max # of irq sources exceeded!!\n");
+	}
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+	int ioapic;
+	int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM	4096
+#define IRQ_COMPRESSION_START	64
+
+	static int pci_irq = IRQ_COMPRESSION_START;
+	/*
+	 * Mapping between Global System Interrupts, which
+	 * represent all possible interrupts, and IRQs
+	 * assigned to actual devices.
+	 */
+	static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return gsi;
+#endif
+
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
+		return gsi;
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0) {
+		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+		return gsi;
+	}
+
+	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifdef CONFIG_X86_32
+	if (ioapic_renumber_irq)
+		gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+		printk(KERN_ERR "Invalid reference to IOAPIC pin "
+		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+		       ioapic_pin);
+		return gsi;
+	}
+	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+		return gsi;
+#endif
+	}
+
+	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+	/*
+	 * For GSI >= 64, use IRQ compression
+	 */
+	if ((gsi >= IRQ_COMPRESSION_START)
+	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
+		/*
+		 * For PCI devices assign IRQs in order, avoiding gaps
+		 * due to unused I/O APIC pins.
+		 */
+		int irq = gsi;
+		if (gsi < MAX_GSI_NUM) {
+			/*
+			 * Retain the VIA chipset work-around (gsi > 15), but
+			 * avoid a problem where the 8254 timer (IRQ0) is setup
+			 * via an override (so it's not on pin 0 of the ioapic),
+			 * and at the same time, the pin 0 interrupt is a PCI
+			 * type.  The gsi > 15 test could cause these two pins
+			 * to be shared as IRQ0, and they are not shareable.
+			 * So test for this condition, and if necessary, avoid
+			 * the pin collision.
+			 */
+			gsi = pci_irq++;
+			/*
+			 * Don't assign IRQ used by ACPI SCI
+			 */
+			if (gsi == acpi_gbl_FADT.sci_interrupt)
+				gsi = pci_irq++;
+			gsi_to_irq[irq] = gsi;
+		} else {
+			printk(KERN_ERR "GSI %u is too high\n", gsi);
+			return gsi;
+		}
+	}
+#endif
+	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	return gsi;
+}
+
+int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+			u32 gsi, int triggering, int polarity)
+{
+	struct mpc_config_intsrc intsrc;
+	int ioapic;
+
+	/* fill in an entry exactly as it would appear in the mptable */
+	intsrc.mpc_type = MP_INTSRC;
+	intsrc.mpc_irqtype = mp_INT;
+	intsrc.mpc_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+			     (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	intsrc.mpc_srcbus = number;
+	intsrc.mpc_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	ioapic = mp_find_ioapic(gsi);
+	intsrc.mpc_dstapic = mp_ioapic_routing[ioapic].apic_id;
+	intsrc.mpc_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	MP_intsrc_info(&intsrc);
+
+	return 0;
+}
+
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
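
Aside, not part of the patch: the mp_ioapic_routing[] table moved above gives each I/O APIC a contiguous GSI window [gsi_base, gsi_end], so mp_find_ioapic() plus one subtraction converts a GSI into an (ioapic, pin) pair. A standalone sketch with two invented 24-pin IOAPICs:

#include <stdio.h>

struct ioapic_routing { int apic_id, gsi_base, gsi_end; };

static const struct ioapic_routing routing[] = {
	{ .apic_id = 2, .gsi_base = 0,  .gsi_end = 23 },
	{ .apic_id = 3, .gsi_base = 24, .gsi_end = 47 },
};

int main(void)
{
	int gsi = 29, i;

	for (i = 0; i < 2; i++) {
		if (gsi >= routing[i].gsi_base && gsi <= routing[i].gsi_end) {
			/* the pin is the GSI's offset inside the window */
			printf("GSI %d -> IOAPIC %d, pin %d\n", gsi,
			       routing[i].apic_id, gsi - routing[i].gsi_base);
			return 0;
		}
	}
	printf("no IOAPIC for GSI %d\n", gsi);
	return 1;
}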
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..954d67931a50 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -76,6 +76,11 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
  */
 int apic_verbosity;
 
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
 static unsigned int calibration_result;
 
 static int lapic_next_event(unsigned long delta,
@@ -1202,7 +1207,7 @@ void __init init_apic_mappings(void)
 
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
-			ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+			ioapic_phys = mp_ioapics[i].mp_apicaddr;
 			if (!ioapic_phys) {
 				printk(KERN_ERR
 				       "WARNING: bogus zero IO-APIC "
@@ -1513,6 +1518,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	 */
 	cpu = 0;
 
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/*
 	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
 	 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1520,7 +1528,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
 	 *       - Ashok Raj <ashok.raj@intel.com>
 	 */
-	if (num_processors > 8) {
+	if (max_physical_apicid >= 8) {
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			if (!APIC_XAPIC(version)) {
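
Aside, not part of the patch: the test changes from counting CPUs to checking the highest physical APIC ID because flat logical APIC mode can only address IDs 0-7, and firmware need not assign APIC IDs densely. A hypothetical box with four CPUs but a sparse ID map shows the difference:

#include <stdio.h>

int main(void)
{
	/* invented example: 4 CPUs, firmware assigned a sparse ID map */
	int apicids[] = { 0, 1, 6, 9 };
	int num_processors = 4, max_physical_apicid = 0, i;

	for (i = 0; i < 4; i++)
		if (apicids[i] > max_physical_apicid)
			max_physical_apicid = apicids[i];

	/* the old test misses this box, the new one catches it */
	printf("old check: %s, new check: %s\n",
	       num_processors > 8 ? "bigsmp" : "flat",
	       max_physical_apicid >= 8 ? "bigsmp" : "flat");
	return 0;
}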
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc29..a4bd8fbb78a9 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
  */
 int apic_verbosity;
 
+/* Have we found an MP table */
+int smp_found_config;
+
 static struct resource lapic_resource = {
 	.name = "Local APIC",
 	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -1090,6 +1093,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		 */
 		cpu = 0;
 	}
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/* are we being called early in kernel startup? */
 	if (x86_cpu_to_apicid_early_ptr) {
 		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5d241ce94a44..509bd3d9eacd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
 static unsigned long smp_changes_mask;
 static struct mtrr_state mtrr_state = {};
 static int mtrr_state_set;
-static u64 tom2;
+u64 mtrr_tom2;
 
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 		}
 	}
 
-	if (tom2) {
-		if (start >= (1ULL<<32) && (end < tom2))
+	if (mtrr_tom2) {
+		if (start >= (1ULL<<32) && (end < mtrr_tom2))
 			return MTRR_TYPE_WRBACK;
 	}
 
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
 	rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
 }
 
+/* fill the MSR pair relating to a var range */
+void fill_mtrr_var_range(unsigned int index,
+		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+	struct mtrr_var_range *vr;
+
+	vr = mtrr_state.var_ranges;
+
+	vr[index].base_lo = base_lo;
+	vr[index].base_hi = base_hi;
+	vr[index].mask_lo = mask_lo;
+	vr[index].mask_hi = mask_hi;
+}
+
 static void
 get_fixed_ranges(mtrr_type * frs)
 {
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void)
 	mtrr_state.enabled = (lo & 0xc00) >> 10;
 
 	if (amd_special_default_mtrr()) {
-		unsigned lo, hi;
+		unsigned low, high;
 		/* TOP_MEM2 */
-		rdmsr(MSR_K8_TOP_MEM2, lo, hi);
-		tom2 = hi;
-		tom2 <<= 32;
-		tom2 |= lo;
-		tom2 &= 0xffffff8000000ULL;
+		rdmsr(MSR_K8_TOP_MEM2, low, high);
+		mtrr_tom2 = high;
+		mtrr_tom2 <<= 32;
+		mtrr_tom2 |= low;
+		mtrr_tom2 &= 0xffffff800000ULL;
 	}
 	if (mtrr_show) {
 		int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
 			else
 				printk(KERN_INFO "MTRR %u disabled\n", i);
 		}
-		if (tom2) {
+		if (mtrr_tom2) {
 			printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
-				tom2, tom2>>20);
+				mtrr_tom2, mtrr_tom2>>20);
 		}
 	}
 	mtrr_state_set = 1;
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 
 	if (lo != msrwords[0] || hi != msrwords[1]) {
 		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		    boot_cpu_data.x86 == 15 &&
+		    (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
 		    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
 			k8_enable_fixed_iorrs();
 		mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
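
Aside, not part of the patch: rdmsr() returns the TOP_MEM2 MSR as two 32-bit halves, which get_mtrr_state() reassembles into the 64-bit mtrr_tom2 above. Note the hunk also drops one zero from the mask constant (0xffffff8000000ULL became 0xffffff800000ULL), which matches TOM2's 8 MB granularity (bit 23 and up). A standalone sketch with an invented 10 GB value:

#include <stdio.h>

int main(void)
{
	/* two 32-bit halves, as rdmsr(MSR_K8_TOP_MEM2, low, high) returns */
	unsigned int low = 0x80000000, high = 0x2;	/* 10 GB */
	unsigned long long tom2;

	tom2 = high;
	tom2 <<= 32;
	tom2 |= low;
	tom2 &= 0xffffff800000ULL;	/* the corrected mask */

	printf("TOM2 = %#llx (%lluM)\n", tom2, tom2 >> 20);
	return 0;
}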
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6a1e278d9323..0642201784e0 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -37,6 +37,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/sort.h>
 
 #include <asm/e820.h>
 #include <asm/mtrr.h>
@@ -609,6 +610,787 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 	.resume		= mtrr_restore,
 };
 
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+	  unsigned long end)
+{
+	/* out of slots */
+	if (nr_range >= RANGE_NUM)
+		return nr_range;
+
+	range[nr_range].start = start;
+	range[nr_range].end = end;
+
+	nr_range++;
+
+	return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+		     unsigned long end)
+{
+	int i;
+
+	/* try to merge it with an existing range */
+	for (i = 0; i < nr_range; i++) {
+		unsigned long final_start, final_end;
+		unsigned long common_start, common_end;
+
+		if (!range[i].end)
+			continue;
+
+		common_start = max(range[i].start, start);
+		common_end = min(range[i].end, end);
+		if (common_start > common_end + 1)
+			continue;
+
+		final_start = min(range[i].start, start);
+		final_end = max(range[i].end, end);
+
+		range[i].start = final_start;
+		range[i].end = final_end;
+		return nr_range;
+	}
+
+	/* no overlap: add it as a new range */
+	return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+	int i, j;
+
+	for (j = 0; j < RANGE_NUM; j++) {
+		if (!range[j].end)
+			continue;
+
+		if (start <= range[j].start && end >= range[j].end) {
+			range[j].start = 0;
+			range[j].end = 0;
+			continue;
+		}
+
+		if (start <= range[j].start && end < range[j].end &&
+		    range[j].start < end + 1) {
+			range[j].start = end + 1;
+			continue;
+		}
+
+
+		if (start > range[j].start && end >= range[j].end &&
+		    range[j].end > start - 1) {
+			range[j].end = start - 1;
+			continue;
+		}
+
+		if (start > range[j].start && end < range[j].end) {
+			/* find an empty slot for the new tail piece */
+			for (i = 0; i < RANGE_NUM; i++) {
+				if (range[i].end == 0)
+					break;
+			}
+			if (i < RANGE_NUM) {
+				range[i].end = range[j].end;
+				range[i].start = end + 1;
+			} else {
+				printk(KERN_ERR "ran out of slots in ranges\n");
+			}
+			range[j].end = start - 1;
+			continue;
+		}
+	}
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+	const struct res_range *r1 = x1;
+	const struct res_range *r2 = x2;
+	long start1, start2;
+
+	start1 = r1->start;
+	start2 = r2->start;
+
+	return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+	unsigned long base_pfn;
+	unsigned long size_pfn;
+	mtrr_type type;
+};
+
+struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+		       unsigned long extra_remove_base,
+		       unsigned long extra_remove_size)
+{
+	unsigned long i, base, size;
+	mtrr_type type;
+
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		nr_range = add_range_with_merge(range, nr_range, base,
+						base + size - 1);
+	}
+	if (debug_print) {
+		printk(KERN_DEBUG "After WB checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+			       range[i].start, range[i].end + 1);
+	}
+
+	/* take out UC ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_UNCACHABLE)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			continue;
+		base = range_state[i].base_pfn;
+		subtract_range(range, base, base + size - 1);
+	}
+	if (extra_remove_size)
+		subtract_range(range, extra_remove_base,
+			       extra_remove_base + extra_remove_size - 1);
+
+	/* get new range num */
+	nr_range = 0;
+	for (i = 0; i < RANGE_NUM; i++) {
+		if (!range[i].end)
+			continue;
+		nr_range++;
+	}
+	if (debug_print) {
+		printk(KERN_DEBUG "After UC checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+			       range[i].start, range[i].end + 1);
+	}
+
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	if (debug_print) {
+		printk(KERN_DEBUG "After sorting\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+			       range[i].start, range[i].end + 1);
+	}
+
+	/* clear the slots that are not used */
+	for (i = nr_range; i < RANGE_NUM; i++)
+		memset(&range[i], 0, sizeof(range[i]));
+
+	return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+	unsigned long sum;
+	int i;
+
+	sum = 0;
+	for (i = 0; i < nr_range; i++)
+		sum += range[i].end + 1 - range[i].start;
+
+	return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 0;
+	return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 1;
+	return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+struct var_mtrr_state {
+	unsigned long	range_startk;
+	unsigned long	range_sizek;
+	unsigned long	chunk_sizek;
+	unsigned long	gran_sizek;
+	unsigned int	reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+	     unsigned char type, unsigned int address_bits)
+{
+	u32 base_lo, base_hi, mask_lo, mask_hi;
+	u64 base, mask;
+
+	if (!sizek) {
+		fill_mtrr_var_range(reg, 0, 0, 0, 0);
+		return;
+	}
+
+	mask = (1ULL << address_bits) - 1;
+	mask &= ~((((u64)sizek) << 10) - 1);
+
+	base = ((u64)basek) << 10;
+
+	base |= type;
+	mask |= 0x800;
+
+	base_lo = base & ((1ULL<<32) - 1);
+	base_hi = base >> 32;
+
+	mask_lo = mask & ((1ULL<<32) - 1);
+	mask_hi = mask >> 32;
+
+	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+	      unsigned char type)
+{
+	range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+	range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+	range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+	unsigned long basek, sizek;
+	unsigned char type;
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+		sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+		type = range_state[reg].type;
+
+		set_var_mtrr(reg, basek, sizek, type, address_bits);
+	}
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+	      unsigned long range_sizek, unsigned char type)
+{
+	if (!range_sizek || (reg >= num_var_ranges))
+		return reg;
+
+	while (range_sizek) {
+		unsigned long max_align, align;
+		unsigned long sizek;
+
+		/* Compute the maximum size I can make a range */
+		if (range_startk)
+			max_align = ffs(range_startk) - 1;
+		else
+			max_align = 32;
+		align = fls(range_sizek) - 1;
+		if (align > max_align)
+			align = max_align;
+
+		sizek = 1 << align;
+		if (debug_print)
+			printk(KERN_DEBUG "Setting variable MTRR %d, "
+				"base: %ldMB, range: %ldMB, type %s\n",
+				reg, range_startk >> 10, sizek >> 10,
+				(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+				    ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+				);
+		save_var_mtrr(reg++, range_startk, sizek, type);
+		range_startk += sizek;
+		range_sizek -= sizek;
+		if (reg >= num_var_ranges)
+			break;
+	}
+	return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+			unsigned long sizek)
+{
+	unsigned long hole_basek, hole_sizek;
+	unsigned long second_basek, second_sizek;
+	unsigned long range0_basek, range0_sizek;
+	unsigned long range_basek, range_sizek;
+	unsigned long chunk_sizek;
+	unsigned long gran_sizek;
+
+	hole_basek = 0;
+	hole_sizek = 0;
+	second_basek = 0;
+	second_sizek = 0;
+	chunk_sizek = state->chunk_sizek;
+	gran_sizek = state->gran_sizek;
+
+	/* align to gran size, to prevent small blocks from using up MTRRs */
+	range_basek = ALIGN(state->range_startk, gran_sizek);
+	if ((range_basek > basek) && basek)
+		return second_sizek;
+	state->range_sizek -= (range_basek - state->range_startk);
+	range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+	while (range_sizek > state->range_sizek) {
+		range_sizek -= gran_sizek;
+		if (!range_sizek)
+			return 0;
+	}
+	state->range_sizek = range_sizek;
+
+	/* try to append some small hole */
+	range0_basek = state->range_startk;
+	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+	if (range0_sizek == state->range_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + state->range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				state->range_sizek, MTRR_TYPE_WRBACK);
+		return 0;
+	}
+
+	range0_sizek -= chunk_sizek;
+	if (range0_sizek && sizek) {
+		while (range0_basek + range0_sizek > (basek + sizek)) {
+			range0_sizek -= chunk_sizek;
+			if (!range0_sizek)
+				break;
+		}
+	}
+
+	if (range0_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + range0_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				range0_sizek, MTRR_TYPE_WRBACK);
+
+	}
+
+	range_basek = range0_basek + range0_sizek;
+	range_sizek = chunk_sizek;
+
+	if (range_basek + range_sizek > basek &&
+	    range_basek + range_sizek <= (basek + sizek)) {
+		/* one hole */
+		second_basek = basek;
+		second_sizek = range_basek + range_sizek - basek;
+	}
+
+	/* if this is the last piece, only one hole near the end is allowed */
+	if ((second_basek || !basek) &&
+	    range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
+	    (chunk_sizek >> 1)) {
+		/*
+		 * one hole in the middle (second_sizek is 0) or at the
+		 * end (second_sizek is not 0)
+		 */
+		hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
+				 - second_sizek;
+		hole_basek = range_basek + range_sizek - hole_sizek
+				 - second_sizek;
+	} else {
+		/* fallback for a big hole, or several holes */
+		range_sizek = state->range_sizek - range0_sizek;
+		second_basek = 0;
+		second_sizek = 0;
+	}
+
+	if (debug_print)
+		printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+			 (range_basek + range_sizek)<<10);
+	state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
+					 MTRR_TYPE_WRBACK);
+	if (hole_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+				 hole_basek<<10, (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
+						 MTRR_TYPE_UNCACHABLE);
+
+	}
+
+	return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+		   unsigned long size_pfn)
+{
+	unsigned long basek, sizek;
+	unsigned long second_sizek = 0;
+
+	if (state->reg >= num_var_ranges)
+		return;
+
+	basek = base_pfn << (PAGE_SHIFT - 10);
+	sizek = size_pfn << (PAGE_SHIFT - 10);
+
+	/* See if I can merge with the last range */
+	if ((basek <= 1024) ||
+	    (state->range_startk + state->range_sizek == basek)) {
+		unsigned long endk = basek + sizek;
+		state->range_sizek = endk - state->range_startk;
+		return;
+	}
+	/* Write the range mtrrs */
+	if (state->range_sizek != 0)
+		second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+	/* Allocate an msr */
+	state->range_startk = basek + second_sizek;
+	state->range_sizek = sizek - second_sizek;
+}
+
+/* minimum size of an mtrr block that can take a hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_chunk_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granularity of mtrr blocks */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_gran_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+	if (arg)
+		nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+		    u64 chunk_size, u64 gran_size)
+{
+	struct var_mtrr_state var_state;
+	int i;
+	int num_reg;
+
+	var_state.range_startk	= 0;
+	var_state.range_sizek	= 0;
+	var_state.reg		= 0;
+	var_state.chunk_sizek	= chunk_size >> 10;
+	var_state.gran_sizek	= gran_size >> 10;
+
+	memset(range_state, 0, sizeof(range_state));
+
+	/* Write the range etc */
+	for (i = 0; i < nr_range; i++)
+		set_var_mtrr_range(&var_state, range[i].start,
+				   range[i].end - range[i].start + 1);
+
+	/* Write the last range */
+	if (var_state.range_sizek != 0)
+		range_to_mtrr_with_hole(&var_state, 0, 0);
+
+	num_reg = var_state.reg;
+	/* Clear out the extra MTRR's */
+	while (var_state.reg < num_var_ranges) {
+		save_var_mtrr(var_state.reg, 0, 0, 0);
+		var_state.reg++;
+	}
+
+	return num_reg;
+}
+
+struct mtrr_cleanup_result {
+	unsigned long gran_sizek;
+	unsigned long chunk_sizek;
+	unsigned long lose_cover_sizek;
+	unsigned int num_reg;
+	int bad;
+};
+
+/*
+ * gran_size: 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 4G
+ * so we need (2+13)*6
+ */
+#define NUM_RESULT	90
+#define PSHIFT		(PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static struct res_range __initdata range_new[RANGE_NUM];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	unsigned long extra_remove_base, extra_remove_size;
+	unsigned long i, base, size, def, dummy;
+	mtrr_type type;
+	int nr_range, nr_range_new;
+	u64 chunk_size, gran_size;
+	unsigned long range_sums, range_sums_new;
+	int index_good;
+	int num_reg_good;
+
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+
+	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		size = range_state[i].size_pfn;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* check if we got UC entries */
+	if (!num[MTRR_TYPE_UNCACHABLE])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+	    num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	extra_remove_size = 0;
+	if (mtrr_tom2) {
+		extra_remove_base = 1 << (32 - PAGE_SHIFT);
+		extra_remove_size =
+			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+					  extra_remove_size);
+	range_sums = sum_ranges(range, nr_range);
+	printk(KERN_INFO "total RAM covered: %ldM\n",
+	       range_sums >> (20 - PAGE_SHIFT));
+
+	if (mtrr_chunk_size && mtrr_gran_size) {
+		int num_reg;
+
+		debug_print = 1;
+		/* convert ranges to var ranges state */
+		num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
+					      mtrr_gran_size);
+
+		/* we got new setting in range_state, check it */
+		memset(range_new, 0, sizeof(range_new));
+		nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+						      extra_remove_base,
+						      extra_remove_size);
+		range_sums_new = sum_ranges(range_new, nr_range_new);
+
+		i = 0;
+		result[i].chunk_sizek = mtrr_chunk_size >> 10;
+		result[i].gran_sizek = mtrr_gran_size >> 10;
+		result[i].num_reg = num_reg;
+		if (range_sums < range_sums_new) {
+			result[i].lose_cover_sizek =
+				(range_sums_new - range_sums) << PSHIFT;
+			result[i].bad = 1;
+		} else
+			result[i].lose_cover_sizek =
+				(range_sums - range_sums_new) << PSHIFT;
+
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+		       result[i].bad ? "*BAD*" : " ",
+		       result[i].gran_sizek >> 10,
+		       result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM\n",
+		       result[i].num_reg, result[i].bad ? "-" : "",
+		       result[i].lose_cover_sizek >> 10);
+		if (!result[i].bad) {
+			set_var_mtrr_all(address_bits);
+			return 1;
+		}
+		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+		       "will find optimal one\n");
+		debug_print = 0;
+		memset(result, 0, sizeof(result[0]));
+	}
+
+	i = 0;
+	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+	memset(result, 0, sizeof(result));
+	for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
+		for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+		     chunk_size <<= 1) {
+			int num_reg;
+
+			if (debug_print)
+				printk(KERN_INFO
+				       "\ngran_size: %lldM chunk_size: %lldM\n",
+				       gran_size >> 20, chunk_size >> 20);
+			if (i >= NUM_RESULT)
+				continue;
+
+			/* convert ranges to var ranges state */
+			num_reg = x86_setup_var_mtrrs(range, nr_range,
+						      chunk_size, gran_size);
+
+			/* we got new setting in range_state, check it */
+			memset(range_new, 0, sizeof(range_new));
+			nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+					 extra_remove_base, extra_remove_size);
+			range_sums_new = sum_ranges(range_new, nr_range_new);
+
+			result[i].chunk_sizek = chunk_size >> 10;
+			result[i].gran_sizek = gran_size >> 10;
+			result[i].num_reg = num_reg;
+			if (range_sums < range_sums_new) {
+				result[i].lose_cover_sizek =
+					(range_sums_new - range_sums) << PSHIFT;
+				result[i].bad = 1;
+			} else
+				result[i].lose_cover_sizek =
+					(range_sums - range_sums_new) << PSHIFT;
+
+			/* double check it */
+			if (!result[i].bad && !result[i].lose_cover_sizek) {
+				if (nr_range_new != nr_range ||
+				    memcmp(range, range_new, sizeof(range)))
+					result[i].bad = 1;
+			}
+
+			if (!result[i].bad && (range_sums - range_sums_new <
+					       min_loss_pfn[num_reg])) {
+				min_loss_pfn[num_reg] =
+					range_sums - range_sums_new;
+			}
+			i++;
+		}
+	}
+
+	/* print out all */
+	for (i = 0; i < NUM_RESULT; i++) {
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+		       result[i].bad ? "*BAD* " : " ",
+		       result[i].gran_sizek >> 10,
+		       result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
+		       result[i].num_reg, result[i].bad ? "-" : "",
+		       result[i].lose_cover_sizek >> 10);
+	}
+
+	/* try to find the optimal index */
+	if (nr_mtrr_spare_reg >= num_var_ranges)
+		nr_mtrr_spare_reg = num_var_ranges - 1;
+	num_reg_good = -1;
+	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+		if (!min_loss_pfn[i]) {
+			num_reg_good = i;
+			break;
+		}
+	}
+
+	index_good = -1;
+	if (num_reg_good != -1) {
+		for (i = 0; i < NUM_RESULT; i++) {
+			if (!result[i].bad &&
+			    result[i].num_reg == num_reg_good &&
+			    !result[i].lose_cover_sizek) {
+				index_good = i;
+				break;
+			}
+		}
+	}
+
+	if (index_good != -1) {
+		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+		i = index_good;
+		printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
+		       result[i].gran_sizek >> 10,
+		       result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
+		       result[i].num_reg,
+		       result[i].lose_cover_sizek >> 10);
+		/* convert ranges to var ranges state */
+		chunk_size = result[i].chunk_sizek;
+		chunk_size <<= 10;
+		gran_size = result[i].gran_sizek;
+		gran_size <<= 10;
+		debug_print = 1;
+		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+		set_var_mtrr_all(address_bits);
+		return 1;
+	}
+
+	printk(KERN_INFO "mtrr_cleanup: cannot find optimal value\n");
+	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+	return 0;
+}
+#else
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	return 0;
+}
+#endif
+
+static int __initdata changed_by_mtrr_cleanup;
+
 static int disable_mtrr_trim;
 
 static int __init disable_mtrr_trim_setup(char *str)
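
Aside, not part of the patch: the sanitizer's core bookkeeping is plain interval arithmetic; x86_get_mtrr_mem_range() merges the WB ranges and then subtracts the UC holes. A standalone sketch of the tail-trim case of subtract_range(), with invented PFN values:

#include <stdio.h>

struct res_range { unsigned long start, end; };

int main(void)
{
	/* one WB range covering PFNs 0x0-0xfffff (4 GB with 4 KB pages) */
	struct res_range r = { 0x0, 0xfffff };
	/* a UC MTRR punches a hole over the top, e.g. an MMIO window */
	unsigned long uc_start = 0xc0000, uc_end = 0xfffff;

	/* the hole reaches the end of the range: trim the tail */
	if (uc_start > r.start && uc_end >= r.end)
		r.end = uc_start - 1;

	printf("usable WB PFNs: %#lx-%#lx\n", r.start, r.end);
	return 0;
}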
@@ -648,6 +1430,19 @@ int __init amd_special_default_mtrr(void)
 	return 0;
 }
 
+static u64 __init real_trim_memory(unsigned long start_pfn,
+				   unsigned long limit_pfn)
+{
+	u64 trim_start, trim_size;
+	trim_start = start_pfn;
+	trim_start <<= PAGE_SHIFT;
+	trim_size = limit_pfn;
+	trim_size <<= PAGE_SHIFT;
+	trim_size -= trim_start;
+
+	return update_memory_range(trim_start, trim_size, E820_RAM,
+				   E820_RESERVED);
+}
 /**
  * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
  * @end_pfn: ending page frame number
@@ -663,8 +1458,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 {
 	unsigned long i, base, size, highest_pfn = 0, def, dummy;
 	mtrr_type type;
-	u64 trim_start, trim_size;
+	int nr_range;
+	u64 total_trim_size;
 
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
 	/*
 	 * Make sure we only trim uncachable memory on machines that
 	 * support the Intel MTRR architecture:
@@ -676,14 +1474,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
 
-	if (amd_special_default_mtrr())
-		return 0;
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
 
 	/* Find highest cached pfn */
 	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
+		type = range_state[i].type;
 		if (type != MTRR_TYPE_WRBACK)
 			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
 		if (highest_pfn < base + size)
 			highest_pfn = base + size;
 	}
@@ -698,22 +1504,65 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
698 return 0; 1504 return 0;
699 } 1505 }
700 1506
701 if (highest_pfn < end_pfn) { 1507 /* check entries number */
1508 memset(num, 0, sizeof(num));
1509 for (i = 0; i < num_var_ranges; i++) {
1510 type = range_state[i].type;
1511 if (type >= MTRR_NUM_TYPES)
1512 continue;
1513 size = range_state[i].size_pfn;
1514 if (!size)
1515 type = MTRR_NUM_TYPES;
1516 num[type]++;
1517 }
1518
1519 /* no entry for WB? */
1520 if (!num[MTRR_TYPE_WRBACK])
1521 return 0;
1522
1523 /* check if we only had WB and UC */
1524 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1525 num_var_ranges - num[MTRR_NUM_TYPES])
1526 return 0;
1527
1528 memset(range, 0, sizeof(range));
1529 nr_range = 0;
1530 if (mtrr_tom2) {
1531 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1532 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1533 if (highest_pfn < range[nr_range].end + 1)
1534 highest_pfn = range[nr_range].end + 1;
1535 nr_range++;
1536 }
1537 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1538
1539 total_trim_size = 0;
1540 /* check the head */
1541 if (range[0].start)
1542 total_trim_size += real_trim_memory(0, range[0].start);
1543 /* check the holes */
1544 for (i = 0; i < nr_range - 1; i++) {
1545 if (range[i].end + 1 < range[i+1].start)
1546 total_trim_size += real_trim_memory(range[i].end + 1,
1547 range[i+1].start);
1548 }
1549 /* check the top */
1550 i = nr_range - 1;
1551 if (range[i].end + 1 < end_pfn)
1552 total_trim_size += real_trim_memory(range[i].end + 1,
1553 end_pfn);
1554
1555 if (total_trim_size) {
702 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" 1556 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
703 " all of memory, losing %luMB of RAM.\n", 1557 " all of memory, losing %lluMB of RAM.\n",
704 (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); 1558 total_trim_size >> 20);
705 1559
706 WARN_ON(1); 1560 if (!changed_by_mtrr_cleanup)
1561 WARN_ON(1);
707 1562
708 printk(KERN_INFO "update e820 for mtrr\n"); 1563 printk(KERN_INFO "update e820 for mtrr\n");
709 trim_start = highest_pfn;
710 trim_start <<= PAGE_SHIFT;
711 trim_size = end_pfn;
712 trim_size <<= PAGE_SHIFT;
713 trim_size -= trim_start;
714 update_memory_range(trim_start, trim_size, E820_RAM,
715 E820_RESERVED);
716 update_e820(); 1564 update_e820();
1565
717 return 1; 1566 return 1;
718 } 1567 }
719 1568
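
For intuition, the trim logic checks three places for RAM that the MTRRs leave uncovered: below the first WB range (the head), between adjacent ranges (the holes), and between the last range and end_pfn (the top). A self-contained sketch of that walk over a sorted, non-overlapping list with inclusive end pfns, as in the kernel code above; the sample ranges are invented:

#include <stdio.h>
#include <stdint.h>

struct res_range { uint64_t start, end; };	/* end is inclusive */

static uint64_t uncovered_pfns(const struct res_range *r, int nr,
			       uint64_t end_pfn)
{
	uint64_t total = 0;
	int i;

	if (r[0].start)				/* the head */
		total += r[0].start;
	for (i = 0; i < nr - 1; i++)		/* the holes */
		if (r[i].end + 1 < r[i + 1].start)
			total += r[i + 1].start - (r[i].end + 1);
	if (r[nr - 1].end + 1 < end_pfn)	/* the top */
		total += end_pfn - (r[nr - 1].end + 1);
	return total;
}

int main(void)
{
	struct res_range r[2] = { { 16, 255 }, { 512, 1023 } };

	/* 16 (head) + 256 (hole) + 1024 (top) = 1296 */
	printf("%llu pfns uncovered\n",
	       (unsigned long long)uncovered_pfns(r, 2, 2048));
	return 0;
}
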
@@ -729,18 +1578,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
729 */ 1578 */
730void __init mtrr_bp_init(void) 1579void __init mtrr_bp_init(void)
731{ 1580{
1581 u32 phys_addr;
732 init_ifs(); 1582 init_ifs();
733 1583
1584 phys_addr = 32;
1585
734 if (cpu_has_mtrr) { 1586 if (cpu_has_mtrr) {
735 mtrr_if = &generic_mtrr_ops; 1587 mtrr_if = &generic_mtrr_ops;
736 size_or_mask = 0xff000000; /* 36 bits */ 1588 size_or_mask = 0xff000000; /* 36 bits */
737 size_and_mask = 0x00f00000; 1589 size_and_mask = 0x00f00000;
1590 phys_addr = 36;
738 1591
739 /* This is an AMD specific MSR, but we assume (hope?) that 1592 /* This is an AMD specific MSR, but we assume (hope?) that
740 Intel will implement it too when they extend the address 1593 Intel will implement it too when they extend the address
741 bus of the Xeon. */ 1594 bus of the Xeon. */
742 if (cpuid_eax(0x80000000) >= 0x80000008) { 1595 if (cpuid_eax(0x80000000) >= 0x80000008) {
743 u32 phys_addr;
744 phys_addr = cpuid_eax(0x80000008) & 0xff; 1596 phys_addr = cpuid_eax(0x80000008) & 0xff;
745 /* CPUID workaround for Intel 0F33/0F34 CPU */ 1597 /* CPUID workaround for Intel 0F33/0F34 CPU */
746 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 1598 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1610,7 @@ void __init mtrr_bp_init(void)
758 don't support PAE */ 1610 don't support PAE */
759 size_or_mask = 0xfff00000; /* 32 bits */ 1611 size_or_mask = 0xfff00000; /* 32 bits */
760 size_and_mask = 0; 1612 size_and_mask = 0;
1613 phys_addr = 32;
761 } 1614 }
762 } else { 1615 } else {
763 switch (boot_cpu_data.x86_vendor) { 1616 switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1644,15 @@ void __init mtrr_bp_init(void)
791 if (mtrr_if) { 1644 if (mtrr_if) {
792 set_num_var_ranges(); 1645 set_num_var_ranges();
793 init_table(); 1646 init_table();
794 if (use_intel()) 1647 if (use_intel()) {
795 get_mtrr_state(); 1648 get_mtrr_state();
1649
1650 if (mtrr_cleanup(phys_addr)) {
1651 changed_by_mtrr_cleanup = 1;
1652 mtrr_if->set_all();
1653 }
1654
1655 }
796 } 1656 }
797} 1657}
798 1658
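
mtrr_bp_init() now derives phys_addr from CPUID leaf 0x80000008 when the CPU exposes it, defaulting to 36 bits otherwise, and hands it to mtrr_cleanup(). A user-space sketch of the same query using GCC's <cpuid.h> rather than the kernel's cpuid helpers (x86-only; a sketch, not the kernel code path):

#include <stdio.h>
#include <cpuid.h>

static unsigned int phys_addr_bits(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* leaf 0x80000008, EAX bits 7:0 = physical address width */
	if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return eax & 0xff;
	return 36;	/* the pre-leaf fallback used above */
}

int main(void)
{
	printf("physical address width: %u bits\n", phys_addr_bits());
	return 0;
}
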
@@ -829,9 +1689,10 @@ static int __init mtrr_init_finialize(void)
829{ 1689{
830 if (!mtrr_if) 1690 if (!mtrr_if)
831 return 0; 1691 return 0;
832 if (use_intel()) 1692 if (use_intel()) {
833 mtrr_state_warn(); 1693 if (!changed_by_mtrr_cleanup)
834 else { 1694 mtrr_state_warn();
1695 } else {
835 /* These CPUs have no MTRRs and don't seem to support SMP. They have 1696 * specific drivers, we use a tricky method to support
836 * specific drivers, we use a tricky method to support 1697 * specific drivers, we use a tricky method to support
837 * suspend/resume for them. 1698 * suspend/resume for them.
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2cc77eb6fea3..2dc4ec656b23 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); 81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); 82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
83 83
84void fill_mtrr_var_range(unsigned int index,
85 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
84void get_mtrr_state(void); 86void get_mtrr_state(void);
85 87
86extern void set_mtrr_ops(struct mtrr_ops * ops); 88extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
92#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 94#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
93 95
94extern unsigned int num_var_ranges; 96extern unsigned int num_var_ranges;
97extern u64 mtrr_tom2;
95 98
96void mtrr_state_warn(void); 99void mtrr_state_warn(void);
97const char *mtrr_attrib_to_str(int x); 100const char *mtrr_attrib_to_str(int x);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
new file mode 100644
index 000000000000..a706e9057ba5
--- /dev/null
+++ b/arch/x86/kernel/e820.c
@@ -0,0 +1,896 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h>
21#include <linux/suspend.h>
22
23#include <asm/pgtable.h>
24#include <asm/page.h>
25#include <asm/e820.h>
26#include <asm/proto.h>
27#include <asm/setup.h>
28#include <asm/trampoline.h>
29
30struct e820map e820;
31
32/* For PCI or other memory-mapped resources */
33unsigned long pci_mem_start = 0xaeedbabe;
34#ifdef CONFIG_PCI
35EXPORT_SYMBOL(pci_mem_start);
36#endif
37
38/*
39 * This function checks if any part of the range <start,end> is mapped
40 * with type.
41 */
42int
43e820_any_mapped(u64 start, u64 end, unsigned type)
44{
45 int i;
46
47 for (i = 0; i < e820.nr_map; i++) {
48 struct e820entry *ei = &e820.map[i];
49
50 if (type && ei->type != type)
51 continue;
52 if (ei->addr >= end || ei->addr + ei->size <= start)
53 continue;
54 return 1;
55 }
56 return 0;
57}
58EXPORT_SYMBOL_GPL(e820_any_mapped);
59
60/*
61 * This function checks if the entire range <start,end> is mapped with type.
62 *
 63 * Note: this function only works correctly if the e820 table is
 64 * sorted and non-overlapping, which is the case.
65 */
66int __init e820_all_mapped(u64 start, u64 end, unsigned type)
67{
68 int i;
69
70 for (i = 0; i < e820.nr_map; i++) {
71 struct e820entry *ei = &e820.map[i];
72
73 if (type && ei->type != type)
74 continue;
 75 /* does the region (at least partially) overlap the current region? */
76 if (ei->addr >= end || ei->addr + ei->size <= start)
77 continue;
78
 79 /* if the region covers the beginning of <start,end>, move
 80 * start to the end of the region, since coverage holds
 81 * up to that point */
82 if (ei->addr <= start)
83 start = ei->addr + ei->size;
84 /*
85 * if start is now at or beyond end, we're done, full
86 * coverage
87 */
88 if (start >= end)
89 return 1;
90 }
91 return 0;
92}
93
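
The two predicates differ in an easy-to-miss way: e820_any_mapped() succeeds if any byte of <start,end> has the given type, while e820_all_mapped() requires every byte to be covered. A self-contained model of both against a toy map with the usual hole just below 1 MiB; the entries and addresses are invented:

#include <stdio.h>
#include <stdint.h>

struct ent { uint64_t addr, size; unsigned type; };

/* toy map: low RAM up to 0x9f000, then RAM again from 1 MiB */
static struct ent map[] = {
	{ 0x000000, 0x09f000, 1 },
	{ 0x100000, 0x700000, 1 },
};
#define NR ((int)(sizeof(map) / sizeof(map[0])))

static int any_mapped(uint64_t start, uint64_t end, unsigned type)
{
	int i;

	for (i = 0; i < NR; i++) {
		if (type && map[i].type != type)
			continue;
		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
			continue;
		return 1;
	}
	return 0;
}

static int all_mapped(uint64_t start, uint64_t end, unsigned type)
{
	int i;

	for (i = 0; i < NR; i++) {
		if (type && map[i].type != type)
			continue;
		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
			continue;
		if (map[i].addr <= start)
			start = map[i].addr + map[i].size;
		if (start >= end)
			return 1;
	}
	return 0;
}

int main(void)
{
	/* the 0x9f000-0x100000 hole fails "all" but satisfies "any" */
	printf("any: %d all: %d\n",	/* prints "any: 1 all: 0" */
	       any_mapped(0, 0x200000, 1), all_mapped(0, 0x200000, 1));
	return 0;
}
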
94/*
95 * Add a memory region to the kernel e820 map.
96 */
97void __init add_memory_region(u64 start, u64 size, int type)
98{
99 int x = e820.nr_map;
100
101 if (x == ARRAY_SIZE(e820.map)) {
102 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
103 return;
104 }
105
106 e820.map[x].addr = start;
107 e820.map[x].size = size;
108 e820.map[x].type = type;
109 e820.nr_map++;
110}
111
112void __init e820_print_map(char *who)
113{
114 int i;
115
116 for (i = 0; i < e820.nr_map; i++) {
117 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
118 (unsigned long long) e820.map[i].addr,
119 (unsigned long long)
120 (e820.map[i].addr + e820.map[i].size));
121 switch (e820.map[i].type) {
122 case E820_RAM:
123 printk(KERN_CONT "(usable)\n");
124 break;
125 case E820_RESERVED:
126 printk(KERN_CONT "(reserved)\n");
127 break;
128 case E820_ACPI:
129 printk(KERN_CONT "(ACPI data)\n");
130 break;
131 case E820_NVS:
132 printk(KERN_CONT "(ACPI NVS)\n");
133 break;
134 default:
135 printk(KERN_CONT "type %u\n", e820.map[i].type);
136 break;
137 }
138 }
139}
140
141/*
142 * Sanitize the BIOS e820 map.
143 *
144 * Some e820 responses include overlapping entries. The following
145 * replaces the original e820 map with a new one, removing overlaps,
146 * and resolving conflicting memory types in favor of highest
147 * numbered type.
148 *
149 * The input parameter biosmap points to an array of 'struct
150 * e820entry' which on entry has elements in the range [0, *pnr_map)
151 * valid, and which has space for up to max_nr_map entries.
 152 * On return, the resulting sanitized e820 map entries will be
 153 * overwritten in the same location, starting at biosmap.
154 *
155 * The integer pointed to by pnr_map must be valid on entry (the
156 * current number of valid entries located at biosmap) and will
157 * be updated on return, with the new number of valid entries
 158 * (at most max_nr_map).
159 *
160 * The return value from sanitize_e820_map() is zero if it
161 * successfully 'sanitized' the map entries passed in, and is -1
 162 * if it did nothing, which can happen if (1) it was passed only
 163 * one map entry, or (2) any of the input map entries were invalid
 164 * (start + size < start, meaning that the size was so big the
 165 * described memory range wrapped around through zero).
166 *
167 * Visually we're performing the following
168 * (1,2,3,4 = memory types)...
169 *
170 * Sample memory map (w/overlaps):
171 * ____22__________________
172 * ______________________4_
173 * ____1111________________
174 * _44_____________________
175 * 11111111________________
176 * ____________________33__
177 * ___________44___________
178 * __________33333_________
179 * ______________22________
180 * ___________________2222_
181 * _________111111111______
182 * _____________________11_
183 * _________________4______
184 *
185 * Sanitized equivalent (no overlap):
186 * 1_______________________
187 * _44_____________________
188 * ___1____________________
189 * ____22__________________
190 * ______11________________
191 * _________1______________
192 * __________3_____________
193 * ___________44___________
194 * _____________33_________
195 * _______________2________
196 * ________________1_______
197 * _________________4______
198 * ___________________2____
199 * ____________________33__
200 * ______________________4_
201 */
202
203int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
204 int *pnr_map)
205{
206 struct change_member {
207 struct e820entry *pbios; /* pointer to original bios entry */
208 unsigned long long addr; /* address for this change point */
209 };
210static struct change_member change_point_list[2*E820_X_MAX] __initdata;
211static struct change_member *change_point[2*E820_X_MAX] __initdata;
212static struct e820entry *overlap_list[E820_X_MAX] __initdata;
213static struct e820entry new_bios[E820_X_MAX] __initdata;
214 struct change_member *change_tmp;
215 unsigned long current_type, last_type;
216 unsigned long long last_addr;
217 int chgidx, still_changing;
218 int overlap_entries;
219 int new_bios_entry;
220 int old_nr, new_nr, chg_nr;
221 int i;
222
223 /* if there's only one memory region, don't bother */
224 if (*pnr_map < 2)
225 return -1;
226
227 old_nr = *pnr_map;
228 BUG_ON(old_nr > max_nr_map);
229
230 /* bail out if we find any unreasonable addresses in bios map */
231 for (i = 0; i < old_nr; i++)
232 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
233 return -1;
234
235 /* create pointers for initial change-point information (for sorting) */
236 for (i = 0; i < 2 * old_nr; i++)
237 change_point[i] = &change_point_list[i];
238
239 /* record all known change-points (starting and ending addresses),
240 omitting those that are for empty memory regions */
241 chgidx = 0;
242 for (i = 0; i < old_nr; i++) {
243 if (biosmap[i].size != 0) {
244 change_point[chgidx]->addr = biosmap[i].addr;
245 change_point[chgidx++]->pbios = &biosmap[i];
246 change_point[chgidx]->addr = biosmap[i].addr +
247 biosmap[i].size;
248 change_point[chgidx++]->pbios = &biosmap[i];
249 }
250 }
251 chg_nr = chgidx;
252
253 /* sort change-point list by memory addresses (low -> high) */
254 still_changing = 1;
255 while (still_changing) {
256 still_changing = 0;
257 for (i = 1; i < chg_nr; i++) {
258 unsigned long long curaddr, lastaddr;
259 unsigned long long curpbaddr, lastpbaddr;
260
261 curaddr = change_point[i]->addr;
262 lastaddr = change_point[i - 1]->addr;
263 curpbaddr = change_point[i]->pbios->addr;
264 lastpbaddr = change_point[i - 1]->pbios->addr;
265
266 /*
267 * swap entries, when:
268 *
269 * curaddr > lastaddr or
270 * curaddr == lastaddr and curaddr == curpbaddr and
271 * lastaddr != lastpbaddr
272 */
273 if (curaddr < lastaddr ||
274 (curaddr == lastaddr && curaddr == curpbaddr &&
275 lastaddr != lastpbaddr)) {
276 change_tmp = change_point[i];
277 change_point[i] = change_point[i-1];
278 change_point[i-1] = change_tmp;
279 still_changing = 1;
280 }
281 }
282 }
283
284 /* create a new bios memory map, removing overlaps */
285 overlap_entries = 0; /* number of entries in the overlap table */
286 new_bios_entry = 0; /* index for creating new bios map entries */
287 last_type = 0; /* start with undefined memory type */
288 last_addr = 0; /* start with 0 as last starting address */
289
 290 /* loop through change-points, determining effect on the new bios map */
291 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
292 /* keep track of all overlapping bios entries */
293 if (change_point[chgidx]->addr ==
294 change_point[chgidx]->pbios->addr) {
295 /*
296 * add map entry to overlap list (> 1 entry
297 * implies an overlap)
298 */
299 overlap_list[overlap_entries++] =
300 change_point[chgidx]->pbios;
301 } else {
302 /*
303 * remove entry from list (order independent,
304 * so swap with last)
305 */
306 for (i = 0; i < overlap_entries; i++) {
307 if (overlap_list[i] ==
308 change_point[chgidx]->pbios)
309 overlap_list[i] =
310 overlap_list[overlap_entries-1];
311 }
312 overlap_entries--;
313 }
314 /*
315 * if there are overlapping entries, decide which
316 * "type" to use (larger value takes precedence --
317 * 1=usable, 2,3,4,4+=unusable)
318 */
319 current_type = 0;
320 for (i = 0; i < overlap_entries; i++)
321 if (overlap_list[i]->type > current_type)
322 current_type = overlap_list[i]->type;
323 /*
324 * continue building up new bios map based on this
325 * information
326 */
327 if (current_type != last_type) {
328 if (last_type != 0) {
329 new_bios[new_bios_entry].size =
330 change_point[chgidx]->addr - last_addr;
331 /*
332 * move forward only if the new size
333 * was non-zero
334 */
335 if (new_bios[new_bios_entry].size != 0)
336 /*
337 * no more space left for new
338 * bios entries ?
339 */
340 if (++new_bios_entry >= max_nr_map)
341 break;
342 }
343 if (current_type != 0) {
344 new_bios[new_bios_entry].addr =
345 change_point[chgidx]->addr;
346 new_bios[new_bios_entry].type = current_type;
347 last_addr = change_point[chgidx]->addr;
348 }
349 last_type = current_type;
350 }
351 }
352 /* retain count for new bios entries */
353 new_nr = new_bios_entry;
354
355 /* copy new bios mapping into original location */
356 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
357 *pnr_map = new_nr;
358
359 return 0;
360}
361
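
For intuition about the change-point machinery, here is a compact user-space model of the sweep: all start and end addresses are sorted, a running set of active entries is tracked across them, and the highest active type wins. It simplifies the function above (a per-type delta counter instead of overlap_list, an invented two-entry map), so treat it as a sketch, not a replacement:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct ent { uint64_t addr, size; unsigned type; };
struct cp { uint64_t addr; int delta; unsigned type; };

static int cmp(const void *a, const void *b)
{
	const struct cp *x = a, *y = b;

	return (x->addr > y->addr) - (x->addr < y->addr);
}

int main(void)
{
	/* "RAM" at 0-8K overlapped by "reserved" at 4K-12K */
	struct ent in[2] = { { 0x0000, 0x2000, 1 }, { 0x1000, 0x2000, 2 } };
	struct cp cps[4];
	int active[8] = { 0 };
	uint64_t last = 0;
	unsigned last_type = 0;
	int i;

	for (i = 0; i < 2; i++) {
		cps[2 * i] = (struct cp){ in[i].addr, +1, in[i].type };
		cps[2 * i + 1] =
			(struct cp){ in[i].addr + in[i].size, -1, in[i].type };
	}
	qsort(cps, 4, sizeof(cps[0]), cmp);

	for (i = 0; i < 4; i++) {
		unsigned t, cur = 0;

		active[cps[i].type] += cps[i].delta;
		for (t = 1; t < 8; t++)		/* highest active type wins */
			if (active[t] > 0)
				cur = t;
		if (cur != last_type) {
			if (last_type)		/* close the previous segment */
				printf("%#llx-%#llx type %u\n",
				       (unsigned long long)last,
				       (unsigned long long)cps[i].addr,
				       last_type);
			last = cps[i].addr;
			last_type = cur;
		}
	}
	/* prints "0-0x1000 type 1" then "0x1000-0x3000 type 2" */
	return 0;
}
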
362/*
363 * Copy the BIOS e820 map into a safe place.
364 *
365 * Sanity-check it while we're at it..
366 *
367 * If we're lucky and live on a modern system, the setup code
368 * will have given us a memory map that we can use to properly
369 * set up memory. If we aren't, we'll fake a memory map.
370 */
371int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
372{
373 /* Only one memory region (or negative)? Ignore it */
374 if (nr_map < 2)
375 return -1;
376
377 do {
378 u64 start = biosmap->addr;
379 u64 size = biosmap->size;
380 u64 end = start + size;
381 u32 type = biosmap->type;
382
383 /* Overflow in 64 bits? Ignore the memory map. */
384 if (start > end)
385 return -1;
386
387 add_memory_region(start, size, type);
388 } while (biosmap++, --nr_map);
389 return 0;
390}
391
392u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
393 unsigned new_type)
394{
395 int i;
396 u64 real_updated_size = 0;
397
398 BUG_ON(old_type == new_type);
399
400 for (i = 0; i < e820.nr_map; i++) {
401 struct e820entry *ei = &e820.map[i];
402 u64 final_start, final_end;
403 if (ei->type != old_type)
404 continue;
405 /* totally covered? */
406 if (ei->addr >= start &&
407 (ei->addr + ei->size) <= (start + size)) {
408 ei->type = new_type;
409 real_updated_size += ei->size;
410 continue;
411 }
412 /* partially covered */
413 final_start = max(start, ei->addr);
414 final_end = min(start + size, ei->addr + ei->size);
415 if (final_start >= final_end)
416 continue;
417 add_memory_region(final_start, final_end - final_start,
418 new_type);
419 real_updated_size += final_end - final_start;
420 }
421 return real_updated_size;
422}
423
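
update_memory_range() handles partially-covered entries by clipping to the overlap and appending a new entry of the new type. The clipping step in isolation, with invented numbers:

#include <stdio.h>
#include <stdint.h>

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	/* one 16K "RAM" entry; convert 12K-20K of it to a new type */
	uint64_t ei_addr = 0x0000, ei_size = 0x4000;
	uint64_t start = 0x3000, size = 0x2000;

	/* the partially-covered case: clip to the overlap */
	uint64_t final_start = max64(start, ei_addr);
	uint64_t final_end = min64(start + size, ei_addr + ei_size);

	if (final_start < final_end)
		/* prints "new entry 0x3000 + 0x1000" */
		printf("new entry %#llx + %#llx\n",
		       (unsigned long long)final_start,
		       (unsigned long long)(final_end - final_start));
	return 0;
}
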
424void __init update_e820(void)
425{
426 int nr_map;
427
428 nr_map = e820.nr_map;
429 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
430 return;
431 e820.nr_map = nr_map;
432 printk(KERN_INFO "modified physical RAM map:\n");
433 e820_print_map("modified");
434}
435
436/*
437 * Search for the biggest gap in the low 32 bits of the e820
438 * memory space. We pass this space to PCI to assign MMIO resources
 439 * for hotplug or unconfigured devices.
 440 * Hopefully the BIOS left enough space.
441 */
442__init void e820_setup_gap(void)
443{
444 unsigned long gapstart, gapsize, round;
445 unsigned long long last;
446 int i;
447 int found = 0;
448
449 last = 0x100000000ull;
450 gapstart = 0x10000000;
451 gapsize = 0x400000;
452 i = e820.nr_map;
453 while (--i >= 0) {
454 unsigned long long start = e820.map[i].addr;
455 unsigned long long end = start + e820.map[i].size;
456
457 /*
458 * Since "last" is at most 4GB, we know we'll
459 * fit in 32 bits if this condition is true
460 */
461 if (last > end) {
462 unsigned long gap = last - end;
463
464 if (gap > gapsize) {
465 gapsize = gap;
466 gapstart = end;
467 found = 1;
468 }
469 }
470 if (start < last)
471 last = start;
472 }
473
474#ifdef CONFIG_X86_64
475 if (!found) {
476 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
477 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
478 "address range\n"
479 KERN_ERR "PCI: Unassigned devices with 32bit resource "
480 "registers may break!\n");
481 }
482#endif
483
484 /*
485 * See how much we want to round up: start off with
486 * rounding to the next 1MB area.
487 */
488 round = 0x100000;
489 while ((gapsize >> 4) > round)
490 round += round;
491 /* Fun with two's complement */
492 pci_mem_start = (gapstart + round) & -round;
493
494 printk(KERN_INFO
495 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
496 pci_mem_start, gapstart, gapsize);
497}
498
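
The "fun with two's complement" line works because -round, for a power-of-two round, is exactly the mask that clears the low bits, so adding round and masking rounds gapstart up to the next round-aligned address. In isolation, with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xc3f00000UL, round = 0x100000UL;

	/* -round clears the low bits; prints "0xc4000000" */
	printf("%#lx\n", (gapstart + round) & -round);
	return 0;
}
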
499#if defined(CONFIG_X86_64) || \
500 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
501/**
502 * Find the ranges of physical addresses that do not correspond to
503 * e820 RAM areas and mark the corresponding pages as nosave for
504 * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
505 *
506 * This function requires the e820 map to be sorted and without any
507 * overlapping entries and assumes the first e820 area to be RAM.
508 */
509void __init e820_mark_nosave_regions(unsigned long limit_pfn)
510{
511 int i;
512 unsigned long pfn;
513
514 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
515 for (i = 1; i < e820.nr_map; i++) {
516 struct e820entry *ei = &e820.map[i];
517
518 if (pfn < PFN_UP(ei->addr))
519 register_nosave_region(pfn, PFN_UP(ei->addr));
520
521 pfn = PFN_DOWN(ei->addr + ei->size);
522 if (ei->type != E820_RAM)
523 register_nosave_region(PFN_UP(ei->addr), pfn);
524
525 if (pfn >= limit_pfn)
526 break;
527 }
528}
529#endif
530
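
The rounding directions above matter: a gap between RAM areas starts at PFN_DOWN() of the previous end and stops at PFN_UP() of the next start, so partially-covered pages end up in the nosave set. A small model of the two macros, assuming 4 KiB pages and an invented address:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	uint64_t addr = 0x9fc00;	/* invented, not page aligned */

	/* prints "down 0x9f up 0xa0" */
	printf("down %#llx up %#llx\n",
	       (unsigned long long)PFN_DOWN(addr),
	       (unsigned long long)PFN_UP(addr));
	return 0;
}
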
531/*
532 * Early reserved memory areas.
533 */
534#define MAX_EARLY_RES 20
535
536struct early_res {
537 u64 start, end;
538 char name[16];
539};
540static struct early_res early_res[MAX_EARLY_RES] __initdata = {
541 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
542#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
543 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
544#endif
545#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
546 /*
547 * But first pinch a few for the stack/trampoline stuff
548 * FIXME: Don't need the extra page at 4K, but need to fix
549 * trampoline before removing it. (see the GDT stuff)
550 */
551 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
552 /*
553 * Has to be in very low memory so we can execute
554 * real-mode AP code.
555 */
556 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
557#endif
558 {}
559};
560
561static int __init find_overlapped_early(u64 start, u64 end)
562{
563 int i;
564 struct early_res *r;
565
566 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
567 r = &early_res[i];
568 if (end > r->start && start < r->end)
569 break;
570 }
571
572 return i;
573}
574
575void __init reserve_early(u64 start, u64 end, char *name)
576{
577 int i;
578 struct early_res *r;
579
580 i = find_overlapped_early(start, end);
581 if (i >= MAX_EARLY_RES)
582 panic("Too many early reservations");
583 r = &early_res[i];
584 if (r->end)
585 panic("Overlapping early reservations "
586 "%llx-%llx %s to %llx-%llx %s\n",
587 start, end - 1, name?name:"", r->start,
588 r->end - 1, r->name);
589 r->start = start;
590 r->end = end;
591 if (name)
592 strncpy(r->name, name, sizeof(r->name) - 1);
593}
594
595void __init free_early(u64 start, u64 end)
596{
597 struct early_res *r;
598 int i, j;
599
600 i = find_overlapped_early(start, end);
601 r = &early_res[i];
602 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
603 panic("free_early on not reserved area: %llx-%llx!",
604 start, end - 1);
605
606 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
607 ;
608
609 memmove(&early_res[i], &early_res[i + 1],
610 (j - 1 - i) * sizeof(struct early_res));
611
612 early_res[j - 1].end = 0;
613}
614
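
free_early() compacts the table with one memmove() and clears the vacated tail slot. The same idiom on a plain array, with invented reservations:

#include <stdio.h>
#include <string.h>

struct res { unsigned long start, end; };

int main(void)
{
	struct res r[4] = { { 0, 1 }, { 2, 3 }, { 4, 5 } };	/* invented */
	int i = 1, j = 3;	/* free r[1]; j = first unused slot */

	memmove(&r[i], &r[i + 1], (j - 1 - i) * sizeof(struct res));
	r[j - 1].end = 0;	/* clear the vacated tail slot */

	/* prints "0-1 4-5" */
	printf("%lu-%lu %lu-%lu\n",
	       r[0].start, r[0].end, r[1].start, r[1].end);
	return 0;
}
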
615int __init page_is_reserved_early(unsigned long pagenr)
616{
617 u64 start = (u64)pagenr << PAGE_SHIFT;
618 int i;
619 struct early_res *r;
620
621 i = find_overlapped_early(start, start + PAGE_SIZE);
622 r = &early_res[i];
623 return (i < MAX_EARLY_RES && r->end);
624}
625
626void __init early_res_to_bootmem(u64 start, u64 end)
627{
628 int i;
629 u64 final_start, final_end;
630 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
631 struct early_res *r = &early_res[i];
632 final_start = max(start, r->start);
633 final_end = min(end, r->end);
634 if (final_start >= final_end)
635 continue;
636 printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i,
637 final_start, final_end - 1, r->name);
638#ifdef CONFIG_X86_64
639 reserve_bootmem_generic(final_start, final_end - final_start);
640#else
641 reserve_bootmem(final_start, final_end - final_start,
642 BOOTMEM_DEFAULT);
643#endif
644 }
645}
646
647/* Check for already reserved areas */
648static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
649{
650 int i;
651 u64 addr = *addrp;
652 int changed = 0;
653 struct early_res *r;
654again:
655 i = find_overlapped_early(addr, addr + size);
656 r = &early_res[i];
657 if (i < MAX_EARLY_RES && r->end) {
658 *addrp = addr = round_up(r->end, align);
659 changed = 1;
660 goto again;
661 }
662 return changed;
663}
664
665/* Check for already reserved areas */
666static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
667{
668 int i;
669 u64 addr = *addrp, last;
670 u64 size = *sizep;
671 int changed = 0;
672again:
673 last = addr + size;
674 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
675 struct early_res *r = &early_res[i];
676 if (last > r->start && addr < r->start) {
677 size = r->start - addr;
678 changed = 1;
679 goto again;
680 }
681 if (last > r->end && addr < r->end) {
682 addr = round_up(r->end, align);
683 size = last - addr;
684 changed = 1;
685 goto again;
686 }
687 if (last <= r->end && addr >= r->start) {
688 (*sizep)++;
689 return 0;
690 }
691 }
692 if (changed) {
693 *addrp = addr;
694 *sizep = size;
695 }
696 return changed;
697}
698
699/*
700 * Find a free area with specified alignment in a specific range.
701 */
702u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
703{
704 int i;
705
706 for (i = 0; i < e820.nr_map; i++) {
707 struct e820entry *ei = &e820.map[i];
708 u64 addr, last;
709 u64 ei_last;
710
711 if (ei->type != E820_RAM)
712 continue;
713 addr = round_up(ei->addr, align);
714 ei_last = ei->addr + ei->size;
715 if (addr < start)
716 addr = round_up(start, align);
717 if (addr >= ei_last)
718 continue;
719 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
720 ;
721 last = addr + size;
722 if (last > ei_last)
723 continue;
724 if (last > end)
725 continue;
726 return addr;
727 }
728 return -1ULL;
729}
730
731/*
732 * Find next free range after *start
733 */
734u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
735{
736 int i;
737
738 for (i = 0; i < e820.nr_map; i++) {
739 struct e820entry *ei = &e820.map[i];
740 u64 addr, last;
741 u64 ei_last;
742
743 if (ei->type != E820_RAM)
744 continue;
745 addr = round_up(ei->addr, align);
746 ei_last = ei->addr + ei->size;
747 if (addr < start)
748 addr = round_up(start, align);
749 if (addr >= ei_last)
750 continue;
751 *sizep = ei_last - addr;
752 while (bad_addr_size(&addr, sizep, align) &&
753 addr + *sizep <= ei_last)
754 ;
755 last = addr + *sizep;
756 if (last > ei_last)
757 continue;
758 return addr;
759 }
 760 return -1ULL;
761
762}
763
764/*
 765 * pre-allocate 4K and reserve it in the e820 map
766 */
767u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
768{
769 u64 size = 0;
770 u64 addr;
771 u64 start;
772
773 start = startt;
774 while (size < sizet)
775 start = find_e820_area_size(start, &size, align);
776
777 if (size < sizet)
778 return 0;
779
780 addr = round_down(start + size - sizet, align);
781 update_memory_range(addr, sizet, E820_RAM, E820_RESERVED);
782 printk(KERN_INFO "update e820 for early_reserve_e820\n");
783 update_e820();
784
785 return addr;
786}
787
788#ifdef CONFIG_X86_32
789# ifdef CONFIG_X86_PAE
790# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
791# else
792# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
793# endif
794#else /* CONFIG_X86_32 */
795# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
796#endif
797
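
For concreteness, what these limits evaluate to with 4 KiB pages (a sketch; PAGE_SHIFT of 12 assumed):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long pae = 1ULL << (36 - PAGE_SHIFT);
	unsigned long long nopae = 1ULL << (32 - PAGE_SHIFT);

	/* prints "PAE: 16777216 pfns (64 GiB), no PAE: 1048576 pfns (4 GiB)" */
	printf("PAE: %llu pfns (%llu GiB), no PAE: %llu pfns (%llu GiB)\n",
	       pae, pae >> (30 - PAGE_SHIFT),
	       nopae, nopae >> (30 - PAGE_SHIFT));
	return 0;
}
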
798/*
799 * Last pfn which the user wants to use.
800 */
801unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
802
803/*
804 * Find the highest page frame number we have available
805 */
806unsigned long __init e820_end_of_ram(void)
807{
808 unsigned long last_pfn;
809 unsigned long max_arch_pfn = MAX_ARCH_PFN;
810
811 last_pfn = find_max_pfn_with_active_regions();
812
813 if (last_pfn > max_arch_pfn)
814 last_pfn = max_arch_pfn;
815 if (last_pfn > end_user_pfn)
816 last_pfn = end_user_pfn;
817
818 printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n",
819 last_pfn, max_arch_pfn);
820 return last_pfn;
821}
822
823/*
824 * Finds an active region in the address range from start_pfn to last_pfn and
825 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
826 */
827int __init e820_find_active_region(const struct e820entry *ei,
828 unsigned long start_pfn,
829 unsigned long last_pfn,
830 unsigned long *ei_startpfn,
831 unsigned long *ei_endpfn)
832{
833 u64 align = PAGE_SIZE;
834
835 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
836 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
837
838 /* Skip map entries smaller than a page */
839 if (*ei_startpfn >= *ei_endpfn)
840 return 0;
841
842 /* Skip if map is outside the node */
843 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
844 *ei_startpfn >= last_pfn)
845 return 0;
846
847 /* Check for overlaps */
848 if (*ei_startpfn < start_pfn)
849 *ei_startpfn = start_pfn;
850 if (*ei_endpfn > last_pfn)
851 *ei_endpfn = last_pfn;
852
853 /* Obey end_user_pfn to save on memmap */
854 if (*ei_startpfn >= end_user_pfn)
855 return 0;
856 if (*ei_endpfn > end_user_pfn)
857 *ei_endpfn = end_user_pfn;
858
859 return 1;
860}
861
862/* Walk the e820 map and register active regions within a node */
863void __init e820_register_active_regions(int nid, unsigned long start_pfn,
864 unsigned long last_pfn)
865{
866 unsigned long ei_startpfn;
867 unsigned long ei_endpfn;
868 int i;
869
870 for (i = 0; i < e820.nr_map; i++)
871 if (e820_find_active_region(&e820.map[i],
872 start_pfn, last_pfn,
873 &ei_startpfn, &ei_endpfn))
874 add_active_range(nid, ei_startpfn, ei_endpfn);
875}
876
877/*
878 * Find the hole size (in bytes) in the memory range.
879 * @start: starting address of the memory range to scan
880 * @end: ending address of the memory range to scan
881 */
882u64 __init e820_hole_size(u64 start, u64 end)
883{
884 unsigned long start_pfn = start >> PAGE_SHIFT;
885 unsigned long last_pfn = end >> PAGE_SHIFT;
886 unsigned long ei_startpfn, ei_endpfn, ram = 0;
887 int i;
888
889 for (i = 0; i < e820.nr_map; i++) {
890 if (e820_find_active_region(&e820.map[i],
891 start_pfn, last_pfn,
892 &ei_startpfn, &ei_endpfn))
893 ram += ei_endpfn - ei_startpfn;
894 }
895 return end - start - ((u64)ram << PAGE_SHIFT);
896}
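
e820_hole_size() counts the RAM pfns that active regions contribute inside the range and calls everything else hole. The closing arithmetic stand-alone, with invented numbers:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t start = 0, end = 16ULL << 20;	/* scan 0-16 MiB */
	uint64_t ram_pfns = 0x0ff0;		/* invented RAM pfn count */

	/* 0x1000 pfns total - 0x0ff0 RAM pfns; prints "hole: 65536 bytes" */
	printf("hole: %llu bytes\n",
	       (unsigned long long)(end - start - (ram_pfns << PAGE_SHIFT)));
	return 0;
}
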
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index ed733e7cf4e6..e8a3b968c9fa 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -9,29 +9,12 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/pfn.h> 10#include <linux/pfn.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/suspend.h>
13 12
14#include <asm/pgtable.h> 13#include <asm/pgtable.h>
15#include <asm/page.h> 14#include <asm/page.h>
16#include <asm/e820.h> 15#include <asm/e820.h>
17#include <asm/setup.h> 16#include <asm/setup.h>
18 17
19struct e820map e820;
20struct change_member {
21 struct e820entry *pbios; /* pointer to original bios entry */
22 unsigned long long addr; /* address for this change point */
23};
24static struct change_member change_point_list[2*E820MAX] __initdata;
25static struct change_member *change_point[2*E820MAX] __initdata;
26static struct e820entry *overlap_list[E820MAX] __initdata;
27static struct e820entry new_bios[E820MAX] __initdata;
28/* For PCI or other memory-mapped resources */
29unsigned long pci_mem_start = 0x10000000;
30#ifdef CONFIG_PCI
31EXPORT_SYMBOL(pci_mem_start);
32#endif
33extern int user_defined_memmap;
34
35static struct resource system_rom_resource = { 18static struct resource system_rom_resource = {
36 .name = "System ROM", 19 .name = "System ROM",
37 .start = 0xf0000, 20 .start = 0xf0000,
@@ -224,398 +207,12 @@ void __init init_iomem_resources(struct resource *code_resource,
224 } 207 }
225} 208}
226 209
227#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
228/**
229 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
230 * correspond to e820 RAM areas and mark the corresponding pages as nosave for
231 * hibernation.
232 *
233 * This function requires the e820 map to be sorted and without any
234 * overlapping entries and assumes the first e820 area to be RAM.
235 */
236void __init e820_mark_nosave_regions(void)
237{
238 int i;
239 unsigned long pfn;
240
241 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
242 for (i = 1; i < e820.nr_map; i++) {
243 struct e820entry *ei = &e820.map[i];
244
245 if (pfn < PFN_UP(ei->addr))
246 register_nosave_region(pfn, PFN_UP(ei->addr));
247
248 pfn = PFN_DOWN(ei->addr + ei->size);
249 if (ei->type != E820_RAM)
250 register_nosave_region(PFN_UP(ei->addr), pfn);
251
252 if (pfn >= max_low_pfn)
253 break;
254 }
255}
256#endif
257
258void __init add_memory_region(unsigned long long start,
259 unsigned long long size, int type)
260{
261 int x;
262
263 x = e820.nr_map;
264
265 if (x == E820MAX) {
266 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
267 return;
268 }
269
270 e820.map[x].addr = start;
271 e820.map[x].size = size;
272 e820.map[x].type = type;
273 e820.nr_map++;
274} /* add_memory_region */
275
276/*
277 * Sanitize the BIOS e820 map.
278 *
279 * Some e820 responses include overlapping entries. The following
280 * replaces the original e820 map with a new one, removing overlaps.
281 *
282 */
283int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
284{
285 struct change_member *change_tmp;
286 unsigned long current_type, last_type;
287 unsigned long long last_addr;
288 int chgidx, still_changing;
289 int overlap_entries;
290 int new_bios_entry;
291 int old_nr, new_nr, chg_nr;
292 int i;
293
294 /*
295 Visually we're performing the following (1,2,3,4 = memory types)...
296
297 Sample memory map (w/overlaps):
298 ____22__________________
299 ______________________4_
300 ____1111________________
301 _44_____________________
302 11111111________________
303 ____________________33__
304 ___________44___________
305 __________33333_________
306 ______________22________
307 ___________________2222_
308 _________111111111______
309 _____________________11_
310 _________________4______
311
312 Sanitized equivalent (no overlap):
313 1_______________________
314 _44_____________________
315 ___1____________________
316 ____22__________________
317 ______11________________
318 _________1______________
319 __________3_____________
320 ___________44___________
321 _____________33_________
322 _______________2________
323 ________________1_______
324 _________________4______
325 ___________________2____
326 ____________________33__
327 ______________________4_
328 */
329 /* if there's only one memory region, don't bother */
330 if (*pnr_map < 2) {
331 return -1;
332 }
333
334 old_nr = *pnr_map;
335
336 /* bail out if we find any unreasonable addresses in bios map */
337 for (i=0; i<old_nr; i++)
338 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
339 return -1;
340 }
341
342 /* create pointers for initial change-point information (for sorting) */
343 for (i=0; i < 2*old_nr; i++)
344 change_point[i] = &change_point_list[i];
345
346 /* record all known change-points (starting and ending addresses),
347 omitting those that are for empty memory regions */
348 chgidx = 0;
349 for (i=0; i < old_nr; i++) {
350 if (biosmap[i].size != 0) {
351 change_point[chgidx]->addr = biosmap[i].addr;
352 change_point[chgidx++]->pbios = &biosmap[i];
353 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
354 change_point[chgidx++]->pbios = &biosmap[i];
355 }
356 }
357 chg_nr = chgidx; /* true number of change-points */
358
359 /* sort change-point list by memory addresses (low -> high) */
360 still_changing = 1;
361 while (still_changing) {
362 still_changing = 0;
363 for (i=1; i < chg_nr; i++) {
364 /* if <current_addr> > <last_addr>, swap */
365 /* or, if current=<start_addr> & last=<end_addr>, swap */
366 if ((change_point[i]->addr < change_point[i-1]->addr) ||
367 ((change_point[i]->addr == change_point[i-1]->addr) &&
368 (change_point[i]->addr == change_point[i]->pbios->addr) &&
369 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
370 )
371 {
372 change_tmp = change_point[i];
373 change_point[i] = change_point[i-1];
374 change_point[i-1] = change_tmp;
375 still_changing=1;
376 }
377 }
378 }
379
380 /* create a new bios memory map, removing overlaps */
381 overlap_entries=0; /* number of entries in the overlap table */
382 new_bios_entry=0; /* index for creating new bios map entries */
383 last_type = 0; /* start with undefined memory type */
384 last_addr = 0; /* start with 0 as last starting address */
385 /* loop through change-points, determining affect on the new bios map */
386 for (chgidx=0; chgidx < chg_nr; chgidx++)
387 {
388 /* keep track of all overlapping bios entries */
389 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
390 {
391 /* add map entry to overlap list (> 1 entry implies an overlap) */
392 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
393 }
394 else
395 {
396 /* remove entry from list (order independent, so swap with last) */
397 for (i=0; i<overlap_entries; i++)
398 {
399 if (overlap_list[i] == change_point[chgidx]->pbios)
400 overlap_list[i] = overlap_list[overlap_entries-1];
401 }
402 overlap_entries--;
403 }
404 /* if there are overlapping entries, decide which "type" to use */
405 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
406 current_type = 0;
407 for (i=0; i<overlap_entries; i++)
408 if (overlap_list[i]->type > current_type)
409 current_type = overlap_list[i]->type;
410 /* continue building up new bios map based on this information */
411 if (current_type != last_type) {
412 if (last_type != 0) {
413 new_bios[new_bios_entry].size =
414 change_point[chgidx]->addr - last_addr;
415 /* move forward only if the new size was non-zero */
416 if (new_bios[new_bios_entry].size != 0)
417 if (++new_bios_entry >= E820MAX)
418 break; /* no more space left for new bios entries */
419 }
420 if (current_type != 0) {
421 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
422 new_bios[new_bios_entry].type = current_type;
423 last_addr=change_point[chgidx]->addr;
424 }
425 last_type = current_type;
426 }
427 }
428 new_nr = new_bios_entry; /* retain count for new bios entries */
429
430 /* copy new bios mapping into original location */
431 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
432 *pnr_map = new_nr;
433
434 return 0;
435}
436
437/*
438 * Copy the BIOS e820 map into a safe place.
439 *
440 * Sanity-check it while we're at it..
441 *
442 * If we're lucky and live on a modern system, the setup code
443 * will have given us a memory map that we can use to properly
444 * set up memory. If we aren't, we'll fake a memory map.
445 *
446 * We check to see that the memory map contains at least 2 elements
447 * before we'll use it, because the detection code in setup.S may
448 * not be perfect and most every PC known to man has two memory
449 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
450 * thinkpad 560x, for example, does not cooperate with the memory
451 * detection code.)
452 */
453int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
454{
455 /* Only one memory region (or negative)? Ignore it */
456 if (nr_map < 2)
457 return -1;
458
459 do {
460 u64 start = biosmap->addr;
461 u64 size = biosmap->size;
462 u64 end = start + size;
463 u32 type = biosmap->type;
464
465 /* Overflow in 64 bits? Ignore the memory map. */
466 if (start > end)
467 return -1;
468
469 add_memory_region(start, size, type);
470 } while (biosmap++, --nr_map);
471
472 return 0;
473}
474
475/*
476 * Find the highest page frame number we have available
477 */
478void __init propagate_e820_map(void)
479{
480 int i;
481
482 max_pfn = 0;
483
484 for (i = 0; i < e820.nr_map; i++) {
485 unsigned long start, end;
486 /* RAM? */
487 if (e820.map[i].type != E820_RAM)
488 continue;
489 start = PFN_UP(e820.map[i].addr);
490 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
491 if (start >= end)
492 continue;
493 if (end > max_pfn)
494 max_pfn = end;
495 memory_present(0, start, end);
496 }
497}
498
499/*
500 * Register fully available low RAM pages with the bootmem allocator.
501 */
502void __init register_bootmem_low_pages(unsigned long max_low_pfn)
503{
504 int i;
505
506 for (i = 0; i < e820.nr_map; i++) {
507 unsigned long curr_pfn, last_pfn, size;
508 /*
509 * Reserve usable low memory
510 */
511 if (e820.map[i].type != E820_RAM)
512 continue;
513 /*
514 * We are rounding up the start address of usable memory:
515 */
516 curr_pfn = PFN_UP(e820.map[i].addr);
517 if (curr_pfn >= max_low_pfn)
518 continue;
519 /*
520 * ... and at the end of the usable range downwards:
521 */
522 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
523
524 if (last_pfn > max_low_pfn)
525 last_pfn = max_low_pfn;
526
527 /*
528 * .. finally, did all the rounding and playing
529 * around just make the area go away?
530 */
531 if (last_pfn <= curr_pfn)
532 continue;
533
534 size = last_pfn - curr_pfn;
535 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
536 }
537}
538
539void __init e820_register_memory(void)
540{
541 unsigned long gapstart, gapsize, round;
542 unsigned long long last;
543 int i;
544
545 /*
546 * Search for the biggest gap in the low 32 bits of the e820
547 * memory space.
548 */
549 last = 0x100000000ull;
550 gapstart = 0x10000000;
551 gapsize = 0x400000;
552 i = e820.nr_map;
553 while (--i >= 0) {
554 unsigned long long start = e820.map[i].addr;
555 unsigned long long end = start + e820.map[i].size;
556
557 /*
558 * Since "last" is at most 4GB, we know we'll
559 * fit in 32 bits if this condition is true
560 */
561 if (last > end) {
562 unsigned long gap = last - end;
563
564 if (gap > gapsize) {
565 gapsize = gap;
566 gapstart = end;
567 }
568 }
569 if (start < last)
570 last = start;
571 }
572
573 /*
574 * See how much we want to round up: start off with
575 * rounding to the next 1MB area.
576 */
577 round = 0x100000;
578 while ((gapsize >> 4) > round)
579 round += round;
580 /* Fun with two's complement */
581 pci_mem_start = (gapstart + round) & -round;
582
583 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
584 pci_mem_start, gapstart, gapsize);
585}
586
587void __init print_memory_map(char *who)
588{
589 int i;
590
591 for (i = 0; i < e820.nr_map; i++) {
592 printk(" %s: %016Lx - %016Lx ", who,
593 e820.map[i].addr,
594 e820.map[i].addr + e820.map[i].size);
595 switch (e820.map[i].type) {
596 case E820_RAM: printk("(usable)\n");
597 break;
598 case E820_RESERVED:
599 printk("(reserved)\n");
600 break;
601 case E820_ACPI:
602 printk("(ACPI data)\n");
603 break;
604 case E820_NVS:
605 printk("(ACPI NVS)\n");
606 break;
607 default: printk("type %u\n", e820.map[i].type);
608 break;
609 }
610 }
611}
612
613void __init limit_regions(unsigned long long size) 210void __init limit_regions(unsigned long long size)
614{ 211{
615 unsigned long long current_addr; 212 unsigned long long current_addr;
616 int i; 213 int i;
617 214
618 print_memory_map("limit_regions start"); 215 e820_print_map("limit_regions start");
619 for (i = 0; i < e820.nr_map; i++) { 216 for (i = 0; i < e820.nr_map; i++) {
620 current_addr = e820.map[i].addr + e820.map[i].size; 217 current_addr = e820.map[i].addr + e820.map[i].size;
621 if (current_addr < size) 218 if (current_addr < size)
@@ -634,63 +231,59 @@ void __init limit_regions(unsigned long long size)
634 e820.nr_map = i + 1; 231 e820.nr_map = i + 1;
635 e820.map[i].size -= current_addr - size; 232 e820.map[i].size -= current_addr - size;
636 } 233 }
637 print_memory_map("limit_regions endfor"); 234 e820_print_map("limit_regions endfor");
638 return; 235 return;
639 } 236 }
640 print_memory_map("limit_regions endfunc"); 237 e820_print_map("limit_regions endfunc");
641} 238}
642 239
643/* 240/* Overridden in paravirt.c if CONFIG_PARAVIRT */
644 * This function checks if any part of the range <start,end> is mapped 241char * __init __attribute__((weak)) memory_setup(void)
645 * with type.
646 */
647int
648e820_any_mapped(u64 start, u64 end, unsigned type)
649{ 242{
650 int i; 243 return machine_specific_memory_setup();
651 for (i = 0; i < e820.nr_map; i++) {
652 const struct e820entry *ei = &e820.map[i];
653 if (type && ei->type != type)
654 continue;
655 if (ei->addr >= end || ei->addr + ei->size <= start)
656 continue;
657 return 1;
658 }
659 return 0;
660} 244}
661EXPORT_SYMBOL_GPL(e820_any_mapped); 245
662 246void __init setup_memory_map(void)
663 /*
664 * This function checks if the entire range <start,end> is mapped with type.
665 *
666 * Note: this function only works correct if the e820 table is sorted and
667 * not-overlapping, which is the case
668 */
669int __init
670e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
671{ 247{
672 u64 start = s; 248 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
673 u64 end = e; 249 e820_print_map(memory_setup());
674 int i; 250}
675 for (i = 0; i < e820.nr_map; i++) { 251
676 struct e820entry *ei = &e820.map[i]; 252static int __initdata user_defined_memmap;
677 if (type && ei->type != type) 253
678 continue; 254/*
679 /* is the region (part) in overlap with the current region ?*/ 255 * "mem=nopentium" disables the 4MB page tables.
680 if (ei->addr >= end || ei->addr + ei->size <= start) 256 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
681 continue; 257 * to <mem>, overriding the bios size.
682 /* if the region is at the beginning of <start,end> we move 258 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
683 * start to the end of the region since it's ok until there 259 * <start> to <start>+<mem>, overriding the bios size.
260 *
261 * HPA tells me bootloaders need to parse mem=, so no new
262 * option should be mem= [also see Documentation/i386/boot.txt]
263 */
264static int __init parse_mem(char *arg)
265{
266 if (!arg)
267 return -EINVAL;
268
269 if (strcmp(arg, "nopentium") == 0) {
270 setup_clear_cpu_cap(X86_FEATURE_PSE);
271 } else {
272 /* If the user specifies memory size, we
273 * limit the BIOS-provided memory map to
274 * that size. exactmap can be used to specify
275 * the exact map. mem=number can be used to
276 * trim the existing memory map.
684 */ 277 */
685 if (ei->addr <= start) 278 unsigned long long mem_size;
686 start = ei->addr + ei->size; 279
687 /* if start is now at or beyond end, we're done, full 280 mem_size = memparse(arg, &arg);
688 * coverage */ 281 limit_regions(mem_size);
689 if (start >= end) 282 user_defined_memmap = 1;
690 return 1; /* we're done */
691 } 283 }
692 return 0; 284 return 0;
693} 285}
286early_param("mem", parse_mem);
694 287
695static int __init parse_memmap(char *arg) 288static int __init parse_memmap(char *arg)
696{ 289{
@@ -704,8 +297,9 @@ static int __init parse_memmap(char *arg)
704 * size before original memory map is 297 * size before original memory map is
705 * reset. 298 * reset.
706 */ 299 */
707 propagate_e820_map(); 300 e820_register_active_regions(0, 0, -1UL);
708 saved_max_pfn = max_pfn; 301 saved_max_pfn = e820_end_of_ram();
302 remove_all_active_ranges();
709#endif 303#endif
710 e820.nr_map = 0; 304 e820.nr_map = 0;
711 user_defined_memmap = 1; 305 user_defined_memmap = 1;
@@ -736,40 +330,12 @@ static int __init parse_memmap(char *arg)
736 return 0; 330 return 0;
737} 331}
738early_param("memmap", parse_memmap); 332early_param("memmap", parse_memmap);
739void __init update_memory_range(u64 start, u64 size, unsigned old_type,
740 unsigned new_type)
741{
742 int i;
743
744 BUG_ON(old_type == new_type);
745 333
746 for (i = 0; i < e820.nr_map; i++) { 334void __init finish_e820_parsing(void)
747 struct e820entry *ei = &e820.map[i]; 335{
748 u64 final_start, final_end; 336 if (user_defined_memmap) {
749 if (ei->type != old_type) 337 printk(KERN_INFO "user-defined physical RAM map:\n");
750 continue; 338 e820_print_map("user");
751 /* totally covered? */
752 if (ei->addr >= start && ei->size <= size) {
753 ei->type = new_type;
754 continue;
755 }
756 /* partially covered */
757 final_start = max(start, ei->addr);
758 final_end = min(start + size, ei->addr + ei->size);
759 if (final_start >= final_end)
760 continue;
761 add_memory_region(final_start, final_end - final_start,
762 new_type);
763 } 339 }
764} 340}
765void __init update_e820(void)
766{
767 u8 nr_map;
768 341
769 nr_map = e820.nr_map;
770 if (sanitize_e820_map(e820.map, &nr_map))
771 return;
772 e820.nr_map = nr_map;
773 printk(KERN_INFO "modified physical RAM map:\n");
774 print_memory_map("modified");
775}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 124480c0008d..0afee2ca0bf8 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -17,8 +17,8 @@
17#include <linux/kexec.h> 17#include <linux/kexec.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/suspend.h>
21#include <linux/pfn.h> 20#include <linux/pfn.h>
21#include <linux/pci.h>
22 22
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/page.h> 24#include <asm/page.h>
@@ -29,8 +29,6 @@
29#include <asm/kdebug.h> 29#include <asm/kdebug.h>
30#include <asm/trampoline.h> 30#include <asm/trampoline.h>
31 31
32struct e820map e820;
33
34/* 32/*
35 * PFN of last memory page. 33 * PFN of last memory page.
36 */ 34 */
@@ -44,285 +42,6 @@ unsigned long end_pfn;
44unsigned long max_pfn_mapped; 42unsigned long max_pfn_mapped;
45 43
46/* 44/*
47 * Last pfn which the user wants to use.
48 */
49static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
50
51/*
52 * Early reserved memory areas.
53 */
54#define MAX_EARLY_RES 20
55
56struct early_res {
57 unsigned long start, end;
58 char name[16];
59};
60static struct early_res early_res[MAX_EARLY_RES] __initdata = {
61 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
62#ifdef CONFIG_X86_TRAMPOLINE
63 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
64#endif
65 {}
66};
67
68void __init reserve_early(unsigned long start, unsigned long end, char *name)
69{
70 int i;
71 struct early_res *r;
72 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
73 r = &early_res[i];
74 if (end > r->start && start < r->end)
75 panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
76 start, end - 1, name?name:"", r->start, r->end - 1, r->name);
77 }
78 if (i >= MAX_EARLY_RES)
79 panic("Too many early reservations");
80 r = &early_res[i];
81 r->start = start;
82 r->end = end;
83 if (name)
84 strncpy(r->name, name, sizeof(r->name) - 1);
85}
86
87void __init free_early(unsigned long start, unsigned long end)
88{
89 struct early_res *r;
90 int i, j;
91
92 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
93 r = &early_res[i];
94 if (start == r->start && end == r->end)
95 break;
96 }
97 if (i >= MAX_EARLY_RES || !early_res[i].end)
98 panic("free_early on not reserved area: %lx-%lx!", start, end);
99
100 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
101 ;
102
103 memmove(&early_res[i], &early_res[i + 1],
104 (j - 1 - i) * sizeof(struct early_res));
105
106 early_res[j - 1].end = 0;
107}
108
109void __init early_res_to_bootmem(unsigned long start, unsigned long end)
110{
111 int i;
112 unsigned long final_start, final_end;
113 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
114 struct early_res *r = &early_res[i];
115 final_start = max(start, r->start);
116 final_end = min(end, r->end);
117 if (final_start >= final_end)
118 continue;
119 printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
120 final_start, final_end - 1, r->name);
121 reserve_bootmem_generic(final_start, final_end - final_start);
122 }
123}
124
125/* Check for already reserved areas */
126static inline int __init
127bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
128{
129 int i;
130 unsigned long addr = *addrp, last;
131 int changed = 0;
132again:
133 last = addr + size;
134 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
135 struct early_res *r = &early_res[i];
136 if (last >= r->start && addr < r->end) {
137 *addrp = addr = round_up(r->end, align);
138 changed = 1;
139 goto again;
140 }
141 }
142 return changed;
143}
144
145/* Check for already reserved areas */
146static inline int __init
147bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
148{
149 int i;
150 unsigned long addr = *addrp, last;
151 unsigned long size = *sizep;
152 int changed = 0;
153again:
154 last = addr + size;
155 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
156 struct early_res *r = &early_res[i];
157 if (last > r->start && addr < r->start) {
158 size = r->start - addr;
159 changed = 1;
160 goto again;
161 }
162 if (last > r->end && addr < r->end) {
163 addr = round_up(r->end, align);
164 size = last - addr;
165 changed = 1;
166 goto again;
167 }
168 if (last <= r->end && addr >= r->start) {
169 (*sizep)++;
170 return 0;
171 }
172 }
173 if (changed) {
174 *addrp = addr;
175 *sizep = size;
176 }
177 return changed;
178}
179/*
180 * This function checks if any part of the range <start,end> is mapped
181 * with type.
182 */
183int
184e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
185{
186 int i;
187
188 for (i = 0; i < e820.nr_map; i++) {
189 struct e820entry *ei = &e820.map[i];
190
191 if (type && ei->type != type)
192 continue;
193 if (ei->addr >= end || ei->addr + ei->size <= start)
194 continue;
195 return 1;
196 }
197 return 0;
198}
199EXPORT_SYMBOL_GPL(e820_any_mapped);
200
201/*
202 * This function checks if the entire range <start,end> is mapped with type.
203 *
204 * Note: this function only works correct if the e820 table is sorted and
205 * not-overlapping, which is the case
206 */
207int __init e820_all_mapped(unsigned long start, unsigned long end,
208 unsigned type)
209{
210 int i;
211
212 for (i = 0; i < e820.nr_map; i++) {
213 struct e820entry *ei = &e820.map[i];
214
215 if (type && ei->type != type)
216 continue;
217 /* is the region (part) in overlap with the current region ?*/
218 if (ei->addr >= end || ei->addr + ei->size <= start)
219 continue;
220
221 /* if the region is at the beginning of <start,end> we move
222 * start to the end of the region since it's ok until there
223 */
224 if (ei->addr <= start)
225 start = ei->addr + ei->size;
226 /*
227 * if start is now at or beyond end, we're done, full
228 * coverage
229 */
230 if (start >= end)
231 return 1;
232 }
233 return 0;
234}
235
236/*
237 * Find a free area with specified alignment in a specific range.
238 */
239unsigned long __init find_e820_area(unsigned long start, unsigned long end,
240 unsigned long size, unsigned long align)
241{
242 int i;
243
244 for (i = 0; i < e820.nr_map; i++) {
245 struct e820entry *ei = &e820.map[i];
246 unsigned long addr, last;
247 unsigned long ei_last;
248
249 if (ei->type != E820_RAM)
250 continue;
251 addr = round_up(ei->addr, align);
252 ei_last = ei->addr + ei->size;
253 if (addr < start)
254 addr = round_up(start, align);
255 if (addr >= ei_last)
256 continue;
257 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
258 ;
259 last = addr + size;
260 if (last > ei_last)
261 continue;
262 if (last > end)
263 continue;
264 return addr;
265 }
266 return -1UL;
267}
268
269/*
270 * Find next free range after *start
271 */
272unsigned long __init find_e820_area_size(unsigned long start,
273 unsigned long *sizep,
274 unsigned long align)
275{
276 int i;
277
278 for (i = 0; i < e820.nr_map; i++) {
279 struct e820entry *ei = &e820.map[i];
280 unsigned long addr, last;
281 unsigned long ei_last;
282
283 if (ei->type != E820_RAM)
284 continue;
285 addr = round_up(ei->addr, align);
286 ei_last = ei->addr + ei->size;
287 if (addr < start)
288 addr = round_up(start, align);
289 if (addr >= ei_last)
290 continue;
291 *sizep = ei_last - addr;
292 while (bad_addr_size(&addr, sizep, align) &&
293 addr + *sizep <= ei_last)
294 ;
295 last = addr + *sizep;
296 if (last > ei_last)
297 continue;
298 return addr;
299 }
300 return -1UL;
301
302}
303/*
304 * Find the highest page frame number we have available
305 */
306unsigned long __init e820_end_of_ram(void)
307{
308 unsigned long end_pfn;
309
310 end_pfn = find_max_pfn_with_active_regions();
311
312 if (end_pfn > max_pfn_mapped)
313 max_pfn_mapped = end_pfn;
314 if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
315 max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
316 if (end_pfn > end_user_pfn)
317 end_pfn = end_user_pfn;
318 if (end_pfn > max_pfn_mapped)
319 end_pfn = max_pfn_mapped;
320
321 printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
322 return end_pfn;
323}
324
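The chain of comparisons above amounts to clamping: max_pfn_mapped may only grow up to MAXMEM, and end_pfn is then the minimum of the highest RAM pfn, the user-supplied limit, and what is mapped. A worked restatement with made-up values (names are illustrative, and it assumes max_pfn_mapped started below the RAM top):

#include <stdio.h>

static unsigned long min2(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long highest = 0x440000;	/* top RAM pfn          */
	unsigned long maxmem  = 0x400000;	/* MAXMEM >> PAGE_SHIFT */
	unsigned long user    = 0x200000;	/* mem= limit, in pfns  */
	unsigned long mapped  = min2(highest, maxmem);
	unsigned long end_pfn = min2(min2(highest, user), mapped);

	printf("end_pfn = %#lx\n", end_pfn);	/* 0x200000: mem= wins */
	return 0;
}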
325/*
326 * Mark e820 reserved areas as busy for the resource manager. 45 * Mark e820 reserved areas as busy for the resource manager.
327 */ 46 */
328void __init e820_reserve_resources(void) 47void __init e820_reserve_resources(void)
@@ -346,393 +65,6 @@ void __init e820_reserve_resources(void)
346 } 65 }
347} 66}
348 67
349/*
350 * Find the ranges of physical addresses that do not correspond to
351 * e820 RAM areas and mark the corresponding pages as nosave for software
352 * suspend and suspend to RAM.
353 *
354 * This function requires the e820 map to be sorted and without any
355 * overlapping entries and assumes the first e820 area to be RAM.
356 */
357void __init e820_mark_nosave_regions(void)
358{
359 int i;
360 unsigned long paddr;
361
362 paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
363 for (i = 1; i < e820.nr_map; i++) {
364 struct e820entry *ei = &e820.map[i];
365
366 if (paddr < ei->addr)
367 register_nosave_region(PFN_DOWN(paddr),
368 PFN_UP(ei->addr));
369
370 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
371 if (ei->type != E820_RAM)
372 register_nosave_region(PFN_UP(ei->addr),
373 PFN_DOWN(paddr));
374
375 if (paddr >= (end_pfn << PAGE_SHIFT))
376 break;
377 }
378}
379
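register_nosave_region() works in page frames, so the byte addresses above are converted with PFN_UP (round up) and PFN_DOWN (round down). A quick standalone illustration of the two conversions, with the macros restated locally and made-up addresses:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long hole_start = 0x9f400;	/* end of a RAM entry  */
	unsigned long hole_end   = 0x100000;	/* start of next entry */

	printf("PFN_DOWN(%#lx)=%#lx PFN_UP(%#lx)=%#lx\n",
	       hole_start, PFN_DOWN(hole_start),
	       hole_start, PFN_UP(hole_start));
	printf("PFN_DOWN(%#lx)=%#lx\n", hole_end, PFN_DOWN(hole_end));
	return 0;
}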
380/*
381 * Finds an active region in the address range from start_pfn to end_pfn and
382 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
383 */
384static int __init e820_find_active_region(const struct e820entry *ei,
385 unsigned long start_pfn,
386 unsigned long end_pfn,
387 unsigned long *ei_startpfn,
388 unsigned long *ei_endpfn)
389{
390 *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
391 *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
392
393 /* Skip map entries smaller than a page */
394 if (*ei_startpfn >= *ei_endpfn)
395 return 0;
396
397 /* Check if max_pfn_mapped should be updated */
398 if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
399 max_pfn_mapped = *ei_endpfn;
400
401 /* Skip if map is outside the node */
402 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
403 *ei_startpfn >= end_pfn)
404 return 0;
405
406 /* Check for overlaps */
407 if (*ei_startpfn < start_pfn)
408 *ei_startpfn = start_pfn;
409 if (*ei_endpfn > end_pfn)
410 *ei_endpfn = end_pfn;
411
412 /* Obey end_user_pfn to save on memmap */
413 if (*ei_startpfn >= end_user_pfn)
414 return 0;
415 if (*ei_endpfn > end_user_pfn)
416 *ei_endpfn = end_user_pfn;
417
418 return 1;
419}
420
421/* Walk the e820 map and register active regions within a node */
422void __init
423e820_register_active_regions(int nid, unsigned long start_pfn,
424 unsigned long end_pfn)
425{
426 unsigned long ei_startpfn;
427 unsigned long ei_endpfn;
428 int i;
429
430 for (i = 0; i < e820.nr_map; i++)
431 if (e820_find_active_region(&e820.map[i],
432 start_pfn, end_pfn,
433 &ei_startpfn, &ei_endpfn))
434 add_active_range(nid, ei_startpfn, ei_endpfn);
435}
436
437/*
438 * Add a memory region to the kernel e820 map.
439 */
440void __init add_memory_region(unsigned long start, unsigned long size, int type)
441{
442 int x = e820.nr_map;
443
444 if (x == E820MAX) {
445 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
446 return;
447 }
448
449 e820.map[x].addr = start;
450 e820.map[x].size = size;
451 e820.map[x].type = type;
452 e820.nr_map++;
453}
454
455/*
456 * Find the hole size (in bytes) in the memory range.
457 * @start: starting address of the memory range to scan
458 * @end: ending address of the memory range to scan
459 */
460unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
461{
462 unsigned long start_pfn = start >> PAGE_SHIFT;
463 unsigned long end_pfn = end >> PAGE_SHIFT;
464 unsigned long ei_startpfn, ei_endpfn, ram = 0;
465 int i;
466
467 for (i = 0; i < e820.nr_map; i++) {
468 if (e820_find_active_region(&e820.map[i],
469 start_pfn, end_pfn,
470 &ei_startpfn, &ei_endpfn))
471 ram += ei_endpfn - ei_startpfn;
472 }
473 return end - start - (ram << PAGE_SHIFT);
474}
475
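In other words, the hole size is just the window length minus the RAM found inside it, converted back from pages to bytes. A worked example with made-up numbers:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long start = 0x0, end = 0x1000000;	/* 16 MiB window */
	unsigned long ram_pfns = 0xe00;			/* 14 MiB of RAM */

	printf("hole = %#lx bytes\n",
	       end - start - (ram_pfns << PAGE_SHIFT));	/* 0x200000 */
	return 0;
}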
476static void __init e820_print_map(char *who)
477{
478 int i;
479
480 for (i = 0; i < e820.nr_map; i++) {
481 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
482 (unsigned long long) e820.map[i].addr,
483 (unsigned long long)
484 (e820.map[i].addr + e820.map[i].size));
485 switch (e820.map[i].type) {
486 case E820_RAM:
487 printk(KERN_CONT "(usable)\n");
488 break;
489 case E820_RESERVED:
490 printk(KERN_CONT "(reserved)\n");
491 break;
492 case E820_ACPI:
493 printk(KERN_CONT "(ACPI data)\n");
494 break;
495 case E820_NVS:
496 printk(KERN_CONT "(ACPI NVS)\n");
497 break;
498 default:
499 printk(KERN_CONT "type %u\n", e820.map[i].type);
500 break;
501 }
502 }
503}
504
505/*
506 * Sanitize the BIOS e820 map.
507 *
508 * Some e820 responses include overlapping entries. The following
509 * replaces the original e820 map with a new one, removing overlaps.
510 *
511 */
512static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
513{
514 struct change_member {
515 struct e820entry *pbios; /* pointer to original bios entry */
516 unsigned long long addr; /* address for this change point */
517 };
518 static struct change_member change_point_list[2*E820MAX] __initdata;
519 static struct change_member *change_point[2*E820MAX] __initdata;
520 static struct e820entry *overlap_list[E820MAX] __initdata;
521 static struct e820entry new_bios[E820MAX] __initdata;
522 struct change_member *change_tmp;
523 unsigned long current_type, last_type;
524 unsigned long long last_addr;
525 int chgidx, still_changing;
526 int overlap_entries;
527 int new_bios_entry;
528 int old_nr, new_nr, chg_nr;
529 int i;
530
531 /*
532 Visually we're performing the following
533 (1,2,3,4 = memory types)...
534
535 Sample memory map (w/overlaps):
536 ____22__________________
537 ______________________4_
538 ____1111________________
539 _44_____________________
540 11111111________________
541 ____________________33__
542 ___________44___________
543 __________33333_________
544 ______________22________
545 ___________________2222_
546 _________111111111______
547 _____________________11_
548 _________________4______
549
550 Sanitized equivalent (no overlap):
551 1_______________________
552 _44_____________________
553 ___1____________________
554 ____22__________________
555 ______11________________
556 _________1______________
557 __________3_____________
558 ___________44___________
559 _____________33_________
560 _______________2________
561 ________________1_______
562 _________________4______
563 ___________________2____
564 ____________________33__
565 ______________________4_
566 */
567
568 /* if there's only one memory region, don't bother */
569 if (*pnr_map < 2)
570 return -1;
571
572 old_nr = *pnr_map;
573
574 /* bail out if we find any unreasonable addresses in bios map */
575 for (i = 0; i < old_nr; i++)
576 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
577 return -1;
578
579 /* create pointers for initial change-point information (for sorting) */
580 for (i = 0; i < 2 * old_nr; i++)
581 change_point[i] = &change_point_list[i];
582
583 /* record all known change-points (starting and ending addresses),
584 omitting those that are for empty memory regions */
585 chgidx = 0;
586 for (i = 0; i < old_nr; i++) {
587 if (biosmap[i].size != 0) {
588 change_point[chgidx]->addr = biosmap[i].addr;
589 change_point[chgidx++]->pbios = &biosmap[i];
590 change_point[chgidx]->addr = biosmap[i].addr +
591 biosmap[i].size;
592 change_point[chgidx++]->pbios = &biosmap[i];
593 }
594 }
595 chg_nr = chgidx;
596
597 /* sort change-point list by memory addresses (low -> high) */
598 still_changing = 1;
599 while (still_changing) {
600 still_changing = 0;
601 for (i = 1; i < chg_nr; i++) {
602 unsigned long long curaddr, lastaddr;
603 unsigned long long curpbaddr, lastpbaddr;
604
605 curaddr = change_point[i]->addr;
606 lastaddr = change_point[i - 1]->addr;
607 curpbaddr = change_point[i]->pbios->addr;
608 lastpbaddr = change_point[i - 1]->pbios->addr;
609
610 /*
611 * swap entries, when:
612 *
613 * curaddr > lastaddr or
614 * curaddr == lastaddr and curaddr == curpbaddr and
615 * lastaddr != lastpbaddr
616 */
617 if (curaddr < lastaddr ||
618 (curaddr == lastaddr && curaddr == curpbaddr &&
619 lastaddr != lastpbaddr)) {
620 change_tmp = change_point[i];
621 change_point[i] = change_point[i-1];
622 change_point[i-1] = change_tmp;
623 still_changing = 1;
624 }
625 }
626 }
627
628 /* create a new bios memory map, removing overlaps */
629 overlap_entries = 0; /* number of entries in the overlap table */
630 new_bios_entry = 0; /* index for creating new bios map entries */
631 last_type = 0; /* start with undefined memory type */
632 last_addr = 0; /* start with 0 as last starting address */
633
634	/* loop through change-points, determining the effect on the new bios map */
635 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
636 /* keep track of all overlapping bios entries */
637 if (change_point[chgidx]->addr ==
638 change_point[chgidx]->pbios->addr) {
639 /*
640 * add map entry to overlap list (> 1 entry
641 * implies an overlap)
642 */
643 overlap_list[overlap_entries++] =
644 change_point[chgidx]->pbios;
645 } else {
646 /*
647 * remove entry from list (order independent,
648 * so swap with last)
649 */
650 for (i = 0; i < overlap_entries; i++) {
651 if (overlap_list[i] ==
652 change_point[chgidx]->pbios)
653 overlap_list[i] =
654 overlap_list[overlap_entries-1];
655 }
656 overlap_entries--;
657 }
658 /*
659 * if there are overlapping entries, decide which
660 * "type" to use (larger value takes precedence --
661 * 1=usable, 2,3,4,4+=unusable)
662 */
663 current_type = 0;
664 for (i = 0; i < overlap_entries; i++)
665 if (overlap_list[i]->type > current_type)
666 current_type = overlap_list[i]->type;
667 /*
668 * continue building up new bios map based on this
669 * information
670 */
671 if (current_type != last_type) {
672 if (last_type != 0) {
673 new_bios[new_bios_entry].size =
674 change_point[chgidx]->addr - last_addr;
675 /*
676 * move forward only if the new size
677 * was non-zero
678 */
679 if (new_bios[new_bios_entry].size != 0)
680 /*
681 * no more space left for new
682 * bios entries ?
683 */
684 if (++new_bios_entry >= E820MAX)
685 break;
686 }
687 if (current_type != 0) {
688 new_bios[new_bios_entry].addr =
689 change_point[chgidx]->addr;
690 new_bios[new_bios_entry].type = current_type;
691 last_addr = change_point[chgidx]->addr;
692 }
693 last_type = current_type;
694 }
695 }
696 /* retain count for new bios entries */
697 new_nr = new_bios_entry;
698
699 /* copy new bios mapping into original location */
700 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
701 *pnr_map = new_nr;
702
703 return 0;
704}
705
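The change-point pass above can be restated compactly: every entry contributes a start and an end event, events are sorted by address, and at any point the live entry with the largest type value wins. A self-contained user-space sketch of that idea (not the kernel routine itself; the overlap list and tie-breaking are simplified, and the structs and types are local stand-ins):

#include <stdio.h>
#include <stdlib.h>

struct ent { unsigned long long start, end; unsigned type; };
struct ev  { unsigned long long addr; int delta; unsigned type; };

static int cmp(const void *a, const void *b)
{
	const struct ev *x = a, *y = b;

	if (x->addr != y->addr)
		return x->addr < y->addr ? -1 : 1;
	return y->delta - x->delta;	/* starts before ends */
}

int main(void)
{
	/* overlapping input: usable 0-1M, reserved 640K-1M */
	struct ent map[] = {
		{ 0x00000, 0x100000, 1 },
		{ 0xa0000, 0x100000, 2 },
	};
	struct ev evs[4];
	int active[8] = { 0 };
	unsigned long long last = 0;
	unsigned last_type = 0;
	int i, n = 0;

	for (i = 0; i < 2; i++) {
		evs[n++] = (struct ev){ map[i].start, +1, map[i].type };
		evs[n++] = (struct ev){ map[i].end,   -1, map[i].type };
	}
	qsort(evs, n, sizeof(evs[0]), cmp);

	for (i = 0; i < n; i++) {
		unsigned t = 0;
		int j;

		active[evs[i].type] += evs[i].delta;
		for (j = 7; j > 0; j--)	/* largest live type wins */
			if (active[j]) {
				t = j;
				break;
			}
		if (t != last_type) {
			if (last_type && evs[i].addr > last)
				printf("%#llx-%#llx type %u\n",
				       last, evs[i].addr, last_type);
			last = evs[i].addr;
			last_type = t;
		}
	}
	return 0;	/* prints: 0-0xa0000 type 1, 0xa0000-0x100000 type 2 */
}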
706/*
707 * Copy the BIOS e820 map into a safe place.
708 *
709 * Sanity-check it while we're at it...
710 *
711 * If we're lucky and live on a modern system, the setup code
712 * will have given us a memory map that we can use to properly
713 * set up memory. If we aren't, we'll fake a memory map.
714 */
715static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
716{
717 /* Only one memory region (or negative)? Ignore it */
718 if (nr_map < 2)
719 return -1;
720
721 do {
722 u64 start = biosmap->addr;
723 u64 size = biosmap->size;
724 u64 end = start + size;
725 u32 type = biosmap->type;
726
727 /* Overflow in 64 bits? Ignore the memory map. */
728 if (start > end)
729 return -1;
730
731 add_memory_region(start, size, type);
732 } while (biosmap++, --nr_map);
733 return 0;
734}
735
736static void early_panic(char *msg) 68static void early_panic(char *msg)
737{ 69{
738 early_printk(msg); 70 early_printk(msg);
@@ -740,16 +72,21 @@ static void early_panic(char *msg)
740} 72}
741 73
742/* We're not void only for x86 32-bit compat */ 74/* We're not void only for x86 32-bit compat */
743char * __init machine_specific_memory_setup(void) 75char *__init machine_specific_memory_setup(void)
744{ 76{
745 char *who = "BIOS-e820"; 77 char *who = "BIOS-e820";
78 int new_nr;
746 /* 79 /*
747 * Try to copy the BIOS-supplied E820-map. 80 * Try to copy the BIOS-supplied E820-map.
748 * 81 *
749 * Otherwise fake a memory map; one section from 0k->640k, 82 * Otherwise fake a memory map; one section from 0k->640k,
750 * the next section from 1mb->appropriate_mem_k 83 * the next section from 1mb->appropriate_mem_k
751 */ 84 */
752 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); 85 new_nr = boot_params.e820_entries;
86 sanitize_e820_map(boot_params.e820_map,
87 ARRAY_SIZE(boot_params.e820_map),
88 &new_nr);
89 boot_params.e820_entries = new_nr;
753 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) 90 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
754 early_panic("Cannot find a valid memory map"); 91 early_panic("Cannot find a valid memory map");
755 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 92 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
@@ -787,7 +124,6 @@ static int __init parse_memmap_opt(char *p)
787 saved_max_pfn = e820_end_of_ram(); 124 saved_max_pfn = e820_end_of_ram();
788 remove_all_active_ranges(); 125 remove_all_active_ranges();
789#endif 126#endif
790 max_pfn_mapped = 0;
791 e820.nr_map = 0; 127 e820.nr_map = 0;
792 userdef = 1; 128 userdef = 1;
793 return 0; 129 return 0;
@@ -818,9 +154,9 @@ early_param("memmap", parse_memmap_opt);
818void __init finish_e820_parsing(void) 154void __init finish_e820_parsing(void)
819{ 155{
820 if (userdef) { 156 if (userdef) {
821 char nr = e820.nr_map; 157 int nr = e820.nr_map;
822 158
823 if (sanitize_e820_map(e820.map, &nr) < 0) 159 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
824 early_panic("Invalid user supplied memory map"); 160 early_panic("Invalid user supplied memory map");
825 e820.nr_map = nr; 161 e820.nr_map = nr;
826 162
@@ -829,109 +165,6 @@ void __init finish_e820_parsing(void)
829 } 165 }
830} 166}
831 167
832void __init update_memory_range(u64 start, u64 size, unsigned old_type,
833 unsigned new_type)
834{
835 int i;
836
837 BUG_ON(old_type == new_type);
838
839 for (i = 0; i < e820.nr_map; i++) {
840 struct e820entry *ei = &e820.map[i];
841 u64 final_start, final_end;
842 if (ei->type != old_type)
843 continue;
844 /* totally covered? */
845 if (ei->addr >= start && ei->size <= size) {
846 ei->type = new_type;
847 continue;
848 }
849 /* partially covered */
850 final_start = max(start, ei->addr);
851 final_end = min(start + size, ei->addr + ei->size);
852 if (final_start >= final_end)
853 continue;
854 add_memory_region(final_start, final_end - final_start,
855 new_type);
856 }
857}
858
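The partial-coverage branch above clips the update window against each entry with max()/min() and appends a fresh entry for just the clipped piece. A worked example of the clipping, with illustrative numbers:

#include <stdio.h>

static unsigned long long maxll(unsigned long long a, unsigned long long b)
{
	return a > b ? a : b;
}

static unsigned long long minll(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* entry: old_type at 0x80000-0xc0000; update window 0xa0000-0x100000 */
	unsigned long long e_start = 0x80000, e_end = 0xc0000;
	unsigned long long u_start = 0xa0000, u_end = 0x100000;
	unsigned long long s = maxll(u_start, e_start);
	unsigned long long e = minll(u_end, e_end);

	if (s < e)	/* only the intersection gets the new type */
		printf("retype %#llx-%#llx\n", s, e);	/* 0xa0000-0xc0000 */
	return 0;
}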
859void __init update_e820(void)
860{
861 u8 nr_map;
862
863 nr_map = e820.nr_map;
864 if (sanitize_e820_map(e820.map, &nr_map))
865 return;
866 e820.nr_map = nr_map;
867 printk(KERN_INFO "modified physical RAM map:\n");
868 e820_print_map("modified");
869}
870
871unsigned long pci_mem_start = 0xaeedbabe;
872EXPORT_SYMBOL(pci_mem_start);
873
874/*
875 * Search for the biggest gap in the low 32 bits of the e820
876 * memory space. We pass this space to PCI to assign MMIO resources
877 * in, for hotplug or unconfigured devices.
878 * Hopefully the BIOS left enough space for it.
879 */
880__init void e820_setup_gap(void)
881{
882 unsigned long gapstart, gapsize, round;
883 unsigned long last;
884 int i;
885 int found = 0;
886
887 last = 0x100000000ull;
888 gapstart = 0x10000000;
889 gapsize = 0x400000;
890 i = e820.nr_map;
891 while (--i >= 0) {
892 unsigned long long start = e820.map[i].addr;
893 unsigned long long end = start + e820.map[i].size;
894
895 /*
896 * Since "last" is at most 4GB, we know we'll
897 * fit in 32 bits if this condition is true
898 */
899 if (last > end) {
900 unsigned long gap = last - end;
901
902 if (gap > gapsize) {
903 gapsize = gap;
904 gapstart = end;
905 found = 1;
906 }
907 }
908 if (start < last)
909 last = start;
910 }
911
912 if (!found) {
913 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
914 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
915 "address range\n"
916 KERN_ERR "PCI: Unassigned devices with 32bit resource "
917 "registers may break!\n");
918 }
919
920 /*
921 * See how much we want to round up: start off with
922 * rounding to the next 1MB area.
923 */
924 round = 0x100000;
925 while ((gapsize >> 4) > round)
926 round += round;
927 /* Fun with two's complement */
928 pci_mem_start = (gapstart + round) & -round;
929
930 printk(KERN_INFO
931 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
932 pci_mem_start, gapstart, gapsize);
933}
934
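The tail of the function doubles the 1MB base until it reaches the first power of two not below one sixteenth of the gap, then aligns the gap start up to it; "(gapstart + round) & -round" is round-up via two's complement. A standalone illustration with a made-up gap:

#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xc0000000UL, gapsize = 0x38000000UL;
	unsigned long round = 0x100000;		/* start at 1MB */

	while ((gapsize >> 4) > round)		/* doubles up to 0x4000000 */
		round += round;

	/* round-up via two's complement: prints 0xc4000000 */
	printf("pci_mem_start = %#lx\n", (gapstart + round) & -round);
	return 0;
}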
935int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) 168int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
936{ 169{
937 int i; 170 int i;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 77d424cf68b3..d5c7fcdd1861 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -213,6 +213,48 @@ unsigned long efi_get_time(void)
213 eft.minute, eft.second); 213 eft.minute, eft.second);
214} 214}
215 215
216/*
217 * Tell the kernel about the EFI memory map. This might include
218 * more than the max 128 entries that can fit in the e820 legacy
219 * (zeropage) memory map.
220 */
221
222static void __init add_efi_memmap(void)
223{
224 void *p;
225
226 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
227 efi_memory_desc_t *md = p;
228 unsigned long long start = md->phys_addr;
229 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
230 int e820_type;
231
232 if (md->attribute & EFI_MEMORY_WB)
233 e820_type = E820_RAM;
234 else
235 e820_type = E820_RESERVED;
236 add_memory_region(start, size, e820_type);
237 }
238 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
239}
240
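Only one attribute bit matters in the conversion above: descriptors marked write-back cacheable (EFI_MEMORY_WB) become E820_RAM, everything else E820_RESERVED. A minimal restatement, with the constants redeclared locally rather than pulled from the EFI/e820 headers:

#include <stdio.h>

#define EFI_MEMORY_WB	0x8ULL	/* write-back cacheable */
#define E820_RAM	1
#define E820_RESERVED	2

static int efi_to_e820(unsigned long long attribute)
{
	return (attribute & EFI_MEMORY_WB) ? E820_RAM : E820_RESERVED;
}

int main(void)
{
	printf("%d %d\n", efi_to_e820(0xfULL),	/* WB set:  1 (RAM)      */
			  efi_to_e820(0x1ULL));	/* UC only: 2 (reserved) */
	return 0;
}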
241void __init efi_reserve_early(void)
242{
243 unsigned long pmap;
244
245 pmap = boot_params.efi_info.efi_memmap;
246#ifdef CONFIG_X86_64
247 pmap += (__u64)boot_params.efi_info.efi_memmap_hi << 32;
248#endif
249 memmap.phys_map = (void *)pmap;
250 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
251 boot_params.efi_info.efi_memdesc_size;
252 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
253 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
254 reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
255 "EFI memmap");
256}
257
216#if EFI_DEBUG 258#if EFI_DEBUG
217static void __init print_efi_memmap(void) 259static void __init print_efi_memmap(void)
218{ 260{
@@ -242,21 +284,11 @@ void __init efi_init(void)
242 int i = 0; 284 int i = 0;
243 void *tmp; 285 void *tmp;
244 286
245#ifdef CONFIG_X86_32
246 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; 287 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
247 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; 288#ifdef CONFIG_X86_64
248#else 289 efi_phys.systab = (void *)efi_phys.systab +
249 efi_phys.systab = (efi_system_table_t *) 290 ((__u64)boot_params.efi_info.efi_systab_hi<<32);
250 (boot_params.efi_info.efi_systab |
251 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
252 memmap.phys_map = (void *)
253 (boot_params.efi_info.efi_memmap |
254 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
255#endif 291#endif
256 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
257 boot_params.efi_info.efi_memdesc_size;
258 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
259 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
260 292
261 efi.systab = early_ioremap((unsigned long)efi_phys.systab, 293 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
262 sizeof(efi_system_table_t)); 294 sizeof(efi_system_table_t));
@@ -370,6 +402,7 @@ void __init efi_init(void)
370 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 402 if (memmap.desc_size != sizeof(efi_memory_desc_t))
371 printk(KERN_WARNING "Kernel-defined memdesc" 403 printk(KERN_WARNING "Kernel-defined memdesc"
372 "doesn't match the one from EFI!\n"); 404 "doesn't match the one from EFI!\n");
405 add_efi_memmap();
373 406
374 /* Setup for EFI runtime service */ 407 /* Setup for EFI runtime service */
375 reboot_type = BOOT_EFI; 408 reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d0060fdcccac..652c5287215f 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -97,13 +97,7 @@ void __init efi_call_phys_epilog(void)
97 early_runtime_code_mapping_set_exec(0); 97 early_runtime_code_mapping_set_exec(0);
98} 98}
99 99
100void __init efi_reserve_bootmem(void) 100void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
101{
102 reserve_bootmem_generic((unsigned long)memmap.phys_map,
103 memmap.nr_map * memmap.desc_size);
104}
105
106void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
107{ 101{
108 static unsigned pages_mapped __initdata; 102 static unsigned pages_mapped __initdata;
109 unsigned i, pages; 103 unsigned i, pages;
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cbaaf69bedb2..1fa8be5bd217 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
51 else 51 else
52#endif 52#endif
53 53
54 if (num_possible_cpus() <= 8) 54 if (max_physical_apicid < 8)
55 genapic = &apic_flat; 55 genapic = &apic_flat;
56 else 56 else
57 genapic = &apic_physflat; 57 genapic = &apic_physflat;
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
new file mode 100644
index 000000000000..a727c0b9819c
--- /dev/null
+++ b/arch/x86/kernel/head.c
@@ -0,0 +1,73 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3
4#include <asm/setup.h>
5#include <asm/bios_ebda.h>
6
7#define BIOS_LOWMEM_KILOBYTES 0x413
8
9/*
10 * The BIOS places the EBDA/XBDA at the top of conventional
11 * memory, and usually decreases the reported amount of
12 * conventional memory (int 0x12) too. This also contains a
13 * workaround for Dell systems that neglect to reserve EBDA.
14 * The same workaround also avoids a problem with the AMD768MPX
15 * chipset: reserve a page before VGA to prevent PCI prefetch
16 * into it (errata #56). Usually the page is reserved anyway,
17 * unless you have no PS/2 mouse plugged in.
18 */
19void __init reserve_ebda_region(void)
20{
21 unsigned int lowmem, ebda_addr;
22
23	/* To determine the position of the EBDA and the end of
24	 * conventional memory, we need to look at the BIOS data
25	 * area. In a paravirtual environment that area is absent,
26	 * so we just assume the paravirt case can handle memory
27	 * setup correctly, without our help.
28	 */
29 if (paravirt_enabled())
30 return;
31
32 /* end of low (conventional) memory */
33 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
34 lowmem <<= 10;
35
36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda();
38
39	/* Fixup: bios puts an EBDA in the top 64K segment
40	 * of conventional memory, but does not adjust lowmem. */
41 if ((lowmem - ebda_addr) <= 0x10000)
42 lowmem = ebda_addr;
43
44	/* Fixup: bios does not report an EBDA at all.
45	 * Some old Dells seem to need 4k anyhow (bugzilla 2990). */
46 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
47 lowmem = 0x9f000;
48
49 /* Paranoia: should never happen, but... */
50 if ((lowmem == 0) || (lowmem >= 0x100000))
51 lowmem = 0x9f000;
52
53 /* reserve all memory between lowmem and the 1MB mark */
54 reserve_early(lowmem, 0x100000, "BIOS reserved");
55}
56
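The fixups above in concrete numbers: the BIOS word at 0x413 reports conventional memory in KiB, and if the EBDA base falls within 64KiB below that mark, the reservation is extended down to the EBDA. A standalone illustration with made-up BIOS values:

#include <stdio.h>

int main(void)
{
	unsigned int lowmem = 639 << 10;	/* 0x413 said 639 KiB: 0x9fc00 */
	unsigned int ebda_addr = 0x9f800;	/* (word at 0x40e) << 4        */

	/* EBDA within the top 64K below lowmem: trim lowmem down to it */
	if (lowmem - ebda_addr <= 0x10000)
		lowmem = ebda_addr;
	/* no EBDA reported at all: keep 4k anyway */
	if (ebda_addr == 0 && lowmem >= 0x9f000)
		lowmem = 0x9f000;

	printf("reserve %#x-0x100000\n", lowmem);	/* 0x9f800-1MB */
	return 0;
}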
57void __init reserve_setup_data(void)
58{
59 struct setup_data *data;
60 u64 pa_data;
61 char buf[32];
62
63 if (boot_params.hdr.version < 0x0209)
64 return;
65 pa_data = boot_params.hdr.setup_data;
66 while (pa_data) {
67 data = early_ioremap(pa_data, sizeof(*data));
68 sprintf(buf, "setup data %x", data->type);
69 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
70 pa_data = data->next;
71 early_iounmap(data, sizeof(*data));
72 }
73}
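reserve_setup_data() above follows a singly linked list whose next pointers are physical addresses, remapping each node just long enough to read its header. A user-space analogue of that walk, with ordinary pointers standing in for early_ioremap() and a local struct standing in for struct setup_data:

#include <stdio.h>
#include <stdint.h>

struct sd {			/* stand-in for struct setup_data */
	uint64_t next;		/* physical address of next node, 0 ends */
	uint32_t type;
	uint32_t len;
};

int main(void)
{
	struct sd b = { 0, 2, 16 };
	struct sd a = { (uint64_t)(uintptr_t)&b, 1, 32 };
	uint64_t pa = (uint64_t)(uintptr_t)&a;	/* like hdr.setup_data */

	while (pa) {
		struct sd *d = (struct sd *)(uintptr_t)pa;

		printf("setup data %x, %u bytes\n", d->type, d->len);
		pa = d->next;
	}
	return 0;
}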
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3db059058927..fa1d25dd83e3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,7 +8,34 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10 10
11#include <asm/setup.h>
12#include <asm/sections.h>
13#include <asm/e820.h>
14#include <asm/bios_ebda.h>
15
11void __init i386_start_kernel(void) 16void __init i386_start_kernel(void)
12{ 17{
18 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
19
20#ifdef CONFIG_BLK_DEV_INITRD
21 /* Reserve INITRD */
22 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
23 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
24 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
25 u64 ramdisk_end = ramdisk_image + ramdisk_size;
26 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
27 }
28#endif
29 reserve_early(init_pg_tables_start, init_pg_tables_end,
30 "INIT_PG_TABLE");
31
32 reserve_ebda_region();
33
34 /*
35 * At this point everything still needed from the boot loader
36 * or BIOS or kernel text should be early reserved or marked not
37 * RAM in e820. All other memory is free game.
38 */
39
13 start_kernel(); 40 start_kernel();
14} 41}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index e25c57b8aa84..5fbed459ff3b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -51,74 +51,6 @@ static void __init copy_bootdata(char *real_mode_data)
51 } 51 }
52} 52}
53 53
54#define BIOS_LOWMEM_KILOBYTES 0x413
55
56/*
57 * The BIOS places the EBDA/XBDA at the top of conventional
58 * memory, and usually decreases the reported amount of
59 * conventional memory (int 0x12) too. This also contains a
60 * workaround for Dell systems that neglect to reserve EBDA.
61 * The same workaround also avoids a problem with the AMD768MPX
62 * chipset: reserve a page before VGA to prevent PCI prefetch
63 * into it (errata #56). Usually the page is reserved anyways,
64 * unless you have no PS/2 mouse plugged in.
65 */
66static void __init reserve_ebda_region(void)
67{
68 unsigned int lowmem, ebda_addr;
69
70 /* To determine the position of the EBDA and the */
71 /* end of conventional memory, we need to look at */
72 /* the BIOS data area. In a paravirtual environment */
73 /* that area is absent. We'll just have to assume */
74 /* that the paravirt case can handle memory setup */
75 /* correctly, without our help. */
76 if (paravirt_enabled())
77 return;
78
79 /* end of low (conventional) memory */
80 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
81 lowmem <<= 10;
82
83 /* start of EBDA area */
84 ebda_addr = get_bios_ebda();
85
86 /* Fixup: bios puts an EBDA in the top 64K segment */
87 /* of conventional memory, but does not adjust lowmem. */
88 if ((lowmem - ebda_addr) <= 0x10000)
89 lowmem = ebda_addr;
90
91 /* Fixup: bios does not report an EBDA at all. */
92 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
93 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
94 lowmem = 0x9f000;
95
96 /* Paranoia: should never happen, but... */
97 if ((lowmem == 0) || (lowmem >= 0x100000))
98 lowmem = 0x9f000;
99
100 /* reserve all memory between lowmem and the 1MB mark */
101 reserve_early(lowmem, 0x100000, "BIOS reserved");
102}
103
104static void __init reserve_setup_data(void)
105{
106 struct setup_data *data;
107 unsigned long pa_data;
108 char buf[32];
109
110 if (boot_params.hdr.version < 0x0209)
111 return;
112 pa_data = boot_params.hdr.setup_data;
113 while (pa_data) {
114 data = early_ioremap(pa_data, sizeof(*data));
115 sprintf(buf, "setup data %x", data->type);
116 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
117 pa_data = data->next;
118 early_iounmap(data, sizeof(*data));
119 }
120}
121
122void __init x86_64_start_kernel(char * real_mode_data) 54void __init x86_64_start_kernel(char * real_mode_data)
123{ 55{
124 int i; 56 int i;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f7357cc0162c..b98b338aae1a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -194,6 +194,7 @@ default_entry:
194 xorl %ebx,%ebx /* %ebx is kept at zero */ 194 xorl %ebx,%ebx /* %ebx is kept at zero */
195 195
196 movl $pa(pg0), %edi 196 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start)
197 movl $pa(swapper_pg_pmd), %edx 198 movl $pa(swapper_pg_pmd), %edx
198 movl $PTE_ATTR, %eax 199 movl $PTE_ATTR, %eax
19910: 20010:
@@ -219,6 +220,8 @@ default_entry:
219 jb 10b 220 jb 10b
2201: 2211:
221 movl %edi,pa(init_pg_tables_end) 222 movl %edi,pa(init_pg_tables_end)
223 shrl $12, %eax
224 movl %eax, pa(max_pfn_mapped)
222 225
223 /* Do early initialization of the fixmap area */ 226 /* Do early initialization of the fixmap area */
224 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -228,6 +231,7 @@ default_entry:
228page_pde_offset = (__PAGE_OFFSET >> 20); 231page_pde_offset = (__PAGE_OFFSET >> 20);
229 232
230 movl $pa(pg0), %edi 233 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start)
231 movl $pa(swapper_pg_dir), %edx 235 movl $pa(swapper_pg_dir), %edx
232 movl $PTE_ATTR, %eax 236 movl $PTE_ATTR, %eax
23310: 23710:
@@ -249,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 cmpl %ebp,%eax 253 cmpl %ebp,%eax
250 jb 10b 254 jb 10b
251 movl %edi,pa(init_pg_tables_end) 255 movl %edi,pa(init_pg_tables_end)
256 shrl $12, %eax
257 movl %eax, pa(max_pfn_mapped)
252 258
253 /* Do early initialization of the fixmap area */ 259 /* Do early initialization of the fixmap area */
254 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4dc8600d9d20..0662817d61bf 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -72,15 +72,21 @@ int sis_apic_bug = -1;
72int nr_ioapic_registers[MAX_IO_APICS]; 72int nr_ioapic_registers[MAX_IO_APICS];
73 73
74/* I/O APIC entries */ 74/* I/O APIC entries */
75struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; 75struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
76int nr_ioapics; 76int nr_ioapics;
77 77
78/* MP IRQ source entries */ 78/* MP IRQ source entries */
79struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 79struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
80 80
81/* # of MP IRQ source entries */ 81/* # of MP IRQ source entries */
82int mp_irq_entries; 82int mp_irq_entries;
83 83
84#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
85int mp_bus_id_to_type[MAX_MP_BUSSES];
86#endif
87
88DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
89
84static int disable_timer_pin_1 __initdata; 90static int disable_timer_pin_1 __initdata;
85 91
86/* 92/*
@@ -110,7 +116,7 @@ struct io_apic {
110static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 116static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
111{ 117{
112 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 118 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
113 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); 119 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
114} 120}
115 121
116static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 122static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -801,10 +807,10 @@ static int find_irq_entry(int apic, int pin, int type)
801 int i; 807 int i;
802 808
803 for (i = 0; i < mp_irq_entries; i++) 809 for (i = 0; i < mp_irq_entries; i++)
804 if (mp_irqs[i].mpc_irqtype == type && 810 if (mp_irqs[i].mp_irqtype == type &&
805 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || 811 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
806 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && 812 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
807 mp_irqs[i].mpc_dstirq == pin) 813 mp_irqs[i].mp_dstirq == pin)
808 return i; 814 return i;
809 815
810 return -1; 816 return -1;
@@ -818,13 +824,13 @@ static int __init find_isa_irq_pin(int irq, int type)
818 int i; 824 int i;
819 825
820 for (i = 0; i < mp_irq_entries; i++) { 826 for (i = 0; i < mp_irq_entries; i++) {
821 int lbus = mp_irqs[i].mpc_srcbus; 827 int lbus = mp_irqs[i].mp_srcbus;
822 828
823 if (test_bit(lbus, mp_bus_not_pci) && 829 if (test_bit(lbus, mp_bus_not_pci) &&
824 (mp_irqs[i].mpc_irqtype == type) && 830 (mp_irqs[i].mp_irqtype == type) &&
825 (mp_irqs[i].mpc_srcbusirq == irq)) 831 (mp_irqs[i].mp_srcbusirq == irq))
826 832
827 return mp_irqs[i].mpc_dstirq; 833 return mp_irqs[i].mp_dstirq;
828 } 834 }
829 return -1; 835 return -1;
830} 836}
@@ -834,17 +840,17 @@ static int __init find_isa_irq_apic(int irq, int type)
834 int i; 840 int i;
835 841
836 for (i = 0; i < mp_irq_entries; i++) { 842 for (i = 0; i < mp_irq_entries; i++) {
837 int lbus = mp_irqs[i].mpc_srcbus; 843 int lbus = mp_irqs[i].mp_srcbus;
838 844
839 if (test_bit(lbus, mp_bus_not_pci) && 845 if (test_bit(lbus, mp_bus_not_pci) &&
840 (mp_irqs[i].mpc_irqtype == type) && 846 (mp_irqs[i].mp_irqtype == type) &&
841 (mp_irqs[i].mpc_srcbusirq == irq)) 847 (mp_irqs[i].mp_srcbusirq == irq))
842 break; 848 break;
843 } 849 }
844 if (i < mp_irq_entries) { 850 if (i < mp_irq_entries) {
845 int apic; 851 int apic;
846 for(apic = 0; apic < nr_ioapics; apic++) { 852 for(apic = 0; apic < nr_ioapics; apic++) {
847 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) 853 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
848 return apic; 854 return apic;
849 } 855 }
850 } 856 }
@@ -864,28 +870,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
864 870
865 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " 871 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
866 "slot:%d, pin:%d.\n", bus, slot, pin); 872 "slot:%d, pin:%d.\n", bus, slot, pin);
867 if (mp_bus_id_to_pci_bus[bus] == -1) { 873 if (test_bit(bus, mp_bus_not_pci)) {
868 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); 874 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
869 return -1; 875 return -1;
870 } 876 }
871 for (i = 0; i < mp_irq_entries; i++) { 877 for (i = 0; i < mp_irq_entries; i++) {
872 int lbus = mp_irqs[i].mpc_srcbus; 878 int lbus = mp_irqs[i].mp_srcbus;
873 879
874 for (apic = 0; apic < nr_ioapics; apic++) 880 for (apic = 0; apic < nr_ioapics; apic++)
875 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || 881 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
876 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 882 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
877 break; 883 break;
878 884
879 if (!test_bit(lbus, mp_bus_not_pci) && 885 if (!test_bit(lbus, mp_bus_not_pci) &&
880 !mp_irqs[i].mpc_irqtype && 886 !mp_irqs[i].mp_irqtype &&
881 (bus == lbus) && 887 (bus == lbus) &&
882 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 888 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
883 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); 889 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
884 890
885 if (!(apic || IO_APIC_IRQ(irq))) 891 if (!(apic || IO_APIC_IRQ(irq)))
886 continue; 892 continue;
887 893
888 if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) 894 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
889 return irq; 895 return irq;
890 /* 896 /*
891 * Use the first all-but-pin matching entry as a 897 * Use the first all-but-pin matching entry as a
@@ -952,7 +958,7 @@ static int EISA_ELCR(unsigned int irq)
952 * EISA conforming in the MP table, that means its trigger type must 958 * EISA conforming in the MP table, that means its trigger type must
953 * be read in from the ELCR */ 959 * be read in from the ELCR */
954 960
955#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) 961#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
956#define default_EISA_polarity(idx) default_ISA_polarity(idx) 962#define default_EISA_polarity(idx) default_ISA_polarity(idx)
957 963
958/* PCI interrupts are always polarity one level triggered, 964/* PCI interrupts are always polarity one level triggered,
@@ -969,13 +975,13 @@ static int EISA_ELCR(unsigned int irq)
969 975
970static int MPBIOS_polarity(int idx) 976static int MPBIOS_polarity(int idx)
971{ 977{
972 int bus = mp_irqs[idx].mpc_srcbus; 978 int bus = mp_irqs[idx].mp_srcbus;
973 int polarity; 979 int polarity;
974 980
975 /* 981 /*
976 * Determine IRQ line polarity (high active or low active): 982 * Determine IRQ line polarity (high active or low active):
977 */ 983 */
978 switch (mp_irqs[idx].mpc_irqflag & 3) 984 switch (mp_irqs[idx].mp_irqflag & 3)
979 { 985 {
980 case 0: /* conforms, ie. bus-type dependent polarity */ 986 case 0: /* conforms, ie. bus-type dependent polarity */
981 { 987 {
@@ -1012,13 +1018,13 @@ static int MPBIOS_polarity(int idx)
1012 1018
1013static int MPBIOS_trigger(int idx) 1019static int MPBIOS_trigger(int idx)
1014{ 1020{
1015 int bus = mp_irqs[idx].mpc_srcbus; 1021 int bus = mp_irqs[idx].mp_srcbus;
1016 int trigger; 1022 int trigger;
1017 1023
1018 /* 1024 /*
1019 * Determine IRQ trigger mode (edge or level sensitive): 1025 * Determine IRQ trigger mode (edge or level sensitive):
1020 */ 1026 */
1021 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 1027 switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
1022 { 1028 {
1023 case 0: /* conforms, ie. bus-type dependent */ 1029 case 0: /* conforms, ie. bus-type dependent */
1024 { 1030 {
@@ -1097,16 +1103,16 @@ static inline int irq_trigger(int idx)
1097static int pin_2_irq(int idx, int apic, int pin) 1103static int pin_2_irq(int idx, int apic, int pin)
1098{ 1104{
1099 int irq, i; 1105 int irq, i;
1100 int bus = mp_irqs[idx].mpc_srcbus; 1106 int bus = mp_irqs[idx].mp_srcbus;
1101 1107
1102 /* 1108 /*
1103 * Debugging check, we are in big trouble if this message pops up! 1109 * Debugging check, we are in big trouble if this message pops up!
1104 */ 1110 */
1105 if (mp_irqs[idx].mpc_dstirq != pin) 1111 if (mp_irqs[idx].mp_dstirq != pin)
1106 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1112 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1107 1113
1108 if (test_bit(bus, mp_bus_not_pci)) 1114 if (test_bit(bus, mp_bus_not_pci))
1109 irq = mp_irqs[idx].mpc_srcbusirq; 1115 irq = mp_irqs[idx].mp_srcbusirq;
1110 else { 1116 else {
1111 /* 1117 /*
1112 * PCI IRQs are mapped in order 1118 * PCI IRQs are mapped in order
@@ -1250,12 +1256,12 @@ static void __init setup_IO_APIC_irqs(void)
1250 if (first_notcon) { 1256 if (first_notcon) {
1251 apic_printk(APIC_VERBOSE, KERN_DEBUG 1257 apic_printk(APIC_VERBOSE, KERN_DEBUG
1252 " IO-APIC (apicid-pin) %d-%d", 1258 " IO-APIC (apicid-pin) %d-%d",
1253 mp_ioapics[apic].mpc_apicid, 1259 mp_ioapics[apic].mp_apicid,
1254 pin); 1260 pin);
1255 first_notcon = 0; 1261 first_notcon = 0;
1256 } else 1262 } else
1257 apic_printk(APIC_VERBOSE, ", %d-%d", 1263 apic_printk(APIC_VERBOSE, ", %d-%d",
1258 mp_ioapics[apic].mpc_apicid, pin); 1264 mp_ioapics[apic].mp_apicid, pin);
1259 continue; 1265 continue;
1260 } 1266 }
1261 1267
@@ -1357,7 +1363,7 @@ void __init print_IO_APIC(void)
1357 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1363 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1358 for (i = 0; i < nr_ioapics; i++) 1364 for (i = 0; i < nr_ioapics; i++)
1359 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1365 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1360 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); 1366 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
1361 1367
1362 /* 1368 /*
1363 * We are a bit conservative about what we expect. We have to 1369 * We are a bit conservative about what we expect. We have to
@@ -1376,7 +1382,7 @@ void __init print_IO_APIC(void)
1376 reg_03.raw = io_apic_read(apic, 3); 1382 reg_03.raw = io_apic_read(apic, 3);
1377 spin_unlock_irqrestore(&ioapic_lock, flags); 1383 spin_unlock_irqrestore(&ioapic_lock, flags);
1378 1384
1379 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); 1385 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1380 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1386 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1381 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1387 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1382 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1388 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1716,7 +1722,6 @@ void disable_IO_APIC(void)
1716 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 1722 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1717 */ 1723 */
1718 1724
1719#ifndef CONFIG_X86_NUMAQ
1720static void __init setup_ioapic_ids_from_mpc(void) 1725static void __init setup_ioapic_ids_from_mpc(void)
1721{ 1726{
1722 union IO_APIC_reg_00 reg_00; 1727 union IO_APIC_reg_00 reg_00;
@@ -1726,6 +1731,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
1726 unsigned char old_id; 1731 unsigned char old_id;
1727 unsigned long flags; 1732 unsigned long flags;
1728 1733
1734#ifdef CONFIG_X86_NUMAQ
1735 if (found_numaq)
1736 return;
1737#endif
1738
1729 /* 1739 /*
1730 * Don't check I/O APIC IDs for xAPIC systems. They have 1740 * Don't check I/O APIC IDs for xAPIC systems. They have
1731 * no meaning without the serial APIC bus. 1741 * no meaning without the serial APIC bus.
@@ -1749,14 +1759,14 @@ static void __init setup_ioapic_ids_from_mpc(void)
1749 reg_00.raw = io_apic_read(apic, 0); 1759 reg_00.raw = io_apic_read(apic, 0);
1750 spin_unlock_irqrestore(&ioapic_lock, flags); 1760 spin_unlock_irqrestore(&ioapic_lock, flags);
1751 1761
1752 old_id = mp_ioapics[apic].mpc_apicid; 1762 old_id = mp_ioapics[apic].mp_apicid;
1753 1763
1754 if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { 1764 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1755 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1765 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1756 apic, mp_ioapics[apic].mpc_apicid); 1766 apic, mp_ioapics[apic].mp_apicid);
1757 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1767 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1758 reg_00.bits.ID); 1768 reg_00.bits.ID);
1759 mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; 1769 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1760 } 1770 }
1761 1771
1762 /* 1772 /*
@@ -1765,9 +1775,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
1765 * 'stuck on smp_invalidate_needed IPI wait' messages. 1775 * 'stuck on smp_invalidate_needed IPI wait' messages.
1766 */ 1776 */
1767 if (check_apicid_used(phys_id_present_map, 1777 if (check_apicid_used(phys_id_present_map,
1768 mp_ioapics[apic].mpc_apicid)) { 1778 mp_ioapics[apic].mp_apicid)) {
1769 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1779 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1770 apic, mp_ioapics[apic].mpc_apicid); 1780 apic, mp_ioapics[apic].mp_apicid);
1771 for (i = 0; i < get_physical_broadcast(); i++) 1781 for (i = 0; i < get_physical_broadcast(); i++)
1772 if (!physid_isset(i, phys_id_present_map)) 1782 if (!physid_isset(i, phys_id_present_map))
1773 break; 1783 break;
@@ -1776,13 +1786,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
1776 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1786 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1777 i); 1787 i);
1778 physid_set(i, phys_id_present_map); 1788 physid_set(i, phys_id_present_map);
1779 mp_ioapics[apic].mpc_apicid = i; 1789 mp_ioapics[apic].mp_apicid = i;
1780 } else { 1790 } else {
1781 physid_mask_t tmp; 1791 physid_mask_t tmp;
1782 tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); 1792 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1783 apic_printk(APIC_VERBOSE, "Setting %d in the " 1793 apic_printk(APIC_VERBOSE, "Setting %d in the "
1784 "phys_id_present_map\n", 1794 "phys_id_present_map\n",
1785 mp_ioapics[apic].mpc_apicid); 1795 mp_ioapics[apic].mp_apicid);
1786 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1796 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1787 } 1797 }
1788 1798
@@ -1791,11 +1801,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
1791 * We need to adjust the IRQ routing table 1801 * We need to adjust the IRQ routing table
1792 * if the ID changed. 1802 * if the ID changed.
1793 */ 1803 */
1794 if (old_id != mp_ioapics[apic].mpc_apicid) 1804 if (old_id != mp_ioapics[apic].mp_apicid)
1795 for (i = 0; i < mp_irq_entries; i++) 1805 for (i = 0; i < mp_irq_entries; i++)
1796 if (mp_irqs[i].mpc_dstapic == old_id) 1806 if (mp_irqs[i].mp_dstapic == old_id)
1797 mp_irqs[i].mpc_dstapic 1807 mp_irqs[i].mp_dstapic
1798 = mp_ioapics[apic].mpc_apicid; 1808 = mp_ioapics[apic].mp_apicid;
1799 1809
1800 /* 1810 /*
1801 * Read the right value from the MPC table and 1811 * Read the right value from the MPC table and
@@ -1803,9 +1813,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
1803 */ 1813 */
1804 apic_printk(APIC_VERBOSE, KERN_INFO 1814 apic_printk(APIC_VERBOSE, KERN_INFO
1805 "...changing IO-APIC physical APIC ID to %d ...", 1815 "...changing IO-APIC physical APIC ID to %d ...",
1806 mp_ioapics[apic].mpc_apicid); 1816 mp_ioapics[apic].mp_apicid);
1807 1817
1808 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; 1818 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1809 spin_lock_irqsave(&ioapic_lock, flags); 1819 spin_lock_irqsave(&ioapic_lock, flags);
1810 io_apic_write(apic, 0, reg_00.raw); 1820 io_apic_write(apic, 0, reg_00.raw);
1811 spin_unlock_irqrestore(&ioapic_lock, flags); 1821 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1816,15 +1826,12 @@ static void __init setup_ioapic_ids_from_mpc(void)
1816 spin_lock_irqsave(&ioapic_lock, flags); 1826 spin_lock_irqsave(&ioapic_lock, flags);
1817 reg_00.raw = io_apic_read(apic, 0); 1827 reg_00.raw = io_apic_read(apic, 0);
1818 spin_unlock_irqrestore(&ioapic_lock, flags); 1828 spin_unlock_irqrestore(&ioapic_lock, flags);
1819 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) 1829 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1820 printk("could not set ID!\n"); 1830 printk("could not set ID!\n");
1821 else 1831 else
1822 apic_printk(APIC_VERBOSE, " ok.\n"); 1832 apic_printk(APIC_VERBOSE, " ok.\n");
1823 } 1833 }
1824} 1834}
1825#else
1826static void __init setup_ioapic_ids_from_mpc(void) { }
1827#endif
1828 1835
1829int no_timer_check __initdata; 1836int no_timer_check __initdata;
1830 1837
@@ -2347,8 +2354,8 @@ static int ioapic_resume(struct sys_device *dev)
2347 2354
2348 spin_lock_irqsave(&ioapic_lock, flags); 2355 spin_lock_irqsave(&ioapic_lock, flags);
2349 reg_00.raw = io_apic_read(dev->id, 0); 2356 reg_00.raw = io_apic_read(dev->id, 0);
2350 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { 2357 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
2351 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 2358 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
2352 io_apic_write(dev->id, 0, reg_00.raw); 2359 io_apic_write(dev->id, 0, reg_00.raw);
2353 } 2360 }
2354 spin_unlock_irqrestore(&ioapic_lock, flags); 2361 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2781,7 +2788,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2781 2788
2782 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " 2789 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2783 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, 2790 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2784 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, 2791 mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
2785 edge_level, active_high_low); 2792 edge_level, active_high_low);
2786 2793
2787 ioapic_register_intr(irq, entry.vector, edge_level); 2794 ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2802,8 +2809,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2802 return -1; 2809 return -1;
2803 2810
2804 for (i = 0; i < mp_irq_entries; i++) 2811 for (i = 0; i < mp_irq_entries; i++)
2805 if (mp_irqs[i].mpc_irqtype == mp_INT && 2812 if (mp_irqs[i].mp_irqtype == mp_INT &&
2806 mp_irqs[i].mpc_srcbusirq == bus_irq) 2813 mp_irqs[i].mp_srcbusirq == bus_irq)
2807 break; 2814 break;
2808 if (i >= mp_irq_entries) 2815 if (i >= mp_irq_entries)
2809 return -1; 2816 return -1;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..339cf6f926dc 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -104,15 +104,17 @@ DEFINE_SPINLOCK(vector_lock);
104int nr_ioapic_registers[MAX_IO_APICS]; 104int nr_ioapic_registers[MAX_IO_APICS];
105 105
106/* I/O APIC entries */ 106/* I/O APIC entries */
107struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; 107struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
108int nr_ioapics; 108int nr_ioapics;
109 109
110/* MP IRQ source entries */ 110/* MP IRQ source entries */
111struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 111struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
112 112
113/* # of MP IRQ source entries */ 113/* # of MP IRQ source entries */
114int mp_irq_entries; 114int mp_irq_entries;
115 115
116DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
117
116/* 118/*
117 * Rough estimation of how many shared IRQs there are, can 119 * Rough estimation of how many shared IRQs there are, can
118 * be changed anytime. 120 * be changed anytime.
@@ -140,7 +142,7 @@ struct io_apic {
140static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 142static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
141{ 143{
142 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 144 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
143 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); 145 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
144} 146}
145 147
146static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 148static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -453,10 +455,10 @@ static int find_irq_entry(int apic, int pin, int type)
453 int i; 455 int i;
454 456
455 for (i = 0; i < mp_irq_entries; i++) 457 for (i = 0; i < mp_irq_entries; i++)
456 if (mp_irqs[i].mpc_irqtype == type && 458 if (mp_irqs[i].mp_irqtype == type &&
457 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || 459 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
458 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && 460 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
459 mp_irqs[i].mpc_dstirq == pin) 461 mp_irqs[i].mp_dstirq == pin)
460 return i; 462 return i;
461 463
462 return -1; 464 return -1;
@@ -470,13 +472,13 @@ static int __init find_isa_irq_pin(int irq, int type)
470 int i; 472 int i;
471 473
472 for (i = 0; i < mp_irq_entries; i++) { 474 for (i = 0; i < mp_irq_entries; i++) {
473 int lbus = mp_irqs[i].mpc_srcbus; 475 int lbus = mp_irqs[i].mp_srcbus;
474 476
475 if (test_bit(lbus, mp_bus_not_pci) && 477 if (test_bit(lbus, mp_bus_not_pci) &&
476 (mp_irqs[i].mpc_irqtype == type) && 478 (mp_irqs[i].mp_irqtype == type) &&
477 (mp_irqs[i].mpc_srcbusirq == irq)) 479 (mp_irqs[i].mp_srcbusirq == irq))
478 480
479 return mp_irqs[i].mpc_dstirq; 481 return mp_irqs[i].mp_dstirq;
480 } 482 }
481 return -1; 483 return -1;
482} 484}
@@ -486,17 +488,17 @@ static int __init find_isa_irq_apic(int irq, int type)
486 int i; 488 int i;
487 489
488 for (i = 0; i < mp_irq_entries; i++) { 490 for (i = 0; i < mp_irq_entries; i++) {
489 int lbus = mp_irqs[i].mpc_srcbus; 491 int lbus = mp_irqs[i].mp_srcbus;
490 492
491 if (test_bit(lbus, mp_bus_not_pci) && 493 if (test_bit(lbus, mp_bus_not_pci) &&
492 (mp_irqs[i].mpc_irqtype == type) && 494 (mp_irqs[i].mp_irqtype == type) &&
493 (mp_irqs[i].mpc_srcbusirq == irq)) 495 (mp_irqs[i].mp_srcbusirq == irq))
494 break; 496 break;
495 } 497 }
496 if (i < mp_irq_entries) { 498 if (i < mp_irq_entries) {
497 int apic; 499 int apic;
498 for(apic = 0; apic < nr_ioapics; apic++) { 500 for(apic = 0; apic < nr_ioapics; apic++) {
499 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) 501 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
500 return apic; 502 return apic;
501 } 503 }
502 } 504 }
@@ -516,28 +518,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
516 518
517 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", 519 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
518 bus, slot, pin); 520 bus, slot, pin);
519 if (mp_bus_id_to_pci_bus[bus] == -1) { 521 if (test_bit(bus, mp_bus_not_pci)) {
520 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); 522 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
521 return -1; 523 return -1;
522 } 524 }
523 for (i = 0; i < mp_irq_entries; i++) { 525 for (i = 0; i < mp_irq_entries; i++) {
524 int lbus = mp_irqs[i].mpc_srcbus; 526 int lbus = mp_irqs[i].mp_srcbus;
525 527
526 for (apic = 0; apic < nr_ioapics; apic++) 528 for (apic = 0; apic < nr_ioapics; apic++)
527 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || 529 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
528 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 530 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
529 break; 531 break;
530 532
531 if (!test_bit(lbus, mp_bus_not_pci) && 533 if (!test_bit(lbus, mp_bus_not_pci) &&
532 !mp_irqs[i].mpc_irqtype && 534 !mp_irqs[i].mp_irqtype &&
533 (bus == lbus) && 535 (bus == lbus) &&
534 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 536 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
535 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); 537 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
536 538
537 if (!(apic || IO_APIC_IRQ(irq))) 539 if (!(apic || IO_APIC_IRQ(irq)))
538 continue; 540 continue;
539 541
540 if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) 542 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
541 return irq; 543 return irq;
542 /* 544 /*
543 * Use the first all-but-pin matching entry as a 545 * Use the first all-but-pin matching entry as a
@@ -565,13 +567,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
565 567
566static int MPBIOS_polarity(int idx) 568static int MPBIOS_polarity(int idx)
567{ 569{
568 int bus = mp_irqs[idx].mpc_srcbus; 570 int bus = mp_irqs[idx].mp_srcbus;
569 int polarity; 571 int polarity;
570 572
571 /* 573 /*
572 * Determine IRQ line polarity (high active or low active): 574 * Determine IRQ line polarity (high active or low active):
573 */ 575 */
574 switch (mp_irqs[idx].mpc_irqflag & 3) 576 switch (mp_irqs[idx].mp_irqflag & 3)
575 { 577 {
576 case 0: /* conforms, ie. bus-type dependent polarity */ 578 case 0: /* conforms, ie. bus-type dependent polarity */
577 if (test_bit(bus, mp_bus_not_pci)) 579 if (test_bit(bus, mp_bus_not_pci))
@@ -607,13 +609,13 @@ static int MPBIOS_polarity(int idx)
607 609
608static int MPBIOS_trigger(int idx) 610static int MPBIOS_trigger(int idx)
609{ 611{
610 int bus = mp_irqs[idx].mpc_srcbus; 612 int bus = mp_irqs[idx].mp_srcbus;
611 int trigger; 613 int trigger;
612 614
613 /* 615 /*
614 * Determine IRQ trigger mode (edge or level sensitive): 616 * Determine IRQ trigger mode (edge or level sensitive):
615 */ 617 */
616 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 618 switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
617 { 619 {
618 case 0: /* conforms, ie. bus-type dependent */ 620 case 0: /* conforms, ie. bus-type dependent */
619 if (test_bit(bus, mp_bus_not_pci)) 621 if (test_bit(bus, mp_bus_not_pci))
@@ -660,16 +662,16 @@ static inline int irq_trigger(int idx)
660static int pin_2_irq(int idx, int apic, int pin) 662static int pin_2_irq(int idx, int apic, int pin)
661{ 663{
662 int irq, i; 664 int irq, i;
663 int bus = mp_irqs[idx].mpc_srcbus; 665 int bus = mp_irqs[idx].mp_srcbus;
664 666
665 /* 667 /*
666 * Debugging check, we are in big trouble if this message pops up! 668 * Debugging check, we are in big trouble if this message pops up!
667 */ 669 */
668 if (mp_irqs[idx].mpc_dstirq != pin) 670 if (mp_irqs[idx].mp_dstirq != pin)
669 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 671 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
670 672
671 if (test_bit(bus, mp_bus_not_pci)) { 673 if (test_bit(bus, mp_bus_not_pci)) {
672 irq = mp_irqs[idx].mpc_srcbusirq; 674 irq = mp_irqs[idx].mp_srcbusirq;
673 } else { 675 } else {
674 /* 676 /*
675 * PCI IRQs are mapped in order 677 * PCI IRQs are mapped in order
@@ -846,7 +848,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
846 apic_printk(APIC_VERBOSE,KERN_DEBUG 848 apic_printk(APIC_VERBOSE,KERN_DEBUG
847 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 849 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
848 "IRQ %d Mode:%i Active:%i)\n", 850 "IRQ %d Mode:%i Active:%i)\n",
849 apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, 851 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
850 irq, trigger, polarity); 852 irq, trigger, polarity);
851 853
852 /* 854 /*
@@ -887,10 +889,10 @@ static void __init setup_IO_APIC_irqs(void)
887 idx = find_irq_entry(apic,pin,mp_INT); 889 idx = find_irq_entry(apic,pin,mp_INT);
888 if (idx == -1) { 890 if (idx == -1) {
889 if (first_notcon) { 891 if (first_notcon) {
890 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); 892 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
891 first_notcon = 0; 893 first_notcon = 0;
892 } else 894 } else
893 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); 895 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
894 continue; 896 continue;
895 } 897 }
896 if (!first_notcon) { 898 if (!first_notcon) {
@@ -965,7 +967,7 @@ void __apicdebuginit print_IO_APIC(void)
965 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 967 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
966 for (i = 0; i < nr_ioapics; i++) 968 for (i = 0; i < nr_ioapics; i++)
967 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 969 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
968 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); 970 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
969 971
970 /* 972 /*
971 * We are a bit conservative about what we expect. We have to 973 * We are a bit conservative about what we expect. We have to
@@ -983,7 +985,7 @@ void __apicdebuginit print_IO_APIC(void)
983 spin_unlock_irqrestore(&ioapic_lock, flags); 985 spin_unlock_irqrestore(&ioapic_lock, flags);
984 986
985 printk("\n"); 987 printk("\n");
986 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); 988 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
987 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 989 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
988 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 990 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
989 991
@@ -1841,8 +1843,8 @@ static int ioapic_resume(struct sys_device *dev)
1841 1843
1842 spin_lock_irqsave(&ioapic_lock, flags); 1844 spin_lock_irqsave(&ioapic_lock, flags);
1843 reg_00.raw = io_apic_read(dev->id, 0); 1845 reg_00.raw = io_apic_read(dev->id, 0);
1844 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { 1846 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
1845 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 1847 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
1846 io_apic_write(dev->id, 0, reg_00.raw); 1848 io_apic_write(dev->id, 0, reg_00.raw);
1847 } 1849 }
1848 spin_unlock_irqrestore(&ioapic_lock, flags); 1850 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2242,8 +2244,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2242 return -1; 2244 return -1;
2243 2245
2244 for (i = 0; i < mp_irq_entries; i++) 2246 for (i = 0; i < mp_irq_entries; i++)
2245 if (mp_irqs[i].mpc_irqtype == mp_INT && 2247 if (mp_irqs[i].mp_irqtype == mp_INT &&
2246 mp_irqs[i].mpc_srcbusirq == bus_irq) 2248 mp_irqs[i].mp_srcbusirq == bus_irq)
2247 break; 2249 break;
2248 if (i >= mp_irq_entries) 2250 if (i >= mp_irq_entries)
2249 return -1; 2251 return -1;
@@ -2336,7 +2338,7 @@ void __init ioapic_init_mappings(void)
2336 ioapic_res = ioapic_setup_resources(); 2338 ioapic_res = ioapic_setup_resources();
2337 for (i = 0; i < nr_ioapics; i++) { 2339 for (i = 0; i < nr_ioapics; i++) {
2338 if (smp_found_config) { 2340 if (smp_found_config) {
2339 ioapic_phys = mp_ioapics[i].mpc_apicaddr; 2341 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2340 } else { 2342 } else {
2341 ioapic_phys = (unsigned long) 2343 ioapic_phys = (unsigned long)
2342 alloc_bootmem_pages(PAGE_SIZE); 2344 alloc_bootmem_pages(PAGE_SIZE);
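Every hunk above is mechanical fallout of one change in mpparse.c (next file): the IO-APIC and interrupt-source arrays no longer hold raw mpc_config_* table records but kernel-internal mp_config_* copies with mp_-prefixed fields. The layout below is inferred from assign_to_mp_irq() further down, so treat it as a sketch rather than the header's exact definition:

/* Raw MP-table record, as the BIOS lays it out. */
struct mpc_config_intsrc {
        unsigned char   mpc_type;
        unsigned char   mpc_irqtype;
        unsigned short  mpc_irqflag;
        unsigned char   mpc_srcbus;
        unsigned char   mpc_srcbusirq;
        unsigned char   mpc_dstapic;
        unsigned char   mpc_dstirq;
};

/* Kernel-internal copy: same information, but free to change
 * without having to match the firmware layout byte for byte. */
struct mp_config_intsrc {
        unsigned char   mp_type;
        unsigned char   mp_irqtype;
        unsigned short  mp_irqflag;
        unsigned char   mp_srcbus;
        unsigned char   mp_srcbusirq;
        unsigned char   mp_dstapic;
        unsigned char   mp_dstirq;
};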
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 404683b94e79..1cc7a4b8643f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -25,6 +25,8 @@
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/acpi.h> 26#include <asm/acpi.h>
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h>
29#include <asm/trampoline.h>
28 30
29#include <mach_apic.h> 31#include <mach_apic.h>
30#ifdef CONFIG_X86_32 32#ifdef CONFIG_X86_32
@@ -32,28 +34,6 @@
32#include <mach_mpparse.h> 34#include <mach_mpparse.h>
33#endif 35#endif
34 36
35/* Have we found an MP table */
36int smp_found_config;
37
38/*
39 * Various Linux-internal data structures created from the
40 * MP-table.
41 */
42#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
43int mp_bus_id_to_type[MAX_MP_BUSSES];
44#endif
45
46DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
47int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
48
49static int mp_current_pci_id;
50
51int pic_mode;
52
53/*
54 * Intel MP BIOS table parsing routines:
55 */
56
57/* 37/*
58 * Checksum an MP configuration block. 38 * Checksum an MP configuration block.
59 */ 39 */
@@ -69,15 +49,73 @@ static int __init mpf_checksum(unsigned char *mp, int len)
69} 49}
70 50
71#ifdef CONFIG_X86_NUMAQ 51#ifdef CONFIG_X86_NUMAQ
52int found_numaq;
72/* 53/*
73 * Have to match translation table entries to main table entries by counter 54 * Have to match translation table entries to main table entries by counter
74 * hence the mpc_record variable .... can't see a less disgusting way of 55 * hence the mpc_record variable .... can't see a less disgusting way of
75 * doing this .... 56 * doing this ....
76 */ 57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
77 68
78static int mpc_record; 69static int mpc_record;
79static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] 70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
80 __cpuinitdata; 71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
81#endif 119#endif
82 120
83static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 121static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
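generate_logical_apicid() encodes NUMA-Q's clustered-logical addressing: the quad number lands in the high nibble and the CPU within the quad in the low nibble, with CPU 0 getting the 0x1 bit. A plain arithmetic check of the formula:

#include <stdio.h>

/* Same formula as generate_logical_apicid() above. */
static int logical_apicid(int quad, int phys_apicid)
{
        return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
}

int main(void)
{
        printf("quad 0, cpu 0 -> 0x%02x\n", logical_apicid(0, 0)); /* 0x01 */
        printf("quad 2, cpu 3 -> 0x%02x\n", logical_apicid(2, 3)); /* 0x26 */
        return 0;
}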
@@ -90,7 +128,10 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
90 return; 128 return;
91 } 129 }
92#ifdef CONFIG_X86_NUMAQ 130#ifdef CONFIG_X86_NUMAQ
93 apicid = mpc_apic_id(m, translation_table[mpc_record]); 131 if (found_numaq)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]);
133 else
134 apicid = m->mpc_apicid;
94#else 135#else
95 apicid = m->mpc_apicid; 136 apicid = m->mpc_apicid;
96#endif 137#endif
@@ -103,17 +144,18 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
103 generic_processor_info(apicid, m->mpc_apicver); 144 generic_processor_info(apicid, m->mpc_apicver);
104} 145}
105 146
147#ifdef CONFIG_X86_IO_APIC
106static void __init MP_bus_info(struct mpc_config_bus *m) 148static void __init MP_bus_info(struct mpc_config_bus *m)
107{ 149{
108 char str[7]; 150 char str[7];
109
110 memcpy(str, m->mpc_bustype, 6); 151 memcpy(str, m->mpc_bustype, 6);
111 str[6] = 0; 152 str[6] = 0;
112 153
113#ifdef CONFIG_X86_NUMAQ 154#ifdef CONFIG_X86_NUMAQ
114 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 155 if (found_numaq)
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]);
115#else 157#else
116 Dprintk("Bus #%d is %s\n", m->mpc_busid, str); 158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
117#endif 159#endif
118 160
119#if MAX_MP_BUSSES < 256 161#if MAX_MP_BUSSES < 256
@@ -132,11 +174,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
132#endif 174#endif
133 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
134#ifdef CONFIG_X86_NUMAQ 176#ifdef CONFIG_X86_NUMAQ
135 mpc_oem_pci_bus(m, translation_table[mpc_record]); 177 if (found_numaq)
178 mpc_oem_pci_bus(m, translation_table[mpc_record]);
136#endif 179#endif
137 clear_bit(m->mpc_busid, mp_bus_not_pci); 180 clear_bit(m->mpc_busid, mp_bus_not_pci);
138 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
139 mp_current_pci_id++;
140#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 181#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
141 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
142 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { 183 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -147,6 +188,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
147 } else 188 } else
148 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 189 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
149} 190}
191#endif
150 192
151#ifdef CONFIG_X86_IO_APIC 193#ifdef CONFIG_X86_IO_APIC
152 194
@@ -176,18 +218,89 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
176 if (bad_ioapic(m->mpc_apicaddr)) 218 if (bad_ioapic(m->mpc_apicaddr))
177 return; 219 return;
178 220
179 mp_ioapics[nr_ioapics] = *m; 221 mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
222 mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
223 mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
224 mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
225 mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
180 nr_ioapics++; 226 nr_ioapics++;
181} 227}
182 228
183static void __init MP_intsrc_info(struct mpc_config_intsrc *m) 229static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
184{ 230{
185 mp_irqs[mp_irq_entries] = *m; 231 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
186 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
187 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 232 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
188 m->mpc_irqtype, m->mpc_irqflag & 3, 233 m->mpc_irqtype, m->mpc_irqflag & 3,
189 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 234 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
190 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); 235 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
236}
237
238static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
239{
240 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
241 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
242 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
243 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
244 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
245}
246
247static void assign_to_mp_irq(struct mpc_config_intsrc *m,
248 struct mp_config_intsrc *mp_irq)
249{
250 mp_irq->mp_dstapic = m->mpc_dstapic;
251 mp_irq->mp_type = m->mpc_type;
252 mp_irq->mp_irqtype = m->mpc_irqtype;
253 mp_irq->mp_irqflag = m->mpc_irqflag;
254 mp_irq->mp_srcbus = m->mpc_srcbus;
255 mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
256 mp_irq->mp_dstirq = m->mpc_dstirq;
257}
258
259static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
260 struct mpc_config_intsrc *m)
261{
262 m->mpc_dstapic = mp_irq->mp_dstapic;
263 m->mpc_type = mp_irq->mp_type;
264 m->mpc_irqtype = mp_irq->mp_irqtype;
265 m->mpc_irqflag = mp_irq->mp_irqflag;
266 m->mpc_srcbus = mp_irq->mp_srcbus;
267 m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
268 m->mpc_dstirq = mp_irq->mp_dstirq;
269}
270
271static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
272 struct mpc_config_intsrc *m)
273{
274 if (mp_irq->mp_dstapic != m->mpc_dstapic)
275 return 1;
276 if (mp_irq->mp_type != m->mpc_type)
277 return 2;
278 if (mp_irq->mp_irqtype != m->mpc_irqtype)
279 return 3;
280 if (mp_irq->mp_irqflag != m->mpc_irqflag)
281 return 4;
282 if (mp_irq->mp_srcbus != m->mpc_srcbus)
283 return 5;
284 if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
285 return 6;
286 if (mp_irq->mp_dstirq != m->mpc_dstirq)
287 return 7;
288
289 return 0;
290}
291
292void MP_intsrc_info(struct mpc_config_intsrc *m)
293{
294 int i;
295
296 print_MP_intsrc_info(m);
297
298 for (i = 0; i < mp_irq_entries; i++) {
299 if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
300 return;
301 }
302
303 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
191 if (++mp_irq_entries == MAX_IRQ_SOURCES) 304 if (++mp_irq_entries == MAX_IRQ_SOURCES)
192 panic("Max # of irq sources exceeded!!\n"); 305 panic("Max # of irq sources exceeded!!\n");
193} 306}
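MP_intsrc_info() now deduplicates: mp_irq_mpc_intsrc_cmp() returns 0 on a full match (and the 1-based index of the first differing field otherwise, which is handy when debugging), so parsing the same table twice cannot register an interrupt source twice. A standalone model of the append-if-new loop, using memcmp over a trimmed, padding-free struct where the kernel compares field by field:

#include <stdio.h>
#include <string.h>

struct intsrc {
        unsigned char   type, irqtype;
        unsigned short  irqflag;
        unsigned char   srcbus, srcbusirq, dstapic, dstirq;
};

static struct intsrc irqs[16];
static int nr_irqs;

static void add_intsrc(const struct intsrc *m)
{
        int i;

        for (i = 0; i < nr_irqs; i++)
                if (!memcmp(&irqs[i], m, sizeof(*m)))
                        return;         /* duplicate, keep the old entry */
        irqs[nr_irqs++] = *m;
}

int main(void)
{
        struct intsrc a = { .irqflag = 0x0f, .srcbus = 1 };

        add_intsrc(&a);
        add_intsrc(&a);                 /* second call is a no-op */
        printf("entries: %d\n", nr_irqs);       /* prints 1 */
        return 0;
}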
@@ -196,7 +309,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
196 309
197static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) 310static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
198{ 311{
199 Dprintk("Lint: type %d, pol %d, trig %d, bus %d," 312 printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
200 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 313 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
201 m->mpc_irqtype, m->mpc_irqflag & 3, 314 m->mpc_irqtype, m->mpc_irqflag & 3,
202 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 315 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -266,11 +379,14 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
266 } 379 }
267} 380}
268 381
269static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, 382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
270 char *productid) 383 char *productid)
271{ 384{
272 if (strncmp(oem, "IBM NUMA", 8)) 385 if (strncmp(oem, "IBM NUMA", 8))
273 printk("Warning! May not be a NUMA-Q system!\n"); 386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
274 if (mpc->mpc_oemptr) 390 if (mpc->mpc_oemptr)
275 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, 391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
276 mpc->mpc_oemsize); 392 mpc->mpc_oemsize);
@@ -281,12 +397,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
281 * Read/parse the MPC 397 * Read/parse the MPC
282 */ 398 */
283 399
284static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) 400static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
401 char *str)
285{ 402{
286 char str[16];
287 char oem[10];
288 int count = sizeof(*mpc);
289 unsigned char *mpt = ((unsigned char *)mpc) + count;
290 403
291 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { 404 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
292 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", 405 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -309,19 +422,42 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
309 } 422 }
310 memcpy(oem, mpc->mpc_oem, 8); 423 memcpy(oem, mpc->mpc_oem, 8);
311 oem[8] = 0; 424 oem[8] = 0;
312 printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); 425 printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
313 426
314 memcpy(str, mpc->mpc_productid, 12); 427 memcpy(str, mpc->mpc_productid, 12);
315 str[12] = 0; 428 str[12] = 0;
316 printk("Product ID: %s ", str);
317 429
318#ifdef CONFIG_X86_32 430 printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
319 mps_oem_check(mpc, oem, str);
320#endif
321 printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
322 431
323 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); 432 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
324 433
434 return 1;
435}
436
437static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
438{
439 char str[16];
440 char oem[10];
441
442 int count = sizeof(*mpc);
443 unsigned char *mpt = ((unsigned char *)mpc) + count;
444
445 if (!smp_check_mpc(mpc, oem, str))
446 return 0;
447
448#ifdef CONFIG_X86_32
449 /*
450 * need to make sure summit's and es7000's mps_oem_check are safe to be
451 * called early via genericarch's mps_oem_check
452 */
453 if (early) {
454#ifdef CONFIG_X86_NUMAQ
455 numaq_mps_oem_check(mpc, oem, str);
456#endif
457 } else
458 mps_oem_check(mpc, oem, str);
459#endif
460
325 /* save the local APIC address, it might be non-default */ 461 /* save the local APIC address, it might be non-default */
326 if (!acpi_lapic) 462 if (!acpi_lapic)
327 mp_lapic_addr = mpc->mpc_lapic; 463 mp_lapic_addr = mpc->mpc_lapic;
@@ -352,7 +488,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
352 { 488 {
353 struct mpc_config_bus *m = 489 struct mpc_config_bus *m =
354 (struct mpc_config_bus *)mpt; 490 (struct mpc_config_bus *)mpt;
491#ifdef CONFIG_X86_IO_APIC
355 MP_bus_info(m); 492 MP_bus_info(m);
493#endif
356 mpt += sizeof(*m); 494 mpt += sizeof(*m);
357 count += sizeof(*m); 495 count += sizeof(*m);
358 break; 496 break;
@@ -402,6 +540,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
402 ++mpc_record; 540 ++mpc_record;
403#endif 541#endif
404 } 542 }
543
544#ifdef CONFIG_X86_GENERICARCH
545 generic_bigsmp_probe();
546#endif
547
405 setup_apic_routing(); 548 setup_apic_routing();
406 if (!num_processors) 549 if (!num_processors)
407 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 550 printk(KERN_ERR "MPTABLE: no processors registered!\n");
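The loop smp_read_mpc() runs here (and which replace_intsrc_all() later re-implements) is a plain type-length walk: every record starts with a type byte and has a fixed, type-specific size, so the parser just switches on *mpt and advances mpt/count. In miniature, with the entry sizes the MP 1.4 specification defines:

#include <stdio.h>

enum { MP_PROCESSOR, MP_BUS, MP_IOAPIC, MP_INTSRC, MP_LINTSRC };
static const int entry_len[] = { 20, 8, 8, 8, 8 };      /* per MP spec 1.4 */

static void walk(const unsigned char *mpt, int length)
{
        int count = 0;

        while (count < length) {
                unsigned char type = mpt[count];

                if (type > MP_LINTSRC)
                        break;          /* corrupt table, bail out */
                printf("entry type %d, %d bytes\n", type, entry_len[type]);
                count += entry_len[type];
        }
}

int main(void)
{
        unsigned char table[28] = { [0] = MP_PROCESSOR, [20] = MP_BUS };

        walk(table, sizeof(table));
        return 0;
}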
@@ -427,7 +570,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
427 intsrc.mpc_type = MP_INTSRC; 570 intsrc.mpc_type = MP_INTSRC;
428 intsrc.mpc_irqflag = 0; /* conforming */ 571 intsrc.mpc_irqflag = 0; /* conforming */
429 intsrc.mpc_srcbus = 0; 572 intsrc.mpc_srcbus = 0;
430 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; 573 intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
431 574
432 intsrc.mpc_irqtype = mp_INT; 575 intsrc.mpc_irqtype = mp_INT;
433 576
@@ -488,40 +631,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
488 MP_intsrc_info(&intsrc); 631 MP_intsrc_info(&intsrc);
489} 632}
490 633
491#endif
492 634
493static inline void __init construct_default_ISA_mptable(int mpc_default_type) 635static void construct_ioapic_table(int mpc_default_type)
494{ 636{
495 struct mpc_config_processor processor;
496 struct mpc_config_bus bus;
497#ifdef CONFIG_X86_IO_APIC
498 struct mpc_config_ioapic ioapic; 637 struct mpc_config_ioapic ioapic;
499#endif 638 struct mpc_config_bus bus;
500 struct mpc_config_lintsrc lintsrc;
501 int linttypes[2] = { mp_ExtINT, mp_NMI };
502 int i;
503
504 /*
505 * local APIC has default address
506 */
507 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
508
509 /*
510 * 2 CPUs, numbered 0 & 1.
511 */
512 processor.mpc_type = MP_PROCESSOR;
513 /* Either an integrated APIC or a discrete 82489DX. */
514 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
515 processor.mpc_cpuflag = CPU_ENABLED;
516 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
517 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
518 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
519 processor.mpc_reserved[0] = 0;
520 processor.mpc_reserved[1] = 0;
521 for (i = 0; i < 2; i++) {
522 processor.mpc_apicid = i;
523 MP_processor_info(&processor);
524 }
525 639
526 bus.mpc_type = MP_BUS; 640 bus.mpc_type = MP_BUS;
527 bus.mpc_busid = 0; 641 bus.mpc_busid = 0;
@@ -550,7 +664,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
550 MP_bus_info(&bus); 664 MP_bus_info(&bus);
551 } 665 }
552 666
553#ifdef CONFIG_X86_IO_APIC
554 ioapic.mpc_type = MP_IOAPIC; 667 ioapic.mpc_type = MP_IOAPIC;
555 ioapic.mpc_apicid = 2; 668 ioapic.mpc_apicid = 2;
556 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; 669 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -562,7 +675,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
562 * We set up most of the low 16 IO-APIC pins according to MPS rules. 675 * We set up most of the low 16 IO-APIC pins according to MPS rules.
563 */ 676 */
564 construct_default_ioirq_mptable(mpc_default_type); 677 construct_default_ioirq_mptable(mpc_default_type);
678}
679#else
680static inline void construct_ioapic_table(int mpc_default_type) { }
565#endif 681#endif
682
683static inline void __init construct_default_ISA_mptable(int mpc_default_type)
684{
685 struct mpc_config_processor processor;
686 struct mpc_config_lintsrc lintsrc;
687 int linttypes[2] = { mp_ExtINT, mp_NMI };
688 int i;
689
690 /*
691 * local APIC has default address
692 */
693 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
694
695 /*
696 * 2 CPUs, numbered 0 & 1.
697 */
698 processor.mpc_type = MP_PROCESSOR;
699 /* Either an integrated APIC or a discrete 82489DX. */
700 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
701 processor.mpc_cpuflag = CPU_ENABLED;
702 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
703 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
704 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
705 processor.mpc_reserved[0] = 0;
706 processor.mpc_reserved[1] = 0;
707 for (i = 0; i < 2; i++) {
708 processor.mpc_apicid = i;
709 MP_processor_info(&processor);
710 }
711
712 construct_ioapic_table(mpc_default_type);
713
566 lintsrc.mpc_type = MP_LINTSRC; 714 lintsrc.mpc_type = MP_LINTSRC;
567 lintsrc.mpc_irqflag = 0; /* conforming */ 715 lintsrc.mpc_irqflag = 0; /* conforming */
568 lintsrc.mpc_srcbusid = 0; 716 lintsrc.mpc_srcbusid = 0;
@@ -600,7 +748,7 @@ static void __init __get_smp_config(unsigned early)
600 748
601 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 749 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
602 mpf->mpf_specification); 750 mpf->mpf_specification);
603#ifdef CONFIG_X86_32 751#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
604 if (mpf->mpf_feature2 & (1 << 7)) { 752 if (mpf->mpf_feature2 & (1 << 7)) {
605 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 753 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
606 pic_mode = 1; 754 pic_mode = 1;
@@ -632,7 +780,9 @@ static void __init __get_smp_config(unsigned early)
632 * override the defaults. 780 * override the defaults.
633 */ 781 */
634 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { 782 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
783#ifdef CONFIG_X86_LOCAL_APIC
635 smp_found_config = 0; 784 smp_found_config = 0;
785#endif
636 printk(KERN_ERR 786 printk(KERN_ERR
637 "BIOS bug, MP table errors detected!...\n"); 787 "BIOS bug, MP table errors detected!...\n");
638 printk(KERN_ERR "... disabling SMP support. " 788 printk(KERN_ERR "... disabling SMP support. "
@@ -689,7 +839,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
689 unsigned int *bp = phys_to_virt(base); 839 unsigned int *bp = phys_to_virt(base);
690 struct intel_mp_floating *mpf; 840 struct intel_mp_floating *mpf;
691 841
692 Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); 842 printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
693 BUILD_BUG_ON(sizeof(*mpf) != 16); 843 BUILD_BUG_ON(sizeof(*mpf) != 16);
694 844
695 while (length > 0) { 845 while (length > 0) {
@@ -699,8 +849,9 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
699 !mpf_checksum((unsigned char *)bp, 16) && 849 !mpf_checksum((unsigned char *)bp, 16) &&
700 ((mpf->mpf_specification == 1) 850 ((mpf->mpf_specification == 1)
701 || (mpf->mpf_specification == 4))) { 851 || (mpf->mpf_specification == 4))) {
702 852#ifdef CONFIG_X86_LOCAL_APIC
703 smp_found_config = 1; 853 smp_found_config = 1;
854#endif
704 mpf_found = mpf; 855 mpf_found = mpf;
705#ifdef CONFIG_X86_32 856#ifdef CONFIG_X86_32
706 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", 857 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
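The *bp == SMP_MAGIC_IDENT test this scan keeps using is the "_MP_" signature of the 16-byte floating pointer structure, which the BIOS may place on any 16-byte boundary in the first kilobyte, at the top of conventional memory, or in the F000 ROM segment. The magic constant is simply the four signature bytes read as one little-endian word; a standalone check (assumes a little-endian host, as x86 is):

#include <stdio.h>
#include <string.h>

#define SMP_MAGIC_IDENT (('_' << 24) | ('P' << 16) | ('M' << 8) | '_')

int main(void)
{
        unsigned char buf[16] = "_MP_";         /* start of a floating pointer */
        unsigned int magic;

        memcpy(&magic, buf, sizeof(magic));
        printf("%s\n", magic == SMP_MAGIC_IDENT ? "found _MP_" : "no match");
        return 0;
}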
@@ -790,298 +941,294 @@ void __init find_smp_config(void)
790 __find_smp_config(1); 941 __find_smp_config(1);
791} 942}
792 943
793/* -------------------------------------------------------------------------- 944#ifdef CONFIG_X86_IO_APIC
794 ACPI-based MP Configuration 945static u8 __initdata irq_used[MAX_IRQ_SOURCES];
795 -------------------------------------------------------------------------- */
796 946
797/* 947static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
798 * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: 948{
799 */ 949 int i;
800int es7000_plat;
801 950
802#ifdef CONFIG_ACPI 951 if (m->mpc_irqtype != mp_INT)
952 return 0;
803 953
804#ifdef CONFIG_X86_IO_APIC 954 if (m->mpc_irqflag != 0x0f)
955 return 0;
805 956
806#define MP_ISA_BUS 0 957 /* not legacy */
807 958
808extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; 959 for (i = 0; i < mp_irq_entries; i++) {
960 if (mp_irqs[i].mp_irqtype != mp_INT)
961 continue;
809 962
810static int mp_find_ioapic(int gsi) 963 if (mp_irqs[i].mp_irqflag != 0x0f)
811{ 964 continue;
812 int i = 0;
813 965
814 /* Find the IOAPIC that manages this GSI. */ 966 if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
815 for (i = 0; i < nr_ioapics; i++) { 967 continue;
816 if ((gsi >= mp_ioapic_routing[i].gsi_base) 968 if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
817 && (gsi <= mp_ioapic_routing[i].gsi_end)) 969 continue;
818 return i; 970 if (irq_used[i]) {
971 /* already claimed */
972 return -2;
973 }
974 irq_used[i] = 1;
975 return i;
819 } 976 }
820 977
821 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); 978 /* not found */
822 return -1; 979 return -1;
823} 980}
824 981
825static u8 __init uniq_ioapic_id(u8 id) 982#define SPARE_SLOT_NUM 20
826{ 983
827#ifdef CONFIG_X86_32 984static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
828 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
829 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
830 return io_apic_get_unique_id(nr_ioapics, id);
831 else
832 return id;
833#else
834 int i;
835 DECLARE_BITMAP(used, 256);
836 bitmap_zero(used, 256);
837 for (i = 0; i < nr_ioapics; i++) {
838 struct mpc_config_ioapic *ia = &mp_ioapics[i];
839 __set_bit(ia->mpc_apicid, used);
840 }
841 if (!test_bit(id, used))
842 return id;
843 return find_first_zero_bit(used, 256);
844#endif 985#endif
845}
846 986
847void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) 987static int __init replace_intsrc_all(struct mp_config_table *mpc,
988 unsigned long mpc_new_phys,
989 unsigned long mpc_new_length)
848{ 990{
849 int idx = 0; 991#ifdef CONFIG_X86_IO_APIC
850 992 int i;
851 if (bad_ioapic(address)) 993 int nr_m_spare = 0;
852 return; 994#endif
853 995
854 idx = nr_ioapics; 996 int count = sizeof(*mpc);
997 unsigned char *mpt = ((unsigned char *)mpc) + count;
855 998
856 mp_ioapics[idx].mpc_type = MP_IOAPIC; 999 printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
857 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; 1000 while (count < mpc->mpc_length) {
858 mp_ioapics[idx].mpc_apicaddr = address; 1001 switch (*mpt) {
1002 case MP_PROCESSOR:
1003 {
1004 struct mpc_config_processor *m =
1005 (struct mpc_config_processor *)mpt;
1006 mpt += sizeof(*m);
1007 count += sizeof(*m);
1008 break;
1009 }
1010 case MP_BUS:
1011 {
1012 struct mpc_config_bus *m =
1013 (struct mpc_config_bus *)mpt;
1014 mpt += sizeof(*m);
1015 count += sizeof(*m);
1016 break;
1017 }
1018 case MP_IOAPIC:
1019 {
1020 mpt += sizeof(struct mpc_config_ioapic);
1021 count += sizeof(struct mpc_config_ioapic);
1022 break;
1023 }
1024 case MP_INTSRC:
1025 {
1026#ifdef CONFIG_X86_IO_APIC
1027 struct mpc_config_intsrc *m =
1028 (struct mpc_config_intsrc *)mpt;
859 1029
860 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 1030 printk(KERN_INFO "OLD ");
861 mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); 1031 print_MP_intsrc_info(m);
862#ifdef CONFIG_X86_32 1032 i = get_MP_intsrc_index(m);
863 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); 1033 if (i > 0) {
864#else 1034 assign_to_mpc_intsrc(&mp_irqs[i], m);
865 mp_ioapics[idx].mpc_apicver = 0; 1035 printk(KERN_INFO "NEW ");
1036 print_mp_irq_info(&mp_irqs[i]);
1037 } else if (!i) {
1038 /* legacy, do nothing */
1039 } else if (nr_m_spare < SPARE_SLOT_NUM) {
1040 /*
1041 * not found (-1), or duplicated (-2)
1042 * are invalid entries,
1043 * we need to use the slot later
1044 */
1045 m_spare[nr_m_spare] = m;
1046 nr_m_spare++;
1047 }
866#endif 1048#endif
867 /* 1049 mpt += sizeof(struct mpc_config_intsrc);
868 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 1050 count += sizeof(struct mpc_config_intsrc);
869 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 1051 break;
870 */ 1052 }
871 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; 1053 case MP_LINTSRC:
872 mp_ioapic_routing[idx].gsi_base = gsi_base; 1054 {
873 mp_ioapic_routing[idx].gsi_end = gsi_base + 1055 struct mpc_config_lintsrc *m =
874 io_apic_get_redir_entries(idx); 1056 (struct mpc_config_lintsrc *)mpt;
875 1057 mpt += sizeof(*m);
876 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 1058 count += sizeof(*m);
877 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 1059 break;
878 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 1060 }
879 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); 1061 default:
880 1062 /* wrong mptable */
881 nr_ioapics++; 1063 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
882} 1064 printk(KERN_ERR "type %x\n", *mpt);
1065 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
1066 1, mpc, mpc->mpc_length, 1);
1067 goto out;
1068 }
1069 }
883 1070
884void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) 1071#ifdef CONFIG_X86_IO_APIC
885{ 1072 for (i = 0; i < mp_irq_entries; i++) {
886 struct mpc_config_intsrc intsrc; 1073 if (irq_used[i])
887 int ioapic = -1; 1074 continue;
888 int pin = -1;
889 1075
890 /* 1076 if (mp_irqs[i].mp_irqtype != mp_INT)
891 * Convert 'gsi' to 'ioapic.pin'. 1077 continue;
892 */
893 ioapic = mp_find_ioapic(gsi);
894 if (ioapic < 0)
895 return;
896 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
897 1078
898 /* 1079 if (mp_irqs[i].mp_irqflag != 0x0f)
899 * TBD: This check is for faulty timer entries, where the override 1080 continue;
900 * erroneously sets the trigger to level, resulting in a HUGE
901 * increase of timer interrupts!
902 */
903 if ((bus_irq == 0) && (trigger == 3))
904 trigger = 1;
905 1081
906 intsrc.mpc_type = MP_INTSRC; 1082 if (nr_m_spare > 0) {
907 intsrc.mpc_irqtype = mp_INT; 1083 printk(KERN_INFO "*NEW* found ");
908 intsrc.mpc_irqflag = (trigger << 2) | polarity; 1084 nr_m_spare--;
909 intsrc.mpc_srcbus = MP_ISA_BUS; 1085 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
910 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ 1086 m_spare[nr_m_spare] = NULL;
911 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ 1087 } else {
912 intsrc.mpc_dstirq = pin; /* INTIN# */ 1088 struct mpc_config_intsrc *m =
1089 (struct mpc_config_intsrc *)mpt;
1090 count += sizeof(struct mpc_config_intsrc);
1091 if (!mpc_new_phys) {
1092 printk(KERN_INFO "No spare slots, trying to append... at your own risk, new mpc_length %x\n", count);
1093 } else {
1094 if (count <= mpc_new_length)
1095 printk(KERN_INFO "No spare slots, trying to append..., new mpc_length %x\n", count);
1096 else {
1097 printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
1098 goto out;
1099 }
1100 }
1101 assign_to_mpc_intsrc(&mp_irqs[i], m);
1102 mpc->mpc_length = count;
1103 mpt += sizeof(struct mpc_config_intsrc);
1104 }
1105 print_mp_irq_info(&mp_irqs[i]);
1106 }
1107#endif
1108out:
1109 /* update checksum */
1110 mpc->mpc_checksum = 0;
1111 mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
1112 mpc->mpc_length);
913 1113
914 MP_intsrc_info(&intsrc); 1114 return 0;
915} 1115}
916 1116
917void __init mp_config_acpi_legacy_irqs(void) 1117int __initdata enable_update_mptable;
918{
919 struct mpc_config_intsrc intsrc;
920 int i = 0;
921 int ioapic = -1;
922 1118
923#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 1119static int __init update_mptable_setup(char *str)
924 /* 1120{
925 * Fabricate the legacy ISA bus (bus #31). 1121 enable_update_mptable = 1;
926 */ 1122 return 0;
927 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 1123}
928#endif 1124early_param("update_mptable", update_mptable_setup);
929 set_bit(MP_ISA_BUS, mp_bus_not_pci);
930 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
931 1125
932 /* 1126static unsigned long __initdata mpc_new_phys;
933 * Older generations of ES7000 have no legacy identity mappings 1127static unsigned long mpc_new_length __initdata = 4096;
934 */
935 if (es7000_plat == 1)
936 return;
937 1128
938 /* 1129/* alloc_mptable or alloc_mptable=4k */
939 * Locate the IOAPIC that manages the ISA IRQs (0-15). 1130static int __initdata alloc_mptable;
940 */ 1131static int __init parse_alloc_mptable_opt(char *p)
941 ioapic = mp_find_ioapic(0); 1132{
942 if (ioapic < 0) 1133 enable_update_mptable = 1;
943 return; 1134 alloc_mptable = 1;
1135 if (!p)
1136 return 0;
1137 mpc_new_length = memparse(p, &p);
1138 return 0;
1139}
1140early_param("alloc_mptable", parse_alloc_mptable_opt);
944 1141
945 intsrc.mpc_type = MP_INTSRC; 1142void __init early_reserve_e820_mpc_new(void)
946 intsrc.mpc_irqflag = 0; /* Conforming */ 1143{
947 intsrc.mpc_srcbus = MP_ISA_BUS; 1144 if (enable_update_mptable && alloc_mptable) {
948#ifdef CONFIG_X86_IO_APIC 1145 u64 startt = 0;
949 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; 1146#ifdef CONFIG_X86_TRAMPOLINE
1147 startt = TRAMPOLINE_BASE;
950#endif 1148#endif
951 /* 1149 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
952 * Use the default configuration for the IRQs 0-15. Unless
953 * overridden by (MADT) interrupt source override entries.
954 */
955 for (i = 0; i < 16; i++) {
956 int idx;
957
958 for (idx = 0; idx < mp_irq_entries; idx++) {
959 struct mpc_config_intsrc *irq = mp_irqs + idx;
960
961 /* Do we already have a mapping for this ISA IRQ? */
962 if (irq->mpc_srcbus == MP_ISA_BUS
963 && irq->mpc_srcbusirq == i)
964 break;
965
966 /* Do we already have a mapping for this IOAPIC pin */
967 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
968 (irq->mpc_dstirq == i))
969 break;
970 }
971
972 if (idx != mp_irq_entries) {
973 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
974 continue; /* IRQ already used */
975 }
976
977 intsrc.mpc_irqtype = mp_INT;
978 intsrc.mpc_srcbusirq = i; /* Identity mapped */
979 intsrc.mpc_dstirq = i;
980
981 MP_intsrc_info(&intsrc);
982 } 1150 }
983} 1151}
984 1152
985int mp_register_gsi(u32 gsi, int triggering, int polarity) 1153static int __init update_mp_table(void)
986{ 1154{
987 int ioapic; 1155 char str[16];
988 int ioapic_pin; 1156 char oem[10];
989#ifdef CONFIG_X86_32 1157 struct intel_mp_floating *mpf;
990#define MAX_GSI_NUM 4096 1158 struct mp_config_table *mpc;
991#define IRQ_COMPRESSION_START 64 1159 struct mp_config_table *mpc_new;
1160
1161 if (!enable_update_mptable)
1162 return 0;
1163
1164 mpf = mpf_found;
1165 if (!mpf)
1166 return 0;
992 1167
993 static int pci_irq = IRQ_COMPRESSION_START;
994 /* 1168 /*
995 * Mapping between Global System Interrupts, which 1169 * Now see if we need to go further.
996 * represent all possible interrupts, and IRQs
997 * assigned to actual devices.
998 */ 1170 */
999 static int gsi_to_irq[MAX_GSI_NUM]; 1171 if (mpf->mpf_feature1 != 0)
1000#else 1172 return 0;
1001
1002 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1003 return gsi;
1004#endif
1005 1173
1006 /* Don't set up the ACPI SCI because it's already set up */ 1174 if (!mpf->mpf_physptr)
1007 if (acpi_gbl_FADT.sci_interrupt == gsi) 1175 return 0;
1008 return gsi;
1009 1176
1010 ioapic = mp_find_ioapic(gsi); 1177 mpc = phys_to_virt(mpf->mpf_physptr);
1011 if (ioapic < 0) {
1012 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1013 return gsi;
1014 }
1015 1178
1016 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; 1179 if (!smp_check_mpc(mpc, oem, str))
1180 return 0;
1017 1181
1018#ifdef CONFIG_X86_32 1182 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
1019 if (ioapic_renumber_irq) 1183 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
1020 gsi = ioapic_renumber_irq(ioapic, gsi);
1021#endif
1022 1184
1023 /* 1185 if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
1024 * Avoid pin reprogramming. PRTs typically include entries 1186 mpc_new_phys = 0;
1025 * with redundant pin->gsi mappings (but unique PCI devices); 1187 printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
1026 * we only program the IOAPIC on the first. 1188 mpc_new_length);
1027 */
1028 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1029 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1030 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1031 ioapic_pin);
1032 return gsi;
1033 } 1189 }
1034 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { 1190
1035 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1191 if (!mpc_new_phys) {
1036 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1192 unsigned char old, new;
1037#ifdef CONFIG_X86_32 1193 /* check if we can change the position */
1038 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); 1194 mpc->mpc_checksum = 0;
1039#else 1195 old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
1040 return gsi; 1196 mpc->mpc_checksum = 0xff;
1041#endif 1197 new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
1198 if (old == new) {
1199 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
1200 return 0;
1201 }
1202 printk(KERN_INFO "use in-position replacing\n");
1203 } else {
1204 mpf->mpf_physptr = mpc_new_phys;
1205 mpc_new = phys_to_virt(mpc_new_phys);
1206 memcpy(mpc_new, mpc, mpc->mpc_length);
1207 mpc = mpc_new;
1208 /* check if we can modify that */
1209 if (mpc_new_phys - mpf->mpf_physptr) {
1210 struct intel_mp_floating *mpf_new;
1211 /* steal 16 bytes from [0, 1k) */
1212 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1213 mpf_new = phys_to_virt(0x400 - 16);
1214 memcpy(mpf_new, mpf, 16);
1215 mpf = mpf_new;
1216 mpf->mpf_physptr = mpc_new_phys;
1217 }
1218 mpf->mpf_checksum = 0;
1219 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
1220 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
1042 } 1221 }
1043 1222
1044 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1045#ifdef CONFIG_X86_32
1046 /* 1223 /*
1047 * For GSI >= 64, use IRQ compression 1224 * only replace the one with mp_INT and
1225 * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
1226 * already in mp_irqs, stored by ... and mp_config_acpi_gsi,
1227 * may need pci=routeirq for all coverage
1048 */ 1228 */
1049 if ((gsi >= IRQ_COMPRESSION_START) 1229 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
1050 && (triggering == ACPI_LEVEL_SENSITIVE)) { 1230
1051 /* 1231 return 0;
1052 * For PCI devices assign IRQs in order, avoiding gaps
1053 * due to unused I/O APIC pins.
1054 */
1055 int irq = gsi;
1056 if (gsi < MAX_GSI_NUM) {
1057 /*
1058 * Retain the VIA chipset work-around (gsi > 15), but
1059 * avoid a problem where the 8254 timer (IRQ0) is setup
1060 * via an override (so it's not on pin 0 of the ioapic),
1061 * and at the same time, the pin 0 interrupt is a PCI
1062 * type. The gsi > 15 test could cause these two pins
1063 * to be shared as IRQ0, and they are not shareable.
1064 * So test for this condition, and if necessary, avoid
1065 * the pin collision.
1066 */
1067 gsi = pci_irq++;
1068 /*
1069 * Don't assign IRQ used by ACPI SCI
1070 */
1071 if (gsi == acpi_gbl_FADT.sci_interrupt)
1072 gsi = pci_irq++;
1073 gsi_to_irq[irq] = gsi;
1074 } else {
1075 printk(KERN_ERR "GSI %u is too high\n", gsi);
1076 return gsi;
1077 }
1078 }
1079#endif
1080 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1081 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1082 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1083 return gsi;
1084} 1232}
1085 1233
1086#endif /* CONFIG_X86_IO_APIC */ 1234late_initcall(update_mp_table);
1087#endif /* CONFIG_ACPI */
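Two details of update_mp_table() are worth spelling out. The read-only probe writes 0 and then 0xff into mpc_checksum and re-sums; if the two sums agree, the stores never reached the table, so in-place editing is pointless. And re-sealing works because the MP checksum byte is defined so that all bytes of the table sum to zero mod 256, which is what mpc_checksum = 0 followed by mpc_checksum -= mpf_checksum(...) restores. The convention in miniature:

#include <stdio.h>

/* Byte sum in the style of mpf_checksum(); a table is valid when the
 * sum of all its bytes, checksum byte included, is 0 mod 256. */
static unsigned char byte_sum(const unsigned char *p, int len)
{
        unsigned char s = 0;

        while (len--)
                s += *p++;
        return s;
}

int main(void)
{
        unsigned char table[8] = { 0x12, 0x34, 0x56, 0x78 };

        table[7] = 0;
        table[7] -= byte_sum(table, sizeof(table));     /* seal */
        printf("sum = %u (0 means valid)\n", byte_sum(table, sizeof(table)));
        return 0;
}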
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634b..f0f1de1c4a1d 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,6 +31,8 @@
31#include <asm/numaq.h> 31#include <asm/numaq.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h>
35#include <asm/e820.h>
34 36
35#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
36 38
@@ -58,6 +60,8 @@ static void __init smp_dump_qct(void)
58 node_end_pfn[node] = MB_TO_PAGES( 60 node_end_pfn[node] = MB_TO_PAGES(
59 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); 61 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
60 62
63 e820_register_active_regions(node, node_start_pfn[node],
64 node_end_pfn[node]);
61 memory_present(node, 65 memory_present(node,
62 node_start_pfn[node], node_end_pfn[node]); 66 node_start_pfn[node], node_end_pfn[node]);
63 node_remap_size[node] = node_memmap_size_bytes(node, 67 node_remap_size[node] = node_memmap_size_bytes(node,
@@ -67,13 +71,24 @@ static void __init smp_dump_qct(void)
67 } 71 }
68} 72}
69 73
70/* 74static __init void early_check_numaq(void)
71 * Unlike Summit, we don't really care to let the NUMA-Q 75{
72 * fall back to flat mode. Don't compile for NUMA-Q 76 /*
73 * unless you really need it! 77 * Find possible boot-time SMP configuration:
74 */ 78 */
79 early_find_smp_config();
80 /*
81 * get boot-time SMP configuration:
82 */
83 if (smp_found_config)
84 early_get_smp_config();
85}
86
75int __init get_memcfg_numaq(void) 87int __init get_memcfg_numaq(void)
76{ 88{
89 early_check_numaq();
90 if (!found_numaq)
91 return 0;
77 smp_dump_qct(); 92 smp_dump_qct();
78 return 1; 93 return 1;
79} 94}
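The quad descriptors parsed by smp_dump_qct() report memory in megabytes, and MB_TO_PAGES() converts that with a shift: one megabyte is 2^(20 - PAGE_SHIFT) page frames, i.e. 256 frames with 4 KiB pages. A quick check of the macro:

#include <stdio.h>

#define PAGE_SHIFT              12      /* 4 KiB pages, as on x86 */
#define MB_TO_PAGES(addr)       ((addr) << (20 - PAGE_SHIFT))

int main(void)
{
        printf("256 MiB spans %d page frames\n", MB_TO_PAGES(256)); /* 65536 */
        return 0;
}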
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6f80b852a196..45a5e247d450 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -17,6 +17,7 @@ unsigned int num_processors;
17unsigned disabled_cpus __cpuinitdata; 17unsigned disabled_cpus __cpuinitdata;
18/* Processor that is doing the boot up */ 18/* Processor that is doing the boot up */
19unsigned int boot_cpu_physical_apicid = -1U; 19unsigned int boot_cpu_physical_apicid = -1U;
20unsigned int max_physical_apicid;
20EXPORT_SYMBOL(boot_cpu_physical_apicid); 21EXPORT_SYMBOL(boot_cpu_physical_apicid);
21 22
22DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; 23DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
@@ -137,3 +138,25 @@ void __init setup_per_cpu_areas(void)
137} 138}
138 139
139#endif 140#endif
141
142void __init parse_setup_data(void)
143{
144 struct setup_data *data;
145 u64 pa_data;
146
147 if (boot_params.hdr.version < 0x0209)
148 return;
149 pa_data = boot_params.hdr.setup_data;
150 while (pa_data) {
151 data = early_ioremap(pa_data, PAGE_SIZE);
152 switch (data->type) {
153 default:
154 break;
155 }
156#ifndef CONFIG_DEBUG_BOOT_PARAMS
157 free_early(pa_data, pa_data+sizeof(*data)+data->len);
158#endif
159 pa_data = data->next;
160 early_iounmap(data, PAGE_SIZE);
161 }
162}
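parse_setup_data() walks a singly linked list that lives in physical memory: each setup_data node carries a type, a length, and the physical address of the next node, and has to be mapped with early_ioremap() before it can be read. A userspace model of the traversal, using ordinary pointers where the kernel juggles physical addresses (the struct layout matches the x86 boot protocol; everything else is illustrative):

#include <stdio.h>
#include <stdint.h>

struct setup_data {
        uint64_t next;                  /* "physical" address; 0 ends the list */
        uint32_t type;
        uint32_t len;
        uint8_t data[];
};

int main(void)
{
        struct setup_data b = { 0, 2, 0 };
        struct setup_data a = { (uintptr_t)&b, 1, 0 };
        uint64_t pa = (uintptr_t)&a;

        while (pa) {
                /* the kernel would early_ioremap(pa, PAGE_SIZE) here */
                struct setup_data *d = (struct setup_data *)(uintptr_t)pa;

                printf("setup_data type %u, len %u\n", d->type, d->len);
                pa = d->next;
        }
        return 0;
}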
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 5a2f8e063887..1d4be07e15e5 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -67,10 +67,12 @@
67#include <asm/bios_ebda.h> 67#include <asm/bios_ebda.h>
68#include <asm/cacheflush.h> 68#include <asm/cacheflush.h>
69#include <asm/processor.h> 69#include <asm/processor.h>
70#include <asm/efi.h>
70 71
71/* This value is set up by the early boot code to point to the value 72/* This value is set up by the early boot code to point to the value
72 immediately after the boot time page tables. It contains a *physical* 73 immediately after the boot time page tables. It contains a *physical*
73 address, and must not be in the .bss segment! */ 74 address, and must not be in the .bss segment! */
75unsigned long init_pg_tables_start __initdata = ~0UL;
74unsigned long init_pg_tables_end __initdata = ~0UL; 76unsigned long init_pg_tables_end __initdata = ~0UL;
75 77
76/* 78/*
@@ -237,42 +239,6 @@ static inline void copy_edd(void)
237} 239}
238#endif 240#endif
239 241
240int __initdata user_defined_memmap;
241
242/*
243 * "mem=nopentium" disables the 4MB page tables.
244 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
245 * to <mem>, overriding the bios size.
246 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
247 * <start> to <start>+<mem>, overriding the bios size.
248 *
249 * HPA tells me bootloaders need to parse mem=, so no new
250 * option should be mem= [also see Documentation/i386/boot.txt]
251 */
252static int __init parse_mem(char *arg)
253{
254 if (!arg)
255 return -EINVAL;
256
257 if (strcmp(arg, "nopentium") == 0) {
258 setup_clear_cpu_cap(X86_FEATURE_PSE);
259 } else {
260 /* If the user specifies memory size, we
261 * limit the BIOS-provided memory map to
262 * that size. exactmap can be used to specify
263 * the exact map. mem=number can be used to
264 * trim the existing memory map.
265 */
266 unsigned long long mem_size;
267
268 mem_size = memparse(arg, &arg);
269 limit_regions(mem_size);
270 user_defined_memmap = 1;
271 }
272 return 0;
273}
274early_param("mem", parse_mem);
275
276#ifdef CONFIG_PROC_VMCORE 242#ifdef CONFIG_PROC_VMCORE
277/* elfcorehdr= specifies the location of elf core header 243/* elfcorehdr= specifies the location of elf core header
278 * stored by the crashed kernel. 244 * stored by the crashed kernel.
@@ -395,56 +361,6 @@ unsigned long __init find_max_low_pfn(void)
395 return max_low_pfn; 361 return max_low_pfn;
396} 362}
397 363
398#define BIOS_LOWMEM_KILOBYTES 0x413
399
400/*
401 * The BIOS places the EBDA/XBDA at the top of conventional
402 * memory, and usually decreases the reported amount of
403 * conventional memory (int 0x12) too. This also contains a
404 * workaround for Dell systems that neglect to reserve EBDA.
405 * The same workaround also avoids a problem with the AMD768MPX
406 * chipset: reserve a page before VGA to prevent PCI prefetch
407 * into it (errata #56). Usually the page is reserved anyways,
408 * unless you have no PS/2 mouse plugged in.
409 */
410static void __init reserve_ebda_region(void)
411{
412 unsigned int lowmem, ebda_addr;
413
414 /* To determine the position of the EBDA and the */
415 /* end of conventional memory, we need to look at */
416 /* the BIOS data area. In a paravirtual environment */
417 /* that area is absent. We'll just have to assume */
418 /* that the paravirt case can handle memory setup */
419 /* correctly, without our help. */
420 if (paravirt_enabled())
421 return;
422
423 /* end of low (conventional) memory */
424 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
425 lowmem <<= 10;
426
427 /* start of EBDA area */
428 ebda_addr = get_bios_ebda();
429
430 /* Fixup: bios puts an EBDA in the top 64K segment */
431 /* of conventional memory, but does not adjust lowmem. */
432 if ((lowmem - ebda_addr) <= 0x10000)
433 lowmem = ebda_addr;
434
435 /* Fixup: bios does not report an EBDA at all. */
436 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
437 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
438 lowmem = 0x9f000;
439
440 /* Paranoia: should never happen, but... */
441 if ((lowmem == 0) || (lowmem >= 0x100000))
442 lowmem = 0x9f000;
443
444 /* reserve all memory between lowmem and the 1MB mark */
445 reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
446}
447
448#ifndef CONFIG_NEED_MULTIPLE_NODES 364#ifndef CONFIG_NEED_MULTIPLE_NODES
449static void __init setup_bootmem_allocator(void); 365static void __init setup_bootmem_allocator(void);
450static unsigned long __init setup_memory(void) 366static unsigned long __init setup_memory(void)
@@ -462,11 +378,13 @@ static unsigned long __init setup_memory(void)
462 if (max_pfn > max_low_pfn) { 378 if (max_pfn > max_low_pfn) {
463 highstart_pfn = max_low_pfn; 379 highstart_pfn = max_low_pfn;
464 } 380 }
381 memory_present(0, 0, highend_pfn);
465 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 382 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
466 pages_to_mb(highend_pfn - highstart_pfn)); 383 pages_to_mb(highend_pfn - highstart_pfn));
467 num_physpages = highend_pfn; 384 num_physpages = highend_pfn;
468 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 385 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
469#else 386#else
387 memory_present(0, 0, max_low_pfn);
470 num_physpages = max_low_pfn; 388 num_physpages = max_low_pfn;
471 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 389 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
472#endif 390#endif
@@ -488,11 +406,12 @@ static void __init zone_sizes_init(void)
488 max_zone_pfns[ZONE_DMA] = 406 max_zone_pfns[ZONE_DMA] =
489 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 407 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
490 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 408 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
409 remove_all_active_ranges();
491#ifdef CONFIG_HIGHMEM 410#ifdef CONFIG_HIGHMEM
492 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 411 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
493 add_active_range(0, 0, highend_pfn); 412 e820_register_active_regions(0, 0, highend_pfn);
494#else 413#else
495 add_active_range(0, 0, max_low_pfn); 414 e820_register_active_regions(0, 0, max_low_pfn);
496#endif 415#endif
497 416
498 free_area_init_nodes(max_zone_pfns); 417 free_area_init_nodes(max_zone_pfns);
@@ -558,44 +477,57 @@ static bool do_relocate_initrd = false;
558 477
559static void __init reserve_initrd(void) 478static void __init reserve_initrd(void)
560{ 479{
561 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 480 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
562 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 481 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
563 unsigned long ramdisk_end = ramdisk_image + ramdisk_size; 482 u64 ramdisk_end = ramdisk_image + ramdisk_size;
564 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; 483 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
565 unsigned long ramdisk_here; 484 u64 ramdisk_here;
566
567 initrd_start = 0;
568 485
569 if (!boot_params.hdr.type_of_loader || 486 if (!boot_params.hdr.type_of_loader ||
570 !ramdisk_image || !ramdisk_size) 487 !ramdisk_image || !ramdisk_size)
571 return; /* No initrd provided by bootloader */ 488 return; /* No initrd provided by bootloader */
572 489
573 if (ramdisk_end < ramdisk_image) { 490 initrd_start = 0;
574 printk(KERN_ERR "initrd wraps around end of memory, " 491
575 "disabling initrd\n");
576 return;
577 }
578 if (ramdisk_size >= end_of_lowmem/2) { 492 if (ramdisk_size >= end_of_lowmem/2) {
493 free_early(ramdisk_image, ramdisk_end);
579 printk(KERN_ERR "initrd too large to handle, " 494 printk(KERN_ERR "initrd too large to handle, "
580 "disabling initrd\n"); 495 "disabling initrd\n");
581 return; 496 return;
582 } 497 }
498
499 printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image,
500 ramdisk_end);
501
502
583 if (ramdisk_end <= end_of_lowmem) { 503 if (ramdisk_end <= end_of_lowmem) {
584 /* All in lowmem, easy case */ 504 /* All in lowmem, easy case */
585 reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); 505 /*
506 * don't need to reserve again, already reserved early
507 * in i386_start_kernel
508 */
586 initrd_start = ramdisk_image + PAGE_OFFSET; 509 initrd_start = ramdisk_image + PAGE_OFFSET;
587 initrd_end = initrd_start+ramdisk_size; 510 initrd_end = initrd_start+ramdisk_size;
588 return; 511 return;
589 } 512 }
590 513
591 /* We need to move the initrd down into lowmem */ 514 /* We need to move the initrd down into lowmem */
592 ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; 515 ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT,
516 end_of_lowmem, ramdisk_size,
517 PAGE_SIZE);
518
519 if (ramdisk_here == -1ULL)
520 panic("Cannot find place for new RAMDISK of size %lld\n",
521 ramdisk_size);
593 522
594 /* Note: this includes all the lowmem currently occupied by 523 /* Note: this includes all the lowmem currently occupied by
595 the initrd, we rely on that fact to keep the data intact. */ 524 the initrd, we rely on that fact to keep the data intact. */
596 reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT); 525 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
526 "NEW RAMDISK");
597 initrd_start = ramdisk_here + PAGE_OFFSET; 527 initrd_start = ramdisk_here + PAGE_OFFSET;
598 initrd_end = initrd_start + ramdisk_size; 528 initrd_end = initrd_start + ramdisk_size;
529 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
530 ramdisk_here, ramdisk_here + ramdisk_size);
599 531
600 do_relocate_initrd = true; 532 do_relocate_initrd = true;
601} 533}
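The relocation above follows the early-reservation idiom this series introduces everywhere: ask find_e820_area() for a free range of the right size and alignment below end_of_lowmem, reserve_early() it under a label, and only hand it to bootmem later. A toy first-fit search over a static free-range table, modelling the interface rather than the kernel's e820 bookkeeping:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };

static const struct range free_ranges[] = {
        { 0x00100000, 0x00800000 },     /*  1 MiB ..  8 MiB */
        { 0x01000000, 0x02000000 },     /* 16 MiB .. 32 MiB */
};

/* First fit, like find_e820_area(): -1ULL means "no room". */
static uint64_t find_area(uint64_t size, uint64_t align)
{
        unsigned int i;

        for (i = 0; i < sizeof(free_ranges) / sizeof(free_ranges[0]); i++) {
                uint64_t start = (free_ranges[i].start + align - 1) & ~(align - 1);

                if (start + size <= free_ranges[i].end)
                        return start;
        }
        return -1ULL;
}

int main(void)
{
        uint64_t ramdisk_here = find_area(4 << 20, 4096);

        if (ramdisk_here == -1ULL)
                return 1;
        printf("NEW RAMDISK at %#llx\n", (unsigned long long)ramdisk_here);
        return 0;
}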
@@ -604,10 +536,10 @@ static void __init reserve_initrd(void)
604 536
605static void __init relocate_initrd(void) 537static void __init relocate_initrd(void)
606{ 538{
607 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 539 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
608 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 540 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
609 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; 541 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
610 unsigned long ramdisk_here; 542 u64 ramdisk_here;
611 unsigned long slop, clen, mapaddr; 543 unsigned long slop, clen, mapaddr;
612 char *p, *q; 544 char *p, *q;
613 545
@@ -624,6 +556,10 @@ static void __init relocate_initrd(void)
624 p = (char *)__va(ramdisk_image); 556 p = (char *)__va(ramdisk_image);
625 memcpy(q, p, clen); 557 memcpy(q, p, clen);
626 q += clen; 558 q += clen;
559 /* need to free these low pages...*/
560 printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n",
561 ramdisk_image, ramdisk_image + clen - 1);
562 free_bootmem(ramdisk_image, clen);
627 ramdisk_image += clen; 563 ramdisk_image += clen;
628 ramdisk_size -= clen; 564 ramdisk_size -= clen;
629 } 565 }
@@ -642,47 +578,44 @@ static void __init relocate_initrd(void)
642 ramdisk_image += clen; 578 ramdisk_image += clen;
643 ramdisk_size -= clen; 579 ramdisk_size -= clen;
644 } 580 }
581 /* high pages is not converted by early_res_to_bootmem */
582 ramdisk_image = boot_params.hdr.ramdisk_image;
583 ramdisk_size = boot_params.hdr.ramdisk_size;
584 printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n",
585 ramdisk_image, ramdisk_image + ramdisk_size - 1,
586 ramdisk_here, ramdisk_here + ramdisk_size - 1);
645} 587}
646 588
647#endif /* CONFIG_BLK_DEV_INITRD */ 589#endif /* CONFIG_BLK_DEV_INITRD */
648 590
649void __init setup_bootmem_allocator(void) 591void __init setup_bootmem_allocator(void)
650{ 592{
651 unsigned long bootmap_size; 593 int i;
594 unsigned long bootmap_size, bootmap;
652 /* 595 /*
653 * Initialize the boot-time allocator (with low memory only): 596 * Initialize the boot-time allocator (with low memory only):
654 */ 597 */
655 bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); 598 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
656 599 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
657 register_bootmem_low_pages(max_low_pfn); 600 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
658 601 PAGE_SIZE);
659 /* 602 if (bootmap == -1L)
660 * Reserve the bootmem bitmap itself as well. We do this in two 603 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
661 * steps (first step was init_bootmem()) because this catches 604 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
662 * the (very unlikely) case of us accidentally initializing the 605#ifdef CONFIG_BLK_DEV_INITRD
663 * bootmem allocator with an invalid RAM area. 606 reserve_initrd();
664 */
665 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
666 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
667 BOOTMEM_DEFAULT);
668
669 /*
670 * reserve physical page 0 - it's a special BIOS page on many boxes,
671 * enabling clean reboots, SMP operation, laptop functions.
672 */
673 reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
674
675 /* reserve EBDA region */
676 reserve_ebda_region();
677
678#ifdef CONFIG_SMP
679 /*
680 * But first pinch a few for the stack/trampoline stuff
681 * FIXME: Don't need the extra page at 4K, but need to fix
682 * trampoline before removing it. (see the GDT stuff)
683 */
684 reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
685#endif 607#endif
608 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
609 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
610 max_pfn_mapped<<PAGE_SHIFT);
611 printk(KERN_INFO " low ram: %08lx - %08lx\n",
612 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
613 printk(KERN_INFO " bootmap %08lx - %08lx\n",
614 bootmap, bootmap + bootmap_size);
615 for_each_online_node(i)
616 free_bootmem_with_active_regions(i, max_low_pfn);
617 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
618
686#ifdef CONFIG_ACPI_SLEEP 619#ifdef CONFIG_ACPI_SLEEP
687 /* 620 /*
688 * Reserve low memory region for sleep support. 621 * Reserve low memory region for sleep support.
@@ -695,10 +628,6 @@ void __init setup_bootmem_allocator(void)
695 */ 628 */
696 find_smp_config(); 629 find_smp_config();
697#endif 630#endif
698#ifdef CONFIG_BLK_DEV_INITRD
699 reserve_initrd();
700#endif
701 numa_kva_reserve();
702 reserve_crashkernel(); 631 reserve_crashkernel();
703 632
704 reserve_ibft_region(); 633 reserve_ibft_region();
@@ -731,12 +660,6 @@ static void set_mca_bus(int x)
731static void set_mca_bus(int x) { } 660static void set_mca_bus(int x) { }
732#endif 661#endif
733 662
734/* Overridden in paravirt.c if CONFIG_PARAVIRT */
735char * __init __attribute__((weak)) memory_setup(void)
736{
737 return machine_specific_memory_setup();
738}
739
740#ifdef CONFIG_NUMA 663#ifdef CONFIG_NUMA
741/* 664/*
742 * In the golden day, when everything among i386 and x86_64 will be 665 * In the golden day, when everything among i386 and x86_64 will be
@@ -764,11 +687,14 @@ void __init setup_arch(char **cmdline_p)
764 pre_setup_arch_hook(); 687 pre_setup_arch_hook();
765 early_cpu_init(); 688 early_cpu_init();
766 early_ioremap_init(); 689 early_ioremap_init();
690 reserve_setup_data();
767 691
768#ifdef CONFIG_EFI 692#ifdef CONFIG_EFI
769 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 693 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
770 "EL32", 4)) 694 "EL32", 4)) {
771 efi_enabled = 1; 695 efi_enabled = 1;
696 efi_reserve_early();
697 }
772#endif 698#endif
773 699
774 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 700 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -792,8 +718,7 @@ void __init setup_arch(char **cmdline_p)
792#endif 718#endif
793 ARCH_SETUP 719 ARCH_SETUP
794 720
795 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 721 setup_memory_map();
796 print_memory_map(memory_setup());
797 722
798 copy_edd(); 723 copy_edd();
799 724
@@ -811,12 +736,11 @@ void __init setup_arch(char **cmdline_p)
811 bss_resource.start = virt_to_phys(&__bss_start); 736 bss_resource.start = virt_to_phys(&__bss_start);
812 bss_resource.end = virt_to_phys(&__bss_stop)-1; 737 bss_resource.end = virt_to_phys(&__bss_stop)-1;
813 738
739 parse_setup_data();
740
814 parse_early_param(); 741 parse_early_param();
815 742
816 if (user_defined_memmap) { 743 finish_e820_parsing();
817 printk(KERN_INFO "user-defined physical RAM map:\n");
818 print_memory_map("user");
819 }
820 744
821 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 745 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
822 *cmdline_p = command_line; 746 *cmdline_p = command_line;
@@ -824,11 +748,22 @@ void __init setup_arch(char **cmdline_p)
824 if (efi_enabled) 748 if (efi_enabled)
825 efi_init(); 749 efi_init();
826 750
751 e820_register_active_regions(0, 0, -1UL);
752 /*
753 * partially used pages are not usable - thus
754 * we are rounding upwards:
755 */
756 max_pfn = e820_end_of_ram();
757
758 /* preallocate 4k for mptable mpc */
759 early_reserve_e820_mpc_new();
827 /* update e820 for memory not covered by WB MTRRs */ 760 /* update e820 for memory not covered by WB MTRRs */
828 propagate_e820_map();
829 mtrr_bp_init(); 761 mtrr_bp_init();
830 if (mtrr_trim_uncached_memory(max_pfn)) 762 if (mtrr_trim_uncached_memory(max_pfn)) {
831 propagate_e820_map(); 763 remove_all_active_ranges();
764 e820_register_active_regions(0, 0, -1UL);
765 max_pfn = e820_end_of_ram();
766 }
832 767
833 max_low_pfn = setup_memory(); 768 max_low_pfn = setup_memory();
834 769
@@ -855,9 +790,6 @@ void __init setup_arch(char **cmdline_p)
855 * not to exceed the 8Mb limit. 790 * not to exceed the 8Mb limit.
856 */ 791 */
857 792
858#ifdef CONFIG_SMP
859 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
860#endif
861 paging_init(); 793 paging_init();
862 794
863 /* 795 /*
@@ -914,21 +846,20 @@ void __init setup_arch(char **cmdline_p)
914 846
915#ifdef CONFIG_ACPI 847#ifdef CONFIG_ACPI
916 acpi_boot_init(); 848 acpi_boot_init();
917 849#endif
850#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
851 if (smp_found_config)
852 get_smp_config();
853#endif
918#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) 854#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
919 if (def_to_bigsmp) 855 if (def_to_bigsmp)
920 printk(KERN_WARNING "More than 8 CPUs detected and " 856 printk(KERN_WARNING "More than 8 CPUs detected and "
921 "CONFIG_X86_PC cannot handle it.\nUse " 857 "CONFIG_X86_PC cannot handle it.\nUse "
922 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); 858 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
923#endif 859#endif
924#endif
925#ifdef CONFIG_X86_LOCAL_APIC
926 if (smp_found_config)
927 get_smp_config();
928#endif
929 860
930 e820_register_memory(); 861 e820_setup_gap();
931 e820_mark_nosave_regions(); 862 e820_mark_nosave_regions(max_low_pfn);
932 863
933#ifdef CONFIG_VT 864#ifdef CONFIG_VT
934#if defined(CONFIG_VGA_CONSOLE) 865#if defined(CONFIG_VGA_CONSOLE)
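The setup_bootmem_allocator() rewrite above is the core of this change: instead of letting init_bootmem() implicitly decide where the bitmap lives, the code now searches the e820 map itself, stakes a named early reservation, and later converts all such reservations to bootmem in one pass via early_res_to_bootmem(). A minimal sketch of that allocate-then-reserve pattern, assuming only what is visible in this diff (find_e820_area() returning -1 on failure, reserve_early(start, end, name)); the helper name is ours:

/* sketch: claim 'size' bytes of usable low RAM the way the new code does */
static u64 __init claim_early_ram(u64 size)
{
	u64 addr;

	/* scan e820 RAM between min_low_pfn and the already-mapped limit */
	addr = find_e820_area(min_low_pfn << PAGE_SHIFT,
			      max_pfn_mapped << PAGE_SHIFT,
			      size, PAGE_SIZE);
	if (addr == -1ULL)
		panic("Cannot find %llu bytes of RAM\n", size);

	/* named reservation, honored later by early_res_to_bootmem() */
	reserve_early(addr, addr + size, "EXAMPLE");
	return addr;
}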
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6dff1286ad8a..26d60cc0e370 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -56,6 +56,7 @@
56#include <asm/desc.h> 56#include <asm/desc.h>
57#include <video/edid.h> 57#include <video/edid.h>
58#include <asm/e820.h> 58#include <asm/e820.h>
59#include <asm/mpspec.h>
59#include <asm/dma.h> 60#include <asm/dma.h>
60#include <asm/gart.h> 61#include <asm/gart.h>
61#include <asm/mpspec.h> 62#include <asm/mpspec.h>
@@ -271,28 +272,6 @@ void __attribute__((weak)) __init memory_setup(void)
271 machine_specific_memory_setup(); 272 machine_specific_memory_setup();
272} 273}
273 274
274static void __init parse_setup_data(void)
275{
276 struct setup_data *data;
277 unsigned long pa_data;
278
279 if (boot_params.hdr.version < 0x0209)
280 return;
281 pa_data = boot_params.hdr.setup_data;
282 while (pa_data) {
283 data = early_ioremap(pa_data, PAGE_SIZE);
284 switch (data->type) {
285 default:
286 break;
287 }
288#ifndef CONFIG_DEBUG_BOOT_PARAMS
289 free_early(pa_data, pa_data+sizeof(*data)+data->len);
290#endif
291 pa_data = data->next;
292 early_iounmap(data, PAGE_SIZE);
293 }
294}
295
296#ifdef CONFIG_PCI_MMCONFIG 275#ifdef CONFIG_PCI_MMCONFIG
297extern void __cpuinit fam10h_check_enable_mmcfg(void); 276extern void __cpuinit fam10h_check_enable_mmcfg(void);
298extern void __init check_enable_amd_mmconf_dmi(void); 277extern void __init check_enable_amd_mmconf_dmi(void);
@@ -329,8 +308,10 @@ void __init setup_arch(char **cmdline_p)
329#endif 308#endif
330#ifdef CONFIG_EFI 309#ifdef CONFIG_EFI
331 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 310 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
332 "EL64", 4)) 311 "EL64", 4)) {
333 efi_enabled = 1; 312 efi_enabled = 1;
313 efi_reserve_early();
314 }
334#endif 315#endif
335 316
336 ARCH_SETUP 317 ARCH_SETUP
@@ -381,9 +362,13 @@ void __init setup_arch(char **cmdline_p)
381 * we are rounding upwards: 362 * we are rounding upwards:
382 */ 363 */
383 end_pfn = e820_end_of_ram(); 364 end_pfn = e820_end_of_ram();
365
366 /* preallocate 4k for mptable mpc */
367 early_reserve_e820_mpc_new();
384 /* update e820 for memory not covered by WB MTRRs */ 368 /* update e820 for memory not covered by WB MTRRs */
385 mtrr_bp_init(); 369 mtrr_bp_init();
386 if (mtrr_trim_uncached_memory(end_pfn)) { 370 if (mtrr_trim_uncached_memory(end_pfn)) {
371 remove_all_active_ranges();
387 e820_register_active_regions(0, 0, -1UL); 372 e820_register_active_regions(0, 0, -1UL);
388 end_pfn = e820_end_of_ram(); 373 end_pfn = e820_end_of_ram();
389 } 374 }
@@ -392,7 +377,7 @@ void __init setup_arch(char **cmdline_p)
392 377
393 check_efer(); 378 check_efer();
394 379
395 max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT)); 380 max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT));
396 if (efi_enabled) 381 if (efi_enabled)
397 efi_init(); 382 efi_init();
398 383
@@ -453,13 +438,12 @@ void __init setup_arch(char **cmdline_p)
453 acpi_reserve_bootmem(); 438 acpi_reserve_bootmem();
454#endif 439#endif
455 440
456 if (efi_enabled) 441#ifdef CONFIG_X86_MPPARSE
457 efi_reserve_bootmem();
458
459 /* 442 /*
460 * Find and reserve possible boot-time SMP configuration: 443 * Find and reserve possible boot-time SMP configuration:
461 */ 444 */
462 find_smp_config(); 445 find_smp_config();
446#endif
463#ifdef CONFIG_BLK_DEV_INITRD 447#ifdef CONFIG_BLK_DEV_INITRD
464 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 448 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
465 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 449 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -502,11 +486,13 @@ void __init setup_arch(char **cmdline_p)
502 486
503 init_cpu_to_node(); 487 init_cpu_to_node();
504 488
489#ifdef CONFIG_X86_MPPARSE
505 /* 490 /*
506 * get boot-time SMP configuration: 491 * get boot-time SMP configuration:
507 */ 492 */
508 if (smp_found_config) 493 if (smp_found_config)
509 get_smp_config(); 494 get_smp_config();
495#endif
510 init_apic_mappings(); 496 init_apic_mappings();
511 ioapic_init_mappings(); 497 ioapic_init_mappings();
512 498
@@ -516,7 +502,7 @@ void __init setup_arch(char **cmdline_p)
516 * We trust e820 completely. No explicit ROM probing in memory. 502 * We trust e820 completely. No explicit ROM probing in memory.
517 */ 503 */
518 e820_reserve_resources(); 504 e820_reserve_resources();
519 e820_mark_nosave_regions(); 505 e820_mark_nosave_regions(end_pfn);
520 506
521 /* request I/O space for devices used on all i[345]86 PCs */ 507 /* request I/O space for devices used on all i[345]86 PCs */
522 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) 508 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
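The parse_setup_data() body deleted above has not vanished; the call moves to common code and now runs on 32-bit as well (note the parse_setup_data() call added to setup_32.c earlier in this diff). The data structure is worth restating: boot_params.hdr.setup_data holds the physical address of a singly linked list of struct setup_data nodes, and the walker maps, inspects, and unmaps one node at a time. A sketch of the loop, restated from the removed lines:

static void __init walk_setup_data(void)
{
	struct setup_data *data;
	u64 pa_data;

	if (boot_params.hdr.version < 0x0209)	/* setup_data exists since 2.09 */
		return;
	pa_data = boot_params.hdr.setup_data;
	while (pa_data) {
		data = early_ioremap(pa_data, PAGE_SIZE);
		/* ... dispatch on data->type here ... */
		pa_data = data->next;	/* physical address of the next node */
		early_iounmap(data, PAGE_SIZE);
	}
}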
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3e1cecedde42..83e62137911b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -555,23 +555,6 @@ cpumask_t cpu_coregroup_map(int cpu)
555 return c->llc_shared_map; 555 return c->llc_shared_map;
556} 556}
557 557
558#ifdef CONFIG_X86_32
559/*
560 * We are called very early to get the low memory for the
561 * SMP bootup trampoline page.
562 */
563void __init smp_alloc_memory(void)
564{
565 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
566 /*
567 * Has to be in very low memory so we can execute
568 * real-mode AP code.
569 */
570 if (__pa(trampoline_base) >= 0x9F000)
571 BUG();
572}
573#endif
574
575static void impress_friends(void) 558static void impress_friends(void)
576{ 559{
577 int cpu; 560 int cpu;
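smp_alloc_memory() can go because the trampoline page is no longer carved out of bootmem at boot time; trampoline.c (further down in this diff) now points trampoline_base at a fixed TRAMPOLINE_BASE that the early reservation code sets aside for both 32-bit and 64-bit. The deleted BUG() encoded the real constraint, which still holds: AP bringup begins in real mode, so the trampoline must stay below 0x9F000, just under 640K. The removed check, restated on its own:

/* invariant the deleted code enforced: real-mode AP startup needs low memory */
static void __init check_trampoline_placement(void)
{
	if (__pa(trampoline_base) >= 0x9F000)
		BUG();
}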
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 70e4a374b4e8..e9d91720a40f 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -31,6 +31,7 @@
31#include <asm/srat.h> 31#include <asm/srat.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/smp.h> 33#include <asm/smp.h>
34#include <asm/e820.h>
34 35
35/* 36/*
36 * proximity macros and definitions 37 * proximity macros and definitions
@@ -244,12 +245,13 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
244 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", 245 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
245 j, chunk->nid, chunk->start_pfn, chunk->end_pfn); 246 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
246 node_read_chunk(chunk->nid, chunk); 247 node_read_chunk(chunk->nid, chunk);
247 add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); 248 e820_register_active_regions(chunk->nid, chunk->start_pfn,
249 min(chunk->end_pfn, max_pfn));
248 } 250 }
249 251
250 for_each_online_node(nid) { 252 for_each_online_node(nid) {
251 unsigned long start = node_start_pfn[nid]; 253 unsigned long start = node_start_pfn[nid];
252 unsigned long end = node_end_pfn[nid]; 254 unsigned long end = min(node_end_pfn[nid], max_pfn);
253 255
254 memory_present(nid, start, end); 256 memory_present(nid, start, end);
255 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); 257 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
@@ -261,7 +263,7 @@ out_fail:
261 263
262struct acpi_static_rsdt { 264struct acpi_static_rsdt {
263 struct acpi_table_rsdt table; 265 struct acpi_table_rsdt table;
264 u32 padding[7]; /* Allow for 7 more table entries */ 266 u32 padding[32]; /* Allow for 32 more table entries */
265}; 267};
266 268
267int __init get_memcfg_from_srat(void) 269int __init get_memcfg_from_srat(void)
@@ -297,7 +299,7 @@ int __init get_memcfg_from_srat(void)
297 } 299 }
298 300
299 rsdt = (struct acpi_table_rsdt *) 301 rsdt = (struct acpi_table_rsdt *)
300 early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); 302 early_ioremap(rsdp->rsdt_physical_address, sizeof(saved_rsdt));
301 303
302 if (!rsdt) { 304 if (!rsdt) {
303 printk(KERN_WARNING 305 printk(KERN_WARNING
@@ -310,6 +312,7 @@ int __init get_memcfg_from_srat(void)
310 312
311 if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) { 313 if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
312 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); 314 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
315 early_iounmap(rsdt, sizeof(saved_rsdt));
313 goto out_err; 316 goto out_err;
314 } 317 }
315 318
@@ -319,37 +322,51 @@ int __init get_memcfg_from_srat(void)
319 * size of RSDT) divided by the size of each entry 322 * size of RSDT) divided by the size of each entry
320 * (4-byte table pointers). 323 * (4-byte table pointers).
321 */ 324 */
322 tables = (header->length - sizeof(struct acpi_table_header)) / 4; 325 tables = (header->length - sizeof(struct acpi_table_header)) / sizeof(u32);
323 326
324 if (!tables) 327 if (!tables)
325 goto out_err; 328 goto out_err;
326 329
327 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); 330 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
328 331 early_iounmap(rsdt, sizeof(saved_rsdt));
329 if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) { 332 if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
330 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", 333 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
331 saved_rsdt.table.header.length); 334 saved_rsdt.table.header.length);
332 goto out_err; 335 goto out_err;
333 } 336 }
334 337
335 printk("Begin SRAT table scan....\n"); 338 printk("Begin SRAT table scan....%d\n", tables);
336 339
337 for (i = 0; i < tables; i++) { 340 for (i = 0; i < tables; i++) {
341 int result;
342 u32 length;
338 /* Map in header, then map in full table length. */ 343 /* Map in header, then map in full table length. */
339 header = (struct acpi_table_header *) 344 header = (struct acpi_table_header *)
340 early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); 345 early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
341 if (!header) 346 if (!header)
342 break; 347 break;
348
349 printk(KERN_INFO "ACPI: %4.4s %08lX, %04X\n",
350 header->signature,
351 (unsigned long)saved_rsdt.table.table_offset_entry[i],
352 header->length);
353
354 if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) {
355 early_iounmap(header, sizeof(struct acpi_table_header));
356 continue;
357 }
358
359 length = header->length;
360 early_iounmap(header, sizeof(struct acpi_table_header));
343 header = (struct acpi_table_header *) 361 header = (struct acpi_table_header *)
344 early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); 362 early_ioremap(saved_rsdt.table.table_offset_entry[i], length);
345 if (!header) 363 if (!header)
346 break; 364 break;
347 365
348 if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
349 continue;
350
351 /* we've found the srat table. don't need to look at any more tables */ 366 /* we've found the srat table. don't need to look at any more tables */
352 return acpi20_parse_srat((struct acpi_table_srat *)header); 367 result = acpi20_parse_srat((struct acpi_table_srat *)header);
368 early_iounmap(header, length);
369 return result;
353 } 370 }
354out_err: 371out_err:
355 remove_all_active_ranges(); 372 remove_all_active_ranges();
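The reworked scan above fixes an ordering bug and two mapping leaks in the old loop: header->length was used to size the second early_ioremap() after the mapping that produced it was already being replaced, and neither the RSDT nor the per-table mappings were early_iounmap()ed on the continue and return paths. The idiom it converges on is a two-pass map: map the fixed-size header, capture the length, unmap, then remap the full table. Sketched with a hypothetical physical table address pa:

static struct acpi_table_header * __init map_full_table(u64 pa, u32 *len)
{
	struct acpi_table_header *header;

	/* pass 1: header only, to learn the real table length */
	header = (struct acpi_table_header *)early_ioremap(pa, sizeof(*header));
	if (!header)
		return NULL;
	*len = header->length;
	early_iounmap(header, sizeof(*header));

	/* pass 2: whole table; caller must early_iounmap(ret, *len) */
	return (struct acpi_table_header *)early_ioremap(pa, *len);
}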
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index ae751094eba9..d67ce5f044ba 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; 37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
38 38
39#ifndef CONFIG_X86_NUMAQ
39static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; 40static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
41#endif
40 42
41static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) 43static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
42{ 44{
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index abbf199adebb..1106fac6024d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,7 +2,7 @@
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4 4
5/* ready for x86_64, no harm for x86, since it will overwrite after alloc */ 5/* ready for x86_64 and x86 */
6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
7 7
8/* 8/*
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5c7e2fd52075..5e4772907c6e 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1012,6 +1012,7 @@ __init void lguest_init(void)
1012 * clobbered. The Launcher places our initial pagetables somewhere at 1012 * clobbered. The Launcher places our initial pagetables somewhere at
1013 * the top of our physical memory, so we don't need extra space: set 1013 * the top of our physical memory, so we don't need extra space: set
1014 * init_pg_tables_end to the end of the kernel. */ 1014 * init_pg_tables_end to the end of the kernel. */
1015 init_pg_tables_start = __pa(pg0);
1015 init_pg_tables_end = __pa(pg0); 1016 init_pg_tables_end = __pa(pg0);
1016 1017
1017 /* Load the %fs segment register (the per-cpu segment register) with 1018 /* Load the %fs segment register (the per-cpu segment register) with
@@ -1065,9 +1066,9 @@ __init void lguest_init(void)
1065 pm_power_off = lguest_power_off; 1066 pm_power_off = lguest_power_off;
1066 machine_ops.restart = lguest_restart; 1067 machine_ops.restart = lguest_restart;
1067 1068
1068 /* Now we're set up, call start_kernel() in init/main.c and we proceed 1069 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed
1069 * to boot as normal. It never returns. */ 1070 * to boot as normal. It never returns. */
1070 start_kernel(); 1071 i386_start_kernel();
1071} 1072}
1072/* 1073/*
1073 * This marks the end of stage II of our journey, The Guest. 1074 * This marks the end of stage II of our journey, The Guest.
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 0c28a071824c..56b4c39cb7fa 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -153,6 +153,7 @@ late_initcall(print_ipi_mode);
153char * __init machine_specific_memory_setup(void) 153char * __init machine_specific_memory_setup(void)
154{ 154{
155 char *who; 155 char *who;
156 int new_nr;
156 157
157 158
158 who = "BIOS-e820"; 159 who = "BIOS-e820";
@@ -163,7 +164,11 @@ char * __init machine_specific_memory_setup(void)
163 * Otherwise fake a memory map; one section from 0k->640k, 164 * Otherwise fake a memory map; one section from 0k->640k,
164 * the next section from 1mb->appropriate_mem_k 165 * the next section from 1mb->appropriate_mem_k
165 */ 166 */
166 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); 167 new_nr = boot_params.e820_entries;
168 sanitize_e820_map(boot_params.e820_map,
169 ARRAY_SIZE(boot_params.e820_map),
170 &new_nr);
171 boot_params.e820_entries = new_nr;
167 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) 172 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
168 < 0) { 173 < 0) {
169 unsigned long mem_size; 174 unsigned long mem_size;
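sanitize_e820_map() gains two parameters in this series: the capacity of the map array, passed explicitly, and the entry count, passed by reference so the function can rewrite it while merging and splitting ranges. The int temporary at both converted call sites (here and in mach-voyager below) is needed because boot_params.e820_entries is a single byte and cannot be passed where an int pointer is expected; the call-site pattern is:

int nr = boot_params.e820_entries;	/* e820_entries is a u8 */

sanitize_e820_map(boot_params.e820_map,
		  ARRAY_SIZE(boot_params.e820_map), &nr);
boot_params.e820_entries = nr;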
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
index 69dd4da218dc..3ef8b43b62fc 100644
--- a/arch/x86/mach-es7000/Makefile
+++ b/arch/x86/mach-es7000/Makefile
@@ -3,4 +3,3 @@
3# 3#
4 4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o 5obj-$(CONFIG_X86_ES7000) := es7000plat.o
6obj-$(CONFIG_X86_GENERICARCH) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
index f5d6f7d8b86e..4354ce804889 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -52,6 +52,8 @@ static struct mip_reg *host_reg;
52static int mip_port; 52static int mip_port;
53static unsigned long mip_addr, host_addr; 53static unsigned long mip_addr, host_addr;
54 54
55int es7000_plat;
56
55/* 57/*
56 * GSI override for ES7000 platforms. 58 * GSI override for ES7000 platforms.
57 */ 59 */
@@ -175,53 +177,6 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr)
175} 177}
176#endif 178#endif
177 179
178/*
179 * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic
180 * arch already has got following function definitions (asm-generic/es7000.c)
181 * hence no need to define these for that case.
182 */
183#ifndef CONFIG_X86_GENERICARCH
184void es7000_sw_apic(void);
185void __init enable_apic_mode(void)
186{
187 es7000_sw_apic();
188 return;
189}
190
191__init int mps_oem_check(struct mp_config_table *mpc, char *oem,
192 char *productid)
193{
194 if (mpc->mpc_oemptr) {
195 struct mp_config_oemtable *oem_table =
196 (struct mp_config_oemtable *)mpc->mpc_oemptr;
197 if (!strncmp(oem, "UNISYS", 6))
198 return parse_unisys_oem((char *)oem_table);
199 }
200 return 0;
201}
202#ifdef CONFIG_ACPI
203/* Hook from generic ACPI tables.c */
204int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
205{
206 unsigned long oem_addr;
207 if (!find_unisys_acpi_oem_table(&oem_addr)) {
208 if (es7000_check_dsdt())
209 return parse_unisys_oem((char *)oem_addr);
210 else {
211 setup_unisys();
212 return 1;
213 }
214 }
215 return 0;
216}
217#else
218int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
219{
220 return 0;
221}
222#endif
223#endif /* CONFIG_X86_GENERICARCH */
224
225static void 180static void
226es7000_spin(int n) 181es7000_spin(int n)
227{ 182{
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 19d6d407737b..0dbd7803a1d5 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -2,7 +2,11 @@
2# Makefile for the generic architecture 2# Makefile for the generic architecture
3# 3#
4 4
5EXTRA_CFLAGS := -Iarch/x86/kernel 5EXTRA_CFLAGS := -Iarch/x86/kernel
6 6
7obj-y := probe.o summit.o bigsmp.o es7000.o default.o 7obj-y := probe.o default.o
8obj-y += ../../x86/mach-es7000/ 8obj-$(CONFIG_X86_NUMAQ) += numaq.o
9obj-$(CONFIG_X86_SUMMIT) += summit.o
10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
11obj-$(CONFIG_X86_ES7000) += es7000.o
12obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 95fc463056d0..59d771714559 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -23,10 +23,8 @@ static int dmi_bigsmp; /* can be set by dmi scanners */
23 23
24static int hp_ht_bigsmp(const struct dmi_system_id *d) 24static int hp_ht_bigsmp(const struct dmi_system_id *d)
25{ 25{
26#ifdef CONFIG_X86_GENERICARCH
27 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); 26 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
28 dmi_bigsmp = 1; 27 dmi_bigsmp = 1;
29#endif
30 return 0; 28 return 0;
31} 29}
32 30
@@ -48,7 +46,7 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
48static int probe_bigsmp(void) 46static int probe_bigsmp(void)
49{ 47{
50 if (def_to_bigsmp) 48 if (def_to_bigsmp)
51 dmi_bigsmp = 1; 49 dmi_bigsmp = 1;
52 else 50 else
53 dmi_check_system(bigsmp_dmi_table); 51 dmi_check_system(bigsmp_dmi_table);
54 return dmi_bigsmp; 52 return dmi_bigsmp;
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
new file mode 100644
index 000000000000..8091e68764c4
--- /dev/null
+++ b/arch/x86/mach-generic/numaq.c
@@ -0,0 +1,41 @@
1/*
2 * APIC driver for the IBM NUMAQ chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <linux/smp.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/init.h>
15#include <asm/mach-numaq/mach_apic.h>
16#include <asm/mach-numaq/mach_apicdef.h>
17#include <asm/mach-numaq/mach_ipi.h>
18#include <asm/mach-numaq/mach_mpparse.h>
19#include <asm/mach-numaq/mach_wakecpu.h>
20#include <asm/numaq.h>
21
22static int mps_oem_check(struct mp_config_table *mpc, char *oem,
23 char *productid)
24{
25 numaq_mps_oem_check(mpc, oem, productid);
26 return found_numaq;
27}
28
29static int probe_numaq(void)
30{
31 /* already known from get_memcfg_numaq() */
32 return found_numaq;
33}
34
35/* Hook from generic ACPI tables.c */
36static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
37{
38 return 0;
39}
40
41struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index c5ae751b994a..ba18dec48555 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -16,6 +16,7 @@
16#include <asm/apicdef.h> 16#include <asm/apicdef.h>
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18 18
19extern struct genapic apic_numaq;
19extern struct genapic apic_summit; 20extern struct genapic apic_summit;
20extern struct genapic apic_bigsmp; 21extern struct genapic apic_bigsmp;
21extern struct genapic apic_es7000; 22extern struct genapic apic_es7000;
@@ -24,9 +25,18 @@ extern struct genapic apic_default;
24struct genapic *genapic = &apic_default; 25struct genapic *genapic = &apic_default;
25 26
26static struct genapic *apic_probe[] __initdata = { 27static struct genapic *apic_probe[] __initdata = {
28#ifdef CONFIG_X86_NUMAQ
29 &apic_numaq,
30#endif
31#ifdef CONFIG_X86_SUMMIT
27 &apic_summit, 32 &apic_summit,
33#endif
34#ifdef CONFIG_X86_BIGSMP
28 &apic_bigsmp, 35 &apic_bigsmp,
36#endif
37#ifdef CONFIG_X86_ES7000
29 &apic_es7000, 38 &apic_es7000,
39#endif
30 &apic_default, /* must be last */ 40 &apic_default, /* must be last */
31 NULL, 41 NULL,
32}; 42};
@@ -54,6 +64,7 @@ early_param("apic", parse_apic);
54 64
55void __init generic_bigsmp_probe(void) 65void __init generic_bigsmp_probe(void)
56{ 66{
67#ifdef CONFIG_X86_BIGSMP
57 /* 68 /*
58 * This routine is used to switch to bigsmp mode when 69 * This routine is used to switch to bigsmp mode when
59 * - There is no apic= option specified by the user 70 * - There is no apic= option specified by the user
@@ -67,6 +78,7 @@ void __init generic_bigsmp_probe(void)
67 printk(KERN_INFO "Overriding APIC driver with %s\n", 78 printk(KERN_INFO "Overriding APIC driver with %s\n",
68 genapic->name); 79 genapic->name);
69 } 80 }
81#endif
70} 82}
71 83
72void __init generic_apic_probe(void) 84void __init generic_apic_probe(void)
@@ -88,7 +100,8 @@ void __init generic_apic_probe(void)
88 100
89/* These functions can switch the APIC even after the initial ->probe() */ 101/* These functions can switch the APIC even after the initial ->probe() */
90 102
91int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) 103int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
104 char *productid)
92{ 105{
93 int i; 106 int i;
94 for (i = 0; apic_probe[i]; ++i) { 107 for (i = 0; apic_probe[i]; ++i) {
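With every subarch driver now compiled only when its Kconfig symbol is set, apic_probe[] holds just the built-in drivers plus apic_default as the required terminator, and probing reduces to walking a NULL-terminated table. The probe loop itself sits outside this hunk, so the sketch below is an illustration of how such a table is consumed rather than the kernel's exact code; it assumes the ->probe() member implied by APIC_INIT():

static void __init pick_genapic(void)
{
	int i;

	for (i = 0; apic_probe[i]; i++) {
		if (apic_probe[i]->probe()) {
			genapic = apic_probe[i];
			break;	/* apic_default, last in the table, always matches */
		}
	}
}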
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
index 57484e91ab90..a2fb78c0d154 100644
--- a/arch/x86/mach-visws/mpparse.c
+++ b/arch/x86/mach-visws/mpparse.c
@@ -8,11 +8,6 @@
8#include "cobalt.h" 8#include "cobalt.h"
9#include "mach_apic.h" 9#include "mach_apic.h"
10 10
11/* Have we found an MP table */
12int smp_found_config;
13
14int pic_mode;
15
16extern unsigned int __cpuinitdata maxcpus; 11extern unsigned int __cpuinitdata maxcpus;
17 12
18/* 13/*
@@ -76,7 +71,9 @@ void __init find_smp_config(void)
76 if (ncpus > maxcpus) 71 if (ncpus > maxcpus)
77 ncpus = maxcpus; 72 ncpus = maxcpus;
78 73
74#ifdef CONFIG_X86_LOCAL_APIC
79 smp_found_config = 1; 75 smp_found_config = 1;
76#endif
80 while (ncpus--) 77 while (ncpus--)
81 MP_processor_info(mp++); 78 MP_processor_info(mp++);
82 79
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 5ae5466b9eb9..f4aca9fa9546 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -62,6 +62,7 @@ void __init time_init_hook(void)
62char *__init machine_specific_memory_setup(void) 62char *__init machine_specific_memory_setup(void)
63{ 63{
64 char *who; 64 char *who;
65 int new_nr;
65 66
66 who = "NOT VOYAGER"; 67 who = "NOT VOYAGER";
67 68
@@ -111,7 +112,11 @@ char *__init machine_specific_memory_setup(void)
111 * Otherwise fake a memory map; one section from 0k->640k, 112 * Otherwise fake a memory map; one section from 0k->640k,
112 * the next section from 1mb->appropriate_mem_k 113 * the next section from 1mb->appropriate_mem_k
113 */ 114 */
114 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); 115 new_nr = boot_params.e820_entries;
116 sanitize_e820_map(boot_params.e820_map,
117 ARRAY_SIZE(boot_params.e820_map),
118 &new_nr);
119 boot_params.e820_entries = new_nr;
115 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) 120 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
116 < 0) { 121 < 0) {
117 unsigned long mem_size; 122 unsigned long mem_size;
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 8acbf0cdf1a5..8dedd01e909f 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -59,11 +59,6 @@ __u32 voyager_quad_processors = 0;
59 * activity count. Finally exported by i386_ksyms.c */ 59 * activity count. Finally exported by i386_ksyms.c */
60static int voyager_extended_cpus = 1; 60static int voyager_extended_cpus = 1;
61 61
62/* Have we found an SMP box - used by time.c to do the profiling
63 interrupt for timeslicing; do not set to 1 until the per CPU timer
64 interrupt is active */
65int smp_found_config = 0;
66
67/* Used for the invalidate map that's also checked in the spinlock */ 62/* Used for the invalidate map that's also checked in the spinlock */
68static volatile unsigned long smp_invalidate_needed; 63static volatile unsigned long smp_invalidate_needed;
69 64
@@ -1137,15 +1132,6 @@ void flush_tlb_all(void)
1137 on_each_cpu(do_flush_tlb_all, 0, 1, 1); 1132 on_each_cpu(do_flush_tlb_all, 0, 1, 1);
1138} 1133}
1139 1134
1140/* used to set up the trampoline for other CPUs when the memory manager
1141 * is sorted out */
1142void __init smp_alloc_memory(void)
1143{
1144 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
1145 if (__pa(trampoline_base) >= 0x93000)
1146 BUG();
1147}
1148
1149/* send a reschedule CPI to one CPU by physical CPU number*/ 1135/* send a reschedule CPI to one CPU by physical CPU number*/
1150static void voyager_smp_send_reschedule(int cpu) 1136static void voyager_smp_send_reschedule(int cpu)
1151{ 1137{
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..accc7c6c57fc 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/mmzone.h> 39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h> 40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
41 42
42struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
43EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
59/* 60/*
60 * 4) physnode_map - the mapping between a pfn and owning node 61 * 4) physnode_map - the mapping between a pfn and owning node
61 * physnode_map keeps track of the physical memory layout of a generic 62 * physnode_map keeps track of the physical memory layout of a generic
62 * numa node on a 256Mb break (each element of the array will 63 * numa node on a 64Mb break (each element of the array will
63 * represent 256Mb of memory and will be marked by the node id. so, 64 * represent 64Mb of memory and will be marked by the node id. so,
64 * if the first gig is on node 0, and the second gig is on node 1 65 * if the first gig is on node 0, and the second gig is on node 1
65 * physnode_map will contain: 66 * physnode_map will contain:
66 * 67 *
67 * physnode_map[0-3] = 0; 68 * physnode_map[0-15] = 0;
68 * physnode_map[4-7] = 1; 69 * physnode_map[16-31] = 1;
69 * physnode_map[8- ] = -1; 70 * physnode_map[32- ] = -1;
70 */ 71 */
71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 72s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
72EXPORT_SYMBOL(physnode_map); 73EXPORT_SYMBOL(physnode_map);
@@ -81,9 +82,9 @@ void memory_present(int nid, unsigned long start, unsigned long end)
81 printk(KERN_DEBUG " "); 82 printk(KERN_DEBUG " ");
82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 83 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 84 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
84 printk("%ld ", pfn); 85 printk(KERN_CONT "%ld ", pfn);
85 } 86 }
86 printk("\n"); 87 printk(KERN_CONT "\n");
87} 88}
88 89
89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 90unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -119,11 +120,11 @@ int __init get_memcfg_numa_flat(void)
119{ 120{
120 printk("NUMA - single node, flat memory mode\n"); 121 printk("NUMA - single node, flat memory mode\n");
121 122
122 /* Run the memory configuration and find the top of memory. */
123 propagate_e820_map();
124 node_start_pfn[0] = 0; 123 node_start_pfn[0] = 0;
125 node_end_pfn[0] = max_pfn; 124 node_end_pfn[0] = max_pfn;
125 e820_register_active_regions(0, 0, max_pfn);
126 memory_present(0, 0, max_pfn); 126 memory_present(0, 0, max_pfn);
127 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 128
128 /* Indicate there is one node available. */ 129 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 130 nodes_clear(node_online_map);
@@ -159,9 +160,17 @@ static void __init allocate_pgdat(int nid)
159 if (nid && node_has_online_mem(nid)) 160 if (nid && node_has_online_mem(nid))
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 162 else {
162 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); 163 unsigned long pgdat_phys;
163 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
165 (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT,
166 sizeof(pg_data_t),
167 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
170 "NODE_DATA");
164 } 171 }
172 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
173 nid, (unsigned long)NODE_DATA(nid));
165} 174}
166 175
167#ifdef CONFIG_DISCONTIGMEM 176#ifdef CONFIG_DISCONTIGMEM
@@ -202,8 +211,12 @@ void __init remap_numa_kva(void)
202 int node; 211 int node;
203 212
204 for_each_online_node(node) { 213 for_each_online_node(node) {
214 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
205 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 215 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
206 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 216 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
217 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
218 (unsigned long)vaddr,
219 node_remap_start_pfn[node] + pfn);
207 set_pmd_pfn((ulong) vaddr, 220 set_pmd_pfn((ulong) vaddr,
208 node_remap_start_pfn[node] + pfn, 221 node_remap_start_pfn[node] + pfn,
209 PAGE_KERNEL_LARGE); 222 PAGE_KERNEL_LARGE);
@@ -215,17 +228,21 @@ static unsigned long calculate_numa_remap_pages(void)
215{ 228{
216 int nid; 229 int nid;
217 unsigned long size, reserve_pages = 0; 230 unsigned long size, reserve_pages = 0;
218 unsigned long pfn;
219 231
220 for_each_online_node(nid) { 232 for_each_online_node(nid) {
221 unsigned old_end_pfn = node_end_pfn[nid]; 233 u64 node_end_target;
234 u64 node_end_final;
222 235
223 /* 236 /*
224 * The acpi/srat node info can show hot-add memory zones 237 * The acpi/srat node info can show hot-add memory zones
225 * where memory could be added but not currently present. 238 * where memory could be added but not currently present.
226 */ 239 */
240 printk("node %d pfn: [%lx - %lx]\n",
241 nid, node_start_pfn[nid], node_end_pfn[nid]);
227 if (node_start_pfn[nid] > max_pfn) 242 if (node_start_pfn[nid] > max_pfn)
228 continue; 243 continue;
244 if (!node_end_pfn[nid])
245 continue;
229 if (node_end_pfn[nid] > max_pfn) 246 if (node_end_pfn[nid] > max_pfn)
230 node_end_pfn[nid] = max_pfn; 247 node_end_pfn[nid] = max_pfn;
231 248
@@ -237,39 +254,42 @@ static unsigned long calculate_numa_remap_pages(void)
237 /* now the roundup is correct, convert to PAGE_SIZE pages */ 254 /* now the roundup is correct, convert to PAGE_SIZE pages */
238 size = size * PTRS_PER_PTE; 255 size = size * PTRS_PER_PTE;
239 256
240 /* 257 node_end_target = round_down(node_end_pfn[nid] - size,
241 * Validate the region we are allocating only contains valid 258 PTRS_PER_PTE);
242 * pages. 259 node_end_target <<= PAGE_SHIFT;
243 */ 260 do {
244 for (pfn = node_end_pfn[nid] - size; 261 node_end_final = find_e820_area(node_end_target,
245 pfn < node_end_pfn[nid]; pfn++) 262 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
246 if (!page_is_ram(pfn)) 263 ((u64)size)<<PAGE_SHIFT,
247 break; 264 LARGE_PAGE_BYTES);
248 265 node_end_target -= LARGE_PAGE_BYTES;
249 if (pfn != node_end_pfn[nid]) 266 } while (node_end_final == -1ULL &&
250 size = 0; 267 (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
268
269 if (node_end_final == -1ULL)
270 panic("Can not get kva ram\n");
251 271
252 printk("Reserving %ld pages of KVA for lmem_map of node %d\n", 272 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
253 size, nid); 273 size, nid);
254 node_remap_size[nid] = size; 274 node_remap_size[nid] = size;
255 node_remap_offset[nid] = reserve_pages; 275 node_remap_offset[nid] = reserve_pages;
256 reserve_pages += size; 276 reserve_pages += size;
257 printk("Shrinking node %d from %ld pages to %ld pages\n", 277 printk("Shrinking node %d from %ld pages to %lld pages\n",
258 nid, node_end_pfn[nid], node_end_pfn[nid] - size); 278 nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT);
259
260 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
261 /*
262 * Align node_end_pfn[] and node_remap_start_pfn[] to
263 * pmd boundary. remap_numa_kva will barf otherwise.
264 */
265 printk("Shrinking node %d further by %ld pages for proper alignment\n",
266 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
267 size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
268 }
269 279
270 node_end_pfn[nid] -= size; 280 /*
281 * prevent the kva address from falling below max_low_pfn;
282 * we want it that high on systems with less memory later.
283 * layout will be: KVA address, KVA RAM
284 */
285 if ((node_end_final>>PAGE_SHIFT) < max_low_pfn)
286 reserve_early(node_end_final,
287 node_end_final+(((u64)size)<<PAGE_SHIFT),
288 "KVA RAM");
289
290 node_end_pfn[nid] = node_end_final>>PAGE_SHIFT;
271 node_remap_start_pfn[nid] = node_end_pfn[nid]; 291 node_remap_start_pfn[nid] = node_end_pfn[nid];
272 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); 292 shrink_active_range(nid, node_end_pfn[nid]);
273 } 293 }
274 printk("Reserving total of %ld pages for numa KVA remap\n", 294 printk("Reserving total of %ld pages for numa KVA remap\n",
275 reserve_pages); 295 reserve_pages);
@@ -287,8 +307,7 @@ static void init_remap_allocator(int nid)
287 307
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, 308 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid], 309 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn 310 (ulong) node_remap_end_vaddr[nid]);
291 + node_remap_offset[nid] + node_remap_size[nid]));
292} 311}
293#else 312#else
294void *alloc_remap(int nid, unsigned long size) 313void *alloc_remap(int nid, unsigned long size)
@@ -315,7 +334,7 @@ unsigned long __init setup_memory(void)
315{ 334{
316 int nid; 335 int nid;
317 unsigned long system_start_pfn, system_max_low_pfn; 336 unsigned long system_start_pfn, system_max_low_pfn;
318 unsigned long wasted_pages; 337 long kva_target_pfn;
319 338
320 /* 339 /*
321 * When mapping a NUMA machine we allocate the node_mem_map arrays 340 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,34 +343,38 @@ unsigned long __init setup_memory(void)
324 * this space and use it to adjust the boundary between ZONE_NORMAL 343 * this space and use it to adjust the boundary between ZONE_NORMAL
325 * and ZONE_HIGHMEM. 344 * and ZONE_HIGHMEM.
326 */ 345 */
346
347 /* call find_max_low_pfn at first, it could update max_pfn */
348 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
349
350 remove_all_active_ranges();
327 get_memcfg_numa(); 351 get_memcfg_numa();
328 352
329 kva_pages = calculate_numa_remap_pages(); 353 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
330 354
331 /* partially used pages are not usable - thus round upwards */ 355 /* partially used pages are not usable - thus round upwards */
332 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); 356 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
333 357
334 kva_start_pfn = find_max_low_pfn() - kva_pages; 358 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
359 do {
360 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
361 max_low_pfn<<PAGE_SHIFT,
362 kva_pages<<PAGE_SHIFT,
363 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
364 kva_target_pfn -= PTRS_PER_PTE;
365 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
335 366
336#ifdef CONFIG_BLK_DEV_INITRD 367 if (kva_start_pfn == -1UL)
337 /* Numa kva area is below the initrd */ 368 panic("Cannot get kva space\n");
338 if (initrd_start)
339 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
340 - kva_pages;
341#endif
342 369
343 /*
344 * We waste pages past at the end of the KVA for no good reason other
345 * than how it is located. This is bad.
346 */
347 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
348 kva_start_pfn -= wasted_pages;
349 kva_pages += wasted_pages;
350
351 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
352 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", 370 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
353 kva_start_pfn, max_low_pfn); 371 kva_start_pfn, max_low_pfn);
354 printk("max_pfn = %ld\n", max_pfn); 372 printk("max_pfn = %ld\n", max_pfn);
373
374 /* avoid clash with initrd */
375 reserve_early(kva_start_pfn<<PAGE_SHIFT,
376 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
377 "KVA PG");
355#ifdef CONFIG_HIGHMEM 378#ifdef CONFIG_HIGHMEM
356 highstart_pfn = highend_pfn = max_pfn; 379 highstart_pfn = highend_pfn = max_pfn;
357 if (max_pfn > system_max_low_pfn) 380 if (max_pfn > system_max_low_pfn)
@@ -387,16 +410,8 @@ unsigned long __init setup_memory(void)
387 return max_low_pfn; 410 return max_low_pfn;
388} 411}
389 412
390void __init numa_kva_reserve(void)
391{
392 if (kva_pages)
393 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
394 BOOTMEM_DEFAULT);
395}
396
397void __init zone_sizes_init(void) 413void __init zone_sizes_init(void)
398{ 414{
399 int nid;
400 unsigned long max_zone_pfns[MAX_NR_ZONES]; 415 unsigned long max_zone_pfns[MAX_NR_ZONES];
401 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 416 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
402 max_zone_pfns[ZONE_DMA] = 417 max_zone_pfns[ZONE_DMA] =
@@ -406,15 +421,6 @@ void __init zone_sizes_init(void)
406 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 421 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
407#endif 422#endif
408 423
409 /* If SRAT has not registered memory, register it now */
410 if (find_max_pfn_with_active_regions() == 0) {
411 for_each_online_node(nid) {
412 if (node_has_online_mem(nid))
413 add_active_range(nid, node_start_pfn[nid],
414 node_end_pfn[nid]);
415 }
416 }
417
418 free_area_init_nodes(max_zone_pfns); 424 free_area_init_nodes(max_zone_pfns);
419 return; 425 return;
420} 426}
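Both new allocation sites in this file (the per-node KVA RAM block and the global KVA remap window) use the same idiom: ask find_e820_area() for an aligned block just below the highest acceptable address, then step the target downwards until the search succeeds or the floor is reached. Factored out under the signatures this diff relies on, with a helper name of our own:

/* find 'size' bytes below 'limit', 'align'-aligned, scanning downwards */
static u64 __init find_area_top_down(u64 floor, u64 limit, u64 size, u64 align)
{
	u64 target = round_down(limit - size, align);

	while (target > floor) {
		u64 addr = find_e820_area(target, limit, size, align);

		if (addr != -1ULL)
			return addr;
		target -= align;	/* slide the window down and retry */
	}
	return -1ULL;
}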
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..0e7bb5e81670 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -289,7 +289,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
289 289
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
291{ 291{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) &&
293 !page_is_reserved_early(pfn)) {
293 ClearPageReserved(page); 294 ClearPageReserved(page);
294 init_page_count(page); 295 init_page_count(page);
295 __free_page(page); 296 __free_page(page);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..afb07ffb931d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -233,7 +233,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
233 else 233 else
234 bootmap_start = round_up(start, PAGE_SIZE); 234 bootmap_start = round_up(start, PAGE_SIZE);
235 /* 235 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 236 * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
237 * to use that to align to PAGE_SIZE 237 * to use that to align to PAGE_SIZE
238 */ 238 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 239 bootmap = early_node_mem(nodeid, bootmap_start, end,
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
index 89ec35d00efd..962d96c0495a 100644
--- a/arch/x86/pci/Makefile_32
+++ b/arch/x86/pci/Makefile_32
@@ -13,10 +13,11 @@ pci-y := fixup.o
13pci-$(CONFIG_ACPI) += acpi.o 13pci-$(CONFIG_ACPI) += acpi.o
14pci-y += legacy.o irq.o 14pci-y += legacy.o irq.o
15 15
16# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are 16# Careful: VISWS overrules the pci-y above. The colons are
17# therefore correct. This needs a proper fix by disentangling the code. 17# therefore correct. This needs a proper fix by disentangling the code.
18pci-$(CONFIG_X86_VISWS) := visws.o fixup.o 18pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
19pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o 19
20pci-$(CONFIG_X86_NUMAQ) += numa.o
20 21
21# Necessary for NUMAQ as well 22# Necessary for NUMAQ as well
22pci-$(CONFIG_NUMA) += mp_bus_to_node.o 23pci-$(CONFIG_NUMA) += mp_bus_to_node.o
diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c
index 5c2799c20e47..bfefdf0f40d4 100644
--- a/arch/x86/pci/k8-bus_64.c
+++ b/arch/x86/pci/k8-bus_64.c
@@ -384,7 +384,7 @@ static int __init early_fill_mp_bus_info(void)
384 /* need to take out [0, TOM) for RAM*/ 384 /* need to take out [0, TOM) for RAM*/
385 address = MSR_K8_TOP_MEM1; 385 address = MSR_K8_TOP_MEM1;
386 rdmsrl(address, val); 386 rdmsrl(address, val);
387 end = (val & 0xffffff8000000ULL); 387 end = (val & 0xffffff800000ULL);
388 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); 388 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
389 if (end < (1ULL<<32)) 389 if (end < (1ULL<<32))
390 update_range(range, 0, end - 1); 390 update_range(range, 0, end - 1);
@@ -478,7 +478,7 @@ static int __init early_fill_mp_bus_info(void)
478 /* TOP_MEM2 */ 478 /* TOP_MEM2 */
479 address = MSR_K8_TOP_MEM2; 479 address = MSR_K8_TOP_MEM2;
480 rdmsrl(address, val); 480 rdmsrl(address, val);
481 end = (val & 0xffffff8000000ULL); 481 end = (val & 0xffffff800000ULL);
482 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); 482 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
483 update_range(range, 1ULL<<32, end - 1); 483 update_range(range, 1ULL<<32, end - 1);
484 } 484 }
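The one-character mask fix above is easy to misread: TOP_MEM and TOP_MEM2 hold an 8MB-aligned top-of-memory address in bits 47:23 of the MSR, so the mask must be 0xffffff800000. The old constant carried one extra zero, selecting bits 51:27 instead and corrupting the computed limit. Spelled out as a check:

/* bits 47:23 of TOP_MEM*, i.e. ((1ULL << 48) - 1) & ~((1ULL << 23) - 1) */
#define K8_TOM_MASK	0xffffff800000ULL

static inline u64 k8_top_of_memory(u64 msr_val)
{
	return msr_val & K8_TOM_MASK;	/* 8MB granularity */
}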
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c
index d9afbae5092b..99f1ecd485b5 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numa.c
@@ -6,45 +6,21 @@
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <mach_apic.h> 8#include <mach_apic.h>
9#include <asm/mpspec.h>
9#include "pci.h" 10#include "pci.h"
10 11
11#define XQUAD_PORTIO_BASE 0xfe400000 12#define XQUAD_PORTIO_BASE 0xfe400000
12#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ 13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
13 14
14int mp_bus_id_to_node[MAX_MP_BUSSES];
15#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 15#define BUS2QUAD(global) (mp_bus_id_to_node[global])
16 16
17int mp_bus_id_to_local[MAX_MP_BUSSES];
18#define BUS2LOCAL(global) (mp_bus_id_to_local[global]) 17#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
19 18
20void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
21 struct mpc_config_translation *translation)
22{
23 int quad = translation->trans_quad;
24 int local = translation->trans_local;
25
26 mp_bus_id_to_node[m->mpc_busid] = quad;
27 mp_bus_id_to_local[m->mpc_busid] = local;
28 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
29 m->mpc_busid, name, quad);
30}
31
32int quad_local_to_mp_bus_id [NR_CPUS/4][4];
33#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
34void mpc_oem_pci_bus(struct mpc_config_bus *m,
35 struct mpc_config_translation *translation)
36{
37 int quad = translation->trans_quad;
38 int local = translation->trans_local;
39
40 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
41}
42 20
43/* Where the IO area was mapped on multiquad, always 0 otherwise */ 21/* Where the IO area was mapped on multiquad, always 0 otherwise */
44void *xquad_portio; 22void *xquad_portio;
45#ifdef CONFIG_X86_NUMAQ
46EXPORT_SYMBOL(xquad_portio); 23EXPORT_SYMBOL(xquad_portio);
47#endif
48 24
49#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) 25#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
50 26
@@ -179,6 +155,9 @@ static int __init pci_numa_init(void)
179{ 155{
180 int quad; 156 int quad;
181 157
158 if (!found_numaq)
159 return 0;
160
182 raw_pci_ops = &pci_direct_conf1_mq; 161 raw_pci_ops = &pci_direct_conf1_mq;
183 162
184 if (pcibios_scanned++) 163 if (pcibios_scanned++)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09c1c69c37a..275163f81464 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1196,6 +1196,7 @@ asmlinkage void __init xen_start_kernel(void)
1196 1196
1197 pgd = (pgd_t *)xen_start_info->pt_base; 1197 pgd = (pgd_t *)xen_start_info->pt_base;
1198 1198
1199 init_pg_tables_start = __pa(pgd);
1199 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1200 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1200 1201
1201 init_mm.pgd = pgd; /* use the Xen pagetables to start */ 1202 init_mm.pgd = pgd; /* use the Xen pagetables to start */
@@ -1236,5 +1237,5 @@ asmlinkage void __init xen_start_kernel(void)
1236 add_preferred_console("hvc", 0, NULL); 1237 add_preferred_console("hvc", 0, NULL);
1237 1238
1238 /* Start the world */ 1239 /* Start the world */
1239 start_kernel(); 1240 i386_start_kernel();
1240} 1241}
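lguest and Xen now record where their boot page tables begin, not just where they end, so that generic 32-bit startup code can reserve the live range [init_pg_tables_start, init_pg_tables_end) instead of assuming the tables sit at a fixed location. The reservation itself happens in startup code outside this diff, so the helper below is illustrative only:

/* hypothetical consumer: keep the live boot page tables away from allocators */
static void __init reserve_boot_page_tables(void)
{
	if (init_pg_tables_end > init_pg_tables_start)
		reserve_early(init_pg_tables_start, init_pg_tables_end,
			      "INIT_PG_TABLE");
}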