aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-05-19 21:07:31 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-05-19 21:07:31 -0400
commit13588209aa90d9c8e502750fc86160314555612f (patch)
tree91f5514aebf7244886070a6894c8e86c2b7ff4ce /arch/x86
parentac2941f59a38eeb535e1f227a8f90d7fe6b7828b (diff)
parentdc382fd5bcca7098a984705ed6ac880f539d068e (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (50 commits) x86, mm: Allow ZONE_DMA to be configurable x86, NUMA: Trim numa meminfo with max_pfn in a separate loop x86, NUMA: Rename setup_node_bootmem() to setup_node_data() x86, NUMA: Enable emulation on 32bit too x86, NUMA: Enable CONFIG_AMD_NUMA on 32bit too x86, NUMA: Rename amdtopology_64.c to amdtopology.c x86, NUMA: Make numa_init_array() static x86, NUMA: Make 32bit use common NUMA init path x86, NUMA: Initialize and use remap allocator from setup_node_bootmem() x86-32, NUMA: Add @start and @end to init_alloc_remap() x86, NUMA: Remove long 64bit assumption from numa.c x86, NUMA: Enable build of generic NUMA init code on 32bit x86, NUMA: Move NUMA init logic from numa_64.c to numa.c x86-32, NUMA: Update numaq to use new NUMA init protocol x86-32, NUMA: Replace srat_32.c with srat.c x86-32, NUMA: implement temporary NUMA init shims x86, NUMA: Move numa_nodes_parsed to numa.[hc] x86-32, NUMA: Move get_memcfg_numa() into numa_32.c x86, NUMA: make srat.c 32bit safe x86, NUMA: rename srat_64.c to srat.c ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig43
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/apic.h9
-rw-r--r--arch/x86/include/asm/cpufeature.h13
-rw-r--r--arch/x86/include/asm/dma.h12
-rw-r--r--arch/x86/include/asm/mmzone_32.h20
-rw-r--r--arch/x86/include/asm/mmzone_64.h23
-rw-r--r--arch/x86/include/asm/numa.h32
-rw-r--r--arch/x86/include/asm/numa_32.h10
-rw-r--r--arch/x86/include/asm/numa_64.h36
-rw-r--r--arch/x86/include/asm/numaq.h7
-rw-r--r--arch/x86/include/asm/percpu.h27
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/topology.h8
-rw-r--r--arch/x86/kernel/apic/apic.c28
-rw-r--r--arch/x86/kernel/apic/apic_noop.c9
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c1
-rw-r--r--arch/x86/kernel/apic/es7000_32.c7
-rw-r--r--arch/x86/kernel/apic/numaq_32.c34
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/summit_32.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c7
-rw-r--r--arch/x86/kernel/mpparse.c2
-rw-r--r--arch/x86/kernel/process.c4
-rw-r--r--arch/x86/kernel/smpboot.c4
-rw-r--r--arch/x86/mm/Makefile4
-rw-r--r--arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/amdtopology_64.c)21
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c10
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/numa.c550
-rw-r--r--arch/x86/mm/numa_32.c398
-rw-r--r--arch/x86/mm/numa_64.c644
-rw-r--r--arch/x86/mm/numa_emulation.c16
-rw-r--r--arch/x86/mm/numa_internal.h8
-rw-r--r--arch/x86/mm/srat.c (renamed from arch/x86/mm/srat_64.c)82
-rw-r--r--arch/x86/mm/srat_32.c288
38 files changed, 826 insertions, 1592 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 38adb2dca1d5..0a1fe60037f2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -112,7 +112,14 @@ config MMU
112 def_bool y 112 def_bool y
113 113
114config ZONE_DMA 114config ZONE_DMA
115 def_bool y 115 bool "DMA memory allocation support" if EXPERT
116 default y
117 help
118 DMA memory allocation support allows devices with less than 32-bit
119 addressing to allocate within the first 16MB of address space.
120 Disable if no such devices will be used.
121
122 If unsure, say Y.
116 123
117config SBUS 124config SBUS
118 bool 125 bool
@@ -1164,7 +1171,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
1164config AMD_NUMA 1171config AMD_NUMA
1165 def_bool y 1172 def_bool y
1166 prompt "Old style AMD Opteron NUMA detection" 1173 prompt "Old style AMD Opteron NUMA detection"
1167 depends on X86_64 && NUMA && PCI 1174 depends on NUMA && PCI
1168 ---help--- 1175 ---help---
1169 Enable AMD NUMA node topology detection. You should say Y here if 1176 Enable AMD NUMA node topology detection. You should say Y here if
1170 you have a multi processor AMD system. This uses an old method to 1177 you have a multi processor AMD system. This uses an old method to
@@ -1191,7 +1198,7 @@ config NODES_SPAN_OTHER_NODES
1191 1198
1192config NUMA_EMU 1199config NUMA_EMU
1193 bool "NUMA emulation" 1200 bool "NUMA emulation"
1194 depends on X86_64 && NUMA 1201 depends on NUMA
1195 ---help--- 1202 ---help---
1196 Enable NUMA emulation. A flat machine will be split 1203 Enable NUMA emulation. A flat machine will be split
1197 into virtual nodes when booted with "numa=fake=N", where N is the 1204 into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1213,6 +1220,10 @@ config HAVE_ARCH_BOOTMEM
1213 def_bool y 1220 def_bool y
1214 depends on X86_32 && NUMA 1221 depends on X86_32 && NUMA
1215 1222
1223config HAVE_ARCH_ALLOC_REMAP
1224 def_bool y
1225 depends on X86_32 && NUMA
1226
1216config ARCH_HAVE_MEMORY_PRESENT 1227config ARCH_HAVE_MEMORY_PRESENT
1217 def_bool y 1228 def_bool y
1218 depends on X86_32 && DISCONTIGMEM 1229 depends on X86_32 && DISCONTIGMEM
@@ -1221,13 +1232,9 @@ config NEED_NODE_MEMMAP_SIZE
1221 def_bool y 1232 def_bool y
1222 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) 1233 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
1223 1234
1224config HAVE_ARCH_ALLOC_REMAP
1225 def_bool y
1226 depends on X86_32 && NUMA
1227
1228config ARCH_FLATMEM_ENABLE 1235config ARCH_FLATMEM_ENABLE
1229 def_bool y 1236 def_bool y
1230 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA 1237 depends on X86_32 && !NUMA
1231 1238
1232config ARCH_DISCONTIGMEM_ENABLE 1239config ARCH_DISCONTIGMEM_ENABLE
1233 def_bool y 1240 def_bool y
@@ -1237,20 +1244,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
1237 def_bool y 1244 def_bool y
1238 depends on NUMA && X86_32 1245 depends on NUMA && X86_32
1239 1246
1240config ARCH_PROC_KCORE_TEXT
1241 def_bool y
1242 depends on X86_64 && PROC_KCORE
1243
1244config ARCH_SPARSEMEM_DEFAULT
1245 def_bool y
1246 depends on X86_64
1247
1248config ARCH_SPARSEMEM_ENABLE 1247config ARCH_SPARSEMEM_ENABLE
1249 def_bool y 1248 def_bool y
1250 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD 1249 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
1251 select SPARSEMEM_STATIC if X86_32 1250 select SPARSEMEM_STATIC if X86_32
1252 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 1251 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1253 1252
1253config ARCH_SPARSEMEM_DEFAULT
1254 def_bool y
1255 depends on X86_64
1256
1254config ARCH_SELECT_MEMORY_MODEL 1257config ARCH_SELECT_MEMORY_MODEL
1255 def_bool y 1258 def_bool y
1256 depends on ARCH_SPARSEMEM_ENABLE 1259 depends on ARCH_SPARSEMEM_ENABLE
@@ -1259,6 +1262,10 @@ config ARCH_MEMORY_PROBE
1259 def_bool X86_64 1262 def_bool X86_64
1260 depends on MEMORY_HOTPLUG 1263 depends on MEMORY_HOTPLUG
1261 1264
1265config ARCH_PROC_KCORE_TEXT
1266 def_bool y
1267 depends on X86_64 && PROC_KCORE
1268
1262config ILLEGAL_POINTER_VALUE 1269config ILLEGAL_POINTER_VALUE
1263 hex 1270 hex
1264 default 0 if X86_32 1271 default 0 if X86_32
@@ -1693,10 +1700,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
1693 def_bool y 1700 def_bool y
1694 depends on MEMORY_HOTPLUG 1701 depends on MEMORY_HOTPLUG
1695 1702
1696config HAVE_ARCH_EARLY_PFN_TO_NID
1697 def_bool X86_64
1698 depends on NUMA
1699
1700config USE_PERCPU_NUMA_NODE_ID 1703config USE_PERCPU_NUMA_NODE_ID
1701 def_bool y 1704 def_bool y
1702 depends on NUMA 1705 depends on NUMA
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 12e0e7dd869c..416d865eae39 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -183,8 +183,6 @@ static inline void disable_acpi(void) { }
183 183
184#define ARCH_HAS_POWER_INIT 1 184#define ARCH_HAS_POWER_INIT 1
185 185
186struct bootnode;
187
188#ifdef CONFIG_ACPI_NUMA 186#ifdef CONFIG_ACPI_NUMA
189extern int acpi_numa; 187extern int acpi_numa;
190extern int x86_acpi_numa_init(void); 188extern int x86_acpi_numa_init(void);
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 331682231bb4..67f87f257611 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range {
11 11
12extern const struct pci_device_id amd_nb_misc_ids[]; 12extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14struct bootnode;
15 14
16extern bool early_is_amd_nb(u32 value); 15extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 16extern int amd_cache_northbridges(void);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2b7d573be549..a0c46f061210 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -363,7 +363,12 @@ struct apic {
363 */ 363 */
364 int (*x86_32_early_logical_apicid)(int cpu); 364 int (*x86_32_early_logical_apicid)(int cpu);
365 365
366 /* determine CPU -> NUMA node mapping */ 366 /*
367 * Optional method called from setup_local_APIC() after logical
368 * apicid is guaranteed to be known to initialize apicid -> node
369 * mapping if NUMA initialization hasn't done so already. Don't
370 * add new users.
371 */
367 int (*x86_32_numa_cpu_node)(int cpu); 372 int (*x86_32_numa_cpu_node)(int cpu);
368#endif 373#endif
369}; 374};
@@ -537,8 +542,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
537 return cpuid_apic >> index_msb; 542 return cpuid_apic >> index_msb;
538} 543}
539 544
540extern int default_x86_32_numa_cpu_node(int cpu);
541
542#endif 545#endif
543 546
544static inline unsigned int 547static inline unsigned int
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7f2f7b123293..30afb465d486 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -208,8 +208,7 @@ extern const char * const x86_power_flags[32];
208#define test_cpu_cap(c, bit) \ 208#define test_cpu_cap(c, bit) \
209 test_bit(bit, (unsigned long *)((c)->x86_capability)) 209 test_bit(bit, (unsigned long *)((c)->x86_capability))
210 210
211#define cpu_has(c, bit) \ 211#define REQUIRED_MASK_BIT_SET(bit) \
212 (__builtin_constant_p(bit) && \
213 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ 212 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
214 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ 213 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
215 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ 214 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
@@ -219,10 +218,16 @@ extern const char * const x86_power_flags[32];
219 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ 218 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
220 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ 219 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
221 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ 220 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
222 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ 221 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
223 ? 1 : \ 222
223#define cpu_has(c, bit) \
224 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
224 test_cpu_cap(c, bit)) 225 test_cpu_cap(c, bit))
225 226
227#define this_cpu_has(bit) \
228 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
229 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
230
226#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) 231#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
227 232
228#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) 233#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 057099e5faba..0bdb0c54d9a1 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -69,22 +69,18 @@
69 69
70#define MAX_DMA_CHANNELS 8 70#define MAX_DMA_CHANNELS 8
71 71
72#ifdef CONFIG_X86_32
73
74/* The maximum address that we can perform a DMA transfer to on this platform */
75#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
76
77#else
78
79/* 16MB ISA DMA zone */ 72/* 16MB ISA DMA zone */
80#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) 73#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
81 74
82/* 4GB broken PCI/AGP hardware bus master zone */ 75/* 4GB broken PCI/AGP hardware bus master zone */
83#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) 76#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
84 77
78#ifdef CONFIG_X86_32
79/* The maximum address that we can perform a DMA transfer to on this platform */
80#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
81#else
85/* Compat define for old dma zone */ 82/* Compat define for old dma zone */
86#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) 83#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
87
88#endif 84#endif
89 85
90/* 8237 DMA controllers */ 86/* 8237 DMA controllers */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 91df7c51806c..5e83a416eca8 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -13,31 +13,11 @@ extern struct pglist_data *node_data[];
13#define NODE_DATA(nid) (node_data[nid]) 13#define NODE_DATA(nid) (node_data[nid])
14 14
15#include <asm/numaq.h> 15#include <asm/numaq.h>
16/* summit or generic arch */
17#include <asm/srat.h>
18
19extern int get_memcfg_numa_flat(void);
20/*
21 * This allows any one NUMA architecture to be compiled
22 * for, and still fall back to the flat function if it
23 * fails.
24 */
25static inline void get_memcfg_numa(void)
26{
27
28 if (get_memcfg_numaq())
29 return;
30 if (get_memcfg_from_srat())
31 return;
32 get_memcfg_numa_flat();
33}
34 16
35extern void resume_map_numa_kva(pgd_t *pgd); 17extern void resume_map_numa_kva(pgd_t *pgd);
36 18
37#else /* !CONFIG_NUMA */ 19#else /* !CONFIG_NUMA */
38 20
39#define get_memcfg_numa get_memcfg_numa_flat
40
41static inline void resume_map_numa_kva(pgd_t *pgd) {} 21static inline void resume_map_numa_kva(pgd_t *pgd) {}
42 22
43#endif /* CONFIG_NUMA */ 23#endif /* CONFIG_NUMA */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f815a6..b3f88d7867c7 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,36 +4,13 @@
4#ifndef _ASM_X86_MMZONE_64_H 4#ifndef _ASM_X86_MMZONE_64_H
5#define _ASM_X86_MMZONE_64_H 5#define _ASM_X86_MMZONE_64_H
6 6
7
8#ifdef CONFIG_NUMA 7#ifdef CONFIG_NUMA
9 8
10#include <linux/mmdebug.h> 9#include <linux/mmdebug.h>
11
12#include <asm/smp.h> 10#include <asm/smp.h>
13 11
14/* Simple perfect hash to map physical addresses to node numbers */
15struct memnode {
16 int shift;
17 unsigned int mapsize;
18 s16 *map;
19 s16 embedded_map[64 - 8];
20} ____cacheline_aligned; /* total size = 128 bytes */
21extern struct memnode memnode;
22#define memnode_shift memnode.shift
23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25
26extern struct pglist_data *node_data[]; 12extern struct pglist_data *node_data[];
27 13
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{
30 unsigned nid;
31 VIRTUAL_BUG_ON(!memnodemap);
32 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid;
35}
36
37#define NODE_DATA(nid) (node_data[nid]) 14#define NODE_DATA(nid) (node_data[nid])
38 15
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 16#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index a50fc9f493b3..bfacd2ccf651 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,12 +1,24 @@
1#ifndef _ASM_X86_NUMA_H 1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H 2#define _ASM_X86_NUMA_H
3 3
4#include <linux/nodemask.h>
5
4#include <asm/topology.h> 6#include <asm/topology.h>
5#include <asm/apicdef.h> 7#include <asm/apicdef.h>
6 8
7#ifdef CONFIG_NUMA 9#ifdef CONFIG_NUMA
8 10
9#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 11#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
12#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
13
14/*
15 * Too small node sizes may confuse the VM badly. Usually they
16 * result from BIOS bugs. So dont recognize nodes as standalone
17 * NUMA entities that have less than this amount of RAM listed:
18 */
19#define NODE_MIN_SIZE (4*1024*1024)
20
21extern int numa_off;
10 22
11/* 23/*
12 * __apicid_to_node[] stores the raw mapping between physical apicid and 24 * __apicid_to_node[] stores the raw mapping between physical apicid and
@@ -17,15 +29,27 @@
17 * numa_cpu_node(). 29 * numa_cpu_node().
18 */ 30 */
19extern s16 __apicid_to_node[MAX_LOCAL_APIC]; 31extern s16 __apicid_to_node[MAX_LOCAL_APIC];
32extern nodemask_t numa_nodes_parsed __initdata;
33
34extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
35extern void __init numa_set_distance(int from, int to, int distance);
20 36
21static inline void set_apicid_to_node(int apicid, s16 node) 37static inline void set_apicid_to_node(int apicid, s16 node)
22{ 38{
23 __apicid_to_node[apicid] = node; 39 __apicid_to_node[apicid] = node;
24} 40}
41
42extern int __cpuinit numa_cpu_node(int cpu);
43
25#else /* CONFIG_NUMA */ 44#else /* CONFIG_NUMA */
26static inline void set_apicid_to_node(int apicid, s16 node) 45static inline void set_apicid_to_node(int apicid, s16 node)
27{ 46{
28} 47}
48
49static inline int numa_cpu_node(int cpu)
50{
51 return NUMA_NO_NODE;
52}
29#endif /* CONFIG_NUMA */ 53#endif /* CONFIG_NUMA */
30 54
31#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
@@ -37,14 +61,12 @@ static inline void set_apicid_to_node(int apicid, s16 node)
37#ifdef CONFIG_NUMA 61#ifdef CONFIG_NUMA
38extern void __cpuinit numa_set_node(int cpu, int node); 62extern void __cpuinit numa_set_node(int cpu, int node);
39extern void __cpuinit numa_clear_node(int cpu); 63extern void __cpuinit numa_clear_node(int cpu);
40extern void __init numa_init_array(void);
41extern void __init init_cpu_to_node(void); 64extern void __init init_cpu_to_node(void);
42extern void __cpuinit numa_add_cpu(int cpu); 65extern void __cpuinit numa_add_cpu(int cpu);
43extern void __cpuinit numa_remove_cpu(int cpu); 66extern void __cpuinit numa_remove_cpu(int cpu);
44#else /* CONFIG_NUMA */ 67#else /* CONFIG_NUMA */
45static inline void numa_set_node(int cpu, int node) { } 68static inline void numa_set_node(int cpu, int node) { }
46static inline void numa_clear_node(int cpu) { } 69static inline void numa_clear_node(int cpu) { }
47static inline void numa_init_array(void) { }
48static inline void init_cpu_to_node(void) { } 70static inline void init_cpu_to_node(void) { }
49static inline void numa_add_cpu(int cpu) { } 71static inline void numa_add_cpu(int cpu) { }
50static inline void numa_remove_cpu(int cpu) { } 72static inline void numa_remove_cpu(int cpu) { }
@@ -54,4 +76,10 @@ static inline void numa_remove_cpu(int cpu) { }
54void debug_cpumask_set_cpu(int cpu, int node, bool enable); 76void debug_cpumask_set_cpu(int cpu, int node, bool enable);
55#endif 77#endif
56 78
79#ifdef CONFIG_NUMA_EMU
80#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
81#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
82void numa_emu_cmdline(char *);
83#endif /* CONFIG_NUMA_EMU */
84
57#endif /* _ASM_X86_NUMA_H */ 85#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index c6beed1ef103..e7d6b8254742 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,16 +1,6 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int numa_off;
5
6extern int pxm_to_nid(int pxm);
7
8#ifdef CONFIG_NUMA
9extern int __cpuinit numa_cpu_node(int cpu);
10#else /* CONFIG_NUMA */
11static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
12#endif /* CONFIG_NUMA */
13
14#ifdef CONFIG_HIGHMEM 4#ifdef CONFIG_HIGHMEM
15extern void set_highmem_pages_init(void); 5extern void set_highmem_pages_init(void);
16#else 6#else
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 344eb1790b46..0c05f7ae46e8 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -1,42 +1,6 @@
1#ifndef _ASM_X86_NUMA_64_H 1#ifndef _ASM_X86_NUMA_64_H
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h>
5
6struct bootnode {
7 u64 start;
8 u64 end;
9};
10
11#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
12
13extern int numa_off;
14
15extern unsigned long numa_free_all_bootmem(void); 4extern unsigned long numa_free_all_bootmem(void);
16extern void setup_node_bootmem(int nodeid, unsigned long start,
17 unsigned long end);
18
19#ifdef CONFIG_NUMA
20/*
21 * Too small node sizes may confuse the VM badly. Usually they
22 * result from BIOS bugs. So dont recognize nodes as standalone
23 * NUMA entities that have less than this amount of RAM listed:
24 */
25#define NODE_MIN_SIZE (4*1024*1024)
26
27extern nodemask_t numa_nodes_parsed __initdata;
28
29extern int __cpuinit numa_cpu_node(int cpu);
30extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
31extern void __init numa_set_distance(int from, int to, int distance);
32
33#ifdef CONFIG_NUMA_EMU
34#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
35#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
36void numa_emu_cmdline(char *);
37#endif /* CONFIG_NUMA_EMU */
38#else
39static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
40#endif
41 5
42#endif /* _ASM_X86_NUMA_64_H */ 6#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 37c516545ec8..c3b3c322fd87 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -29,7 +29,7 @@
29#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
30 30
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int numaq_numa_init(void);
33extern int pci_numaq_init(void); 33extern int pci_numaq_init(void);
34 34
35extern void *xquad_portio; 35extern void *xquad_portio;
@@ -166,11 +166,6 @@ struct sys_cfg_data {
166 166
167void numaq_tsc_disable(void); 167void numaq_tsc_disable(void);
168 168
169#else
170static inline int get_memcfg_numaq(void)
171{
172 return 0;
173}
174#endif /* CONFIG_X86_NUMAQ */ 169#endif /* CONFIG_X86_NUMAQ */
175#endif /* _ASM_X86_NUMAQ_H */ 170#endif /* _ASM_X86_NUMAQ_H */
176 171
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 751e7f3f705c..53278b0dfdf6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -542,6 +542,33 @@ do { \
542 old__; \ 542 old__; \
543}) 543})
544 544
545static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
546 const unsigned long __percpu *addr)
547{
548 unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
549
550 return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
551}
552
553static inline int x86_this_cpu_variable_test_bit(int nr,
554 const unsigned long __percpu *addr)
555{
556 int oldbit;
557
558 asm volatile("bt "__percpu_arg(2)",%1\n\t"
559 "sbb %0,%0"
560 : "=r" (oldbit)
561 : "m" (*(unsigned long *)addr), "Ir" (nr));
562
563 return oldbit;
564}
565
566#define x86_this_cpu_test_bit(nr, addr) \
567 (__builtin_constant_p((nr)) \
568 ? x86_this_cpu_constant_test_bit((nr), (addr)) \
569 : x86_this_cpu_variable_test_bit((nr), (addr)))
570
571
545#include <asm-generic/percpu.h> 572#include <asm-generic/percpu.h>
546 573
547/* We can use this directly for local CPU (faster). */ 574/* We can use this directly for local CPU (faster). */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
deleted file mode 100644
index b508d639d1a7..000000000000
--- a/arch/x86/include/asm/srat.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26
27#ifndef _ASM_X86_SRAT_H
28#define _ASM_X86_SRAT_H
29
30#ifdef CONFIG_ACPI_NUMA
31extern int get_memcfg_from_srat(void);
32#else
33static inline int get_memcfg_from_srat(void)
34{
35 return 0;
36}
37#endif
38
39#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a7084f7f2..c00692476e9f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -93,19 +93,11 @@ extern void setup_node_to_cpumask_map(void);
93#define pcibus_to_node(bus) __pcibus_to_node(bus) 93#define pcibus_to_node(bus) __pcibus_to_node(bus)
94 94
95#ifdef CONFIG_X86_32 95#ifdef CONFIG_X86_32
96extern unsigned long node_start_pfn[];
97extern unsigned long node_end_pfn[];
98extern unsigned long node_remap_size[];
99#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
100
101# define SD_CACHE_NICE_TRIES 1 96# define SD_CACHE_NICE_TRIES 1
102# define SD_IDLE_IDX 1 97# define SD_IDLE_IDX 1
103
104#else 98#else
105
106# define SD_CACHE_NICE_TRIES 2 99# define SD_CACHE_NICE_TRIES 2
107# define SD_IDLE_IDX 2 100# define SD_IDLE_IDX 2
108
109#endif 101#endif
110 102
111/* sched_domains SD_NODE_INIT for NUMA machines */ 103/* sched_domains SD_NODE_INIT for NUMA machines */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ae147126b7b7..f92a8e5d1e21 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void)
505{ 505{
506 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 506 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
507 507
508 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { 508 if (this_cpu_has(X86_FEATURE_ARAT)) {
509 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 509 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
510 /* Make LAPIC timer preferrable over percpu HPET */ 510 /* Make LAPIC timer preferrable over percpu HPET */
511 lapic_clockevent.rating = 150; 511 lapic_clockevent.rating = 150;
@@ -1237,6 +1237,17 @@ void __cpuinit setup_local_APIC(void)
1237 /* always use the value from LDR */ 1237 /* always use the value from LDR */
1238 early_per_cpu(x86_cpu_to_logical_apicid, cpu) = 1238 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1239 logical_smp_processor_id(); 1239 logical_smp_processor_id();
1240
1241 /*
1242 * Some NUMA implementations (NUMAQ) don't initialize apicid to
1243 * node mapping during NUMA init. Now that logical apicid is
1244 * guaranteed to be known, give it another chance. This is already
1245 * a bit too late - percpu allocation has already happened without
1246 * proper NUMA affinity.
1247 */
1248 if (apic->x86_32_numa_cpu_node)
1249 set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
1250 apic->x86_32_numa_cpu_node(cpu));
1240#endif 1251#endif
1241 1252
1242 /* 1253 /*
@@ -2014,21 +2025,6 @@ void default_init_apic_ldr(void)
2014 apic_write(APIC_LDR, val); 2025 apic_write(APIC_LDR, val);
2015} 2026}
2016 2027
2017#ifdef CONFIG_X86_32
2018int default_x86_32_numa_cpu_node(int cpu)
2019{
2020#ifdef CONFIG_NUMA
2021 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
2022
2023 if (apicid != BAD_APICID)
2024 return __apicid_to_node[apicid];
2025 return NUMA_NO_NODE;
2026#else
2027 return 0;
2028#endif
2029}
2030#endif
2031
2032/* 2028/*
2033 * Power management 2029 * Power management
2034 */ 2030 */
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index f1baa2dc087a..775b82bc655c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v)
119 WARN_ON_ONCE(cpu_has_apic && !disable_apic); 119 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
120} 120}
121 121
122#ifdef CONFIG_X86_32
123static int noop_x86_32_numa_cpu_node(int cpu)
124{
125 /* we're always on node 0 */
126 return 0;
127}
128#endif
129
130struct apic apic_noop = { 122struct apic apic_noop = {
131 .name = "noop", 123 .name = "noop",
132 .probe = noop_probe, 124 .probe = noop_probe,
@@ -195,6 +187,5 @@ struct apic apic_noop = {
195 187
196#ifdef CONFIG_X86_32 188#ifdef CONFIG_X86_32
197 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, 189 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
198 .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
199#endif 190#endif
200}; 191};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 541a2e431659..d84ac5a584b5 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -253,5 +253,4 @@ struct apic apic_bigsmp = {
253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254 254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, 255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
256 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
257}; 256};
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 3e9de4854c5b..70533de5bd29 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void)
510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
511} 511}
512 512
513static int es7000_numa_cpu_node(int cpu)
514{
515 return 0;
516}
517
518static int es7000_cpu_present_to_apicid(int mps_cpu) 513static int es7000_cpu_present_to_apicid(int mps_cpu)
519{ 514{
520 if (!mps_cpu) 515 if (!mps_cpu)
@@ -688,7 +683,6 @@ struct apic __refdata apic_es7000_cluster = {
688 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 683 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
689 684
690 .x86_32_early_logical_apicid = es7000_early_logical_apicid, 685 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
691 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
692}; 686};
693 687
694struct apic __refdata apic_es7000 = { 688struct apic __refdata apic_es7000 = {
@@ -752,5 +746,4 @@ struct apic __refdata apic_es7000 = {
752 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 746 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
753 747
754 .x86_32_early_logical_apicid = es7000_early_logical_apicid, 748 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
755 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
756}; 749};
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134b..30f13319e24b 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -48,8 +48,6 @@
48#include <asm/e820.h> 48#include <asm/e820.h>
49#include <asm/ipi.h> 49#include <asm/ipi.h>
50 50
51#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
52
53int found_numaq; 51int found_numaq;
54 52
55/* 53/*
@@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
79static inline void numaq_register_node(int node, struct sys_cfg_data *scd) 77static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
80{ 78{
81 struct eachquadmem *eq = scd->eq + node; 79 struct eachquadmem *eq = scd->eq + node;
80 u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
81 u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
82 int ret;
82 83
83 node_set_online(node); 84 node_set(node, numa_nodes_parsed);
84 85 ret = numa_add_memblk(node, start, end);
85 /* Convert to pages */ 86 BUG_ON(ret < 0);
86 node_start_pfn[node] =
87 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
88
89 node_end_pfn[node] =
90 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
91
92 memblock_x86_register_active_regions(node, node_start_pfn[node],
93 node_end_pfn[node]);
94
95 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
96
97 node_remap_size[node] = node_memmap_size_bytes(node,
98 node_start_pfn[node],
99 node_end_pfn[node]);
100} 87}
101 88
102/* 89/*
103 * Function: smp_dump_qct() 90 * Function: smp_dump_qct()
104 * 91 *
105 * Description: gets memory layout from the quad config table. This 92 * Description: gets memory layout from the quad config table. This
106 * function also updates node_online_map with the nodes (quads) present. 93 * function also updates numa_nodes_parsed with the nodes (quads) present.
107 */ 94 */
108static void __init smp_dump_qct(void) 95static void __init smp_dump_qct(void)
109{ 96{
@@ -112,7 +99,6 @@ static void __init smp_dump_qct(void)
112 99
113 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); 100 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
114 101
115 nodes_clear(node_online_map);
116 for_each_node(node) { 102 for_each_node(node) {
117 if (scd->quads_present31_0 & (1 << node)) 103 if (scd->quads_present31_0 & (1 << node))
118 numaq_register_node(node, scd); 104 numaq_register_node(node, scd);
@@ -282,14 +268,14 @@ static __init void early_check_numaq(void)
282 } 268 }
283} 269}
284 270
285int __init get_memcfg_numaq(void) 271int __init numaq_numa_init(void)
286{ 272{
287 early_check_numaq(); 273 early_check_numaq();
288 if (!found_numaq) 274 if (!found_numaq)
289 return 0; 275 return -ENOENT;
290 smp_dump_qct(); 276 smp_dump_qct();
291 277
292 return 1; 278 return 0;
293} 279}
294 280
295#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 281#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fc84c7b61108..6541e471fd91 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -172,7 +172,6 @@ struct apic apic_default = {
172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
173 173
174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, 174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
175 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
176}; 175};
177 176
178extern struct apic apic_numaq; 177extern struct apic apic_numaq;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index e4b8059b414a..35bcd7d995a1 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -551,5 +551,4 @@ struct apic apic_summit = {
551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552 552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid, 553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
554 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
555}; 554};
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index f5208ff28b5c..27c625178bf1 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -353,7 +353,6 @@ static void notify_thresholds(__u64 msr_val)
353static void intel_thermal_interrupt(void) 353static void intel_thermal_interrupt(void)
354{ 354{
355 __u64 msr_val; 355 __u64 msr_val;
356 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
357 356
358 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 357 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
359 358
@@ -365,19 +364,19 @@ static void intel_thermal_interrupt(void)
365 CORE_LEVEL) != 0) 364 CORE_LEVEL) != 0)
366 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 365 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
367 366
368 if (cpu_has(c, X86_FEATURE_PLN)) 367 if (this_cpu_has(X86_FEATURE_PLN))
369 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 368 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
370 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
371 CORE_LEVEL) != 0) 370 CORE_LEVEL) != 0)
372 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); 371 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
373 372
374 if (cpu_has(c, X86_FEATURE_PTS)) { 373 if (this_cpu_has(X86_FEATURE_PTS)) {
375 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 374 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
376 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 375 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
377 THERMAL_THROTTLING_EVENT, 376 THERMAL_THROTTLING_EVENT,
378 PACKAGE_LEVEL) != 0) 377 PACKAGE_LEVEL) != 0)
379 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); 378 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
380 if (cpu_has(c, X86_FEATURE_PLN)) 379 if (this_cpu_has(X86_FEATURE_PLN))
381 if (therm_throt_process(msr_val & 380 if (therm_throt_process(msr_val &
382 PACKAGE_THERM_STATUS_POWER_LIMIT, 381 PACKAGE_THERM_STATUS_POWER_LIMIT,
383 POWER_LIMIT_EVENT, 382 POWER_LIMIT_EVENT,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ef59817357fc..6f9bfffb2720 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -715,7 +715,7 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
715 } 715 }
716} 716}
717 717
718static int 718static int __init
719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
720{ 720{
721 if (!mpc_new_phys || count <= mpc_new_length) { 721 if (!mpc_new_phys || count <= mpc_new_length) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe46b7ab..88a90a977f8e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
449void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 449void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
450{ 450{
451 if (!need_resched()) { 451 if (!need_resched()) {
452 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 452 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
453 clflush((void *)&current_thread_info()->flags); 453 clflush((void *)&current_thread_info()->flags);
454 454
455 __monitor((void *)&current_thread_info()->flags, 0, 0); 455 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +465,7 @@ static void mwait_idle(void)
465 if (!need_resched()) { 465 if (!need_resched()) {
466 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 466 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
467 trace_cpu_idle(1, smp_processor_id()); 467 trace_cpu_idle(1, smp_processor_id());
468 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 468 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
469 clflush((void *)&current_thread_info()->flags); 469 clflush((void *)&current_thread_info()->flags);
470 470
471 __monitor((void *)&current_thread_info()->flags, 0, 0); 471 __monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2871d3c71b6..a3c430bdfb60 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void)
1332 void *mwait_ptr; 1332 void *mwait_ptr;
1333 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); 1333 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1334 1334
1335 if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) 1335 if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))
1336 return; 1336 return;
1337 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) 1337 if (!this_cpu_has(X86_FEATURE_CLFLSH))
1338 return; 1338 return;
1339 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) 1339 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1340 return; 1340 return;
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3e608edf9958..3d11327c9ab4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
24 24
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat.o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
29 29
30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology.c
index 0919c26820d4..5247d01329ca 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/memblock.h> 14#include <linux/memblock.h>
15#include <linux/bootmem.h>
15 16
16#include <asm/io.h> 17#include <asm/io.h>
17#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
@@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void)
69 70
70int __init amd_numa_init(void) 71int __init amd_numa_init(void)
71{ 72{
72 unsigned long start = PFN_PHYS(0); 73 u64 start = PFN_PHYS(0);
73 unsigned long end = PFN_PHYS(max_pfn); 74 u64 end = PFN_PHYS(max_pfn);
74 unsigned numnodes; 75 unsigned numnodes;
75 unsigned long prevbase; 76 u64 prevbase;
76 int i, j, nb; 77 int i, j, nb;
77 u32 nodeid, reg; 78 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base; 79 unsigned int bits, cores, apicid_base;
@@ -95,7 +96,7 @@ int __init amd_numa_init(void)
95 96
96 prevbase = 0; 97 prevbase = 0;
97 for (i = 0; i < 8; i++) { 98 for (i = 0; i < 8; i++) {
98 unsigned long base, limit; 99 u64 base, limit;
99 100
100 base = read_pci_config(0, nb, 1, 0x40 + i*8); 101 base = read_pci_config(0, nb, 1, 0x40 + i*8);
101 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 102 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -107,18 +108,18 @@ int __init amd_numa_init(void)
107 continue; 108 continue;
108 } 109 }
109 if (nodeid >= numnodes) { 110 if (nodeid >= numnodes) {
110 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, 111 pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
111 base, limit); 112 base, limit);
112 continue; 113 continue;
113 } 114 }
114 115
115 if (!limit) { 116 if (!limit) {
116 pr_info("Skipping node entry %d (base %lx)\n", 117 pr_info("Skipping node entry %d (base %Lx)\n",
117 i, base); 118 i, base);
118 continue; 119 continue;
119 } 120 }
120 if ((base >> 8) & 3 || (limit >> 8) & 3) { 121 if ((base >> 8) & 3 || (limit >> 8) & 3) {
121 pr_err("Node %d using interleaving mode %lx/%lx\n", 122 pr_err("Node %d using interleaving mode %Lx/%Lx\n",
122 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 123 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
123 return -EINVAL; 124 return -EINVAL;
124 } 125 }
@@ -150,19 +151,19 @@ int __init amd_numa_init(void)
150 continue; 151 continue;
151 } 152 }
152 if (limit < base) { 153 if (limit < base) {
153 pr_err("Node %d bogus settings %lx-%lx.\n", 154 pr_err("Node %d bogus settings %Lx-%Lx.\n",
154 nodeid, base, limit); 155 nodeid, base, limit);
155 continue; 156 continue;
156 } 157 }
157 158
158 /* Could sort here, but pun for now. Should not happen anyroads. */ 159 /* Could sort here, but pun for now. Should not happen anyroads. */
159 if (prevbase > base) { 160 if (prevbase > base) {
160 pr_err("Node map not sorted %lx,%lx\n", 161 pr_err("Node map not sorted %Lx,%Lx\n",
161 prevbase, base); 162 prevbase, base);
162 return -EINVAL; 163 return -EINVAL;
163 } 164 }
164 165
165 pr_info("Node %d MemBase %016lx Limit %016lx\n", 166 pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
166 nodeid, base, limit); 167 nodeid, base, limit);
167 168
168 prevbase = base; 169 prevbase = base;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 80088f994193..29f7c6d98179 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -678,8 +678,10 @@ static void __init zone_sizes_init(void)
678{ 678{
679 unsigned long max_zone_pfns[MAX_NR_ZONES]; 679 unsigned long max_zone_pfns[MAX_NR_ZONES];
680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
681#ifdef CONFIG_ZONE_DMA
681 max_zone_pfns[ZONE_DMA] = 682 max_zone_pfns[ZONE_DMA] =
682 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 683 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
684#endif
683 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 685 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
684#ifdef CONFIG_HIGHMEM 686#ifdef CONFIG_HIGHMEM
685 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 687 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -716,6 +718,7 @@ void __init paging_init(void)
716 * NOTE: at this point the bootmem allocator is fully available. 718 * NOTE: at this point the bootmem allocator is fully available.
717 */ 719 */
718 olpc_dt_build_devicetree(); 720 olpc_dt_build_devicetree();
721 sparse_memory_present_with_active_regions(MAX_NUMNODES);
719 sparse_init(); 722 sparse_init();
720 zone_sizes_init(); 723 zone_sizes_init();
721} 724}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 794233587287..d865c4aeec55 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -616,7 +616,9 @@ void __init paging_init(void)
616 unsigned long max_zone_pfns[MAX_NR_ZONES]; 616 unsigned long max_zone_pfns[MAX_NR_ZONES];
617 617
618 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 618 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
619#ifdef CONFIG_ZONE_DMA
619 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 620 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
621#endif
620 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 622 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
621 max_zone_pfns[ZONE_NORMAL] = max_pfn; 623 max_zone_pfns[ZONE_NORMAL] = max_pfn;
622 624
@@ -679,14 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
679} 681}
680EXPORT_SYMBOL_GPL(arch_add_memory); 682EXPORT_SYMBOL_GPL(arch_add_memory);
681 683
682#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
683int memory_add_physaddr_to_nid(u64 start)
684{
685 return 0;
686}
687EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
688#endif
689
690#endif /* CONFIG_MEMORY_HOTPLUG */ 684#endif /* CONFIG_MEMORY_HOTPLUG */
691 685
692static struct kcore_list kcore_vsyscall; 686static struct kcore_list kcore_vsyscall;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0369843511dc..be1ef574ce9a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
91 return (__force void __iomem *)phys_to_virt(phys_addr); 91 return (__force void __iomem *)phys_to_virt(phys_addr);
92 92
93 /* 93 /*
94 * Check if the request spans more than any BAR in the iomem resource
95 * tree.
96 */
97 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
98 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
99
100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 94 * Don't allow anybody to remap normal RAM that we're using..
102 */ 95 */
103 last_pfn = last_addr >> PAGE_SHIFT; 96 last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 ret_addr = (void __iomem *) (vaddr + offset); 163 ret_addr = (void __iomem *) (vaddr + offset);
171 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 164 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
172 165
166 /*
167 * Check if the request spans more than any BAR in the iomem resource
168 * tree.
169 */
170 WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
171 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
172
173 return ret_addr; 173 return ret_addr;
174err_free_area: 174err_free_area:
175 free_vm_area(area); 175 free_vm_area(area);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 745258dfc4dc..f5510d889a22 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,11 +1,39 @@
1/* Common code for 32 and 64-bit NUMA */ 1/* Common code for 32 and 64-bit NUMA */
2#include <linux/topology.h> 2#include <linux/kernel.h>
3#include <linux/module.h> 3#include <linux/mm.h>
4#include <linux/string.h>
5#include <linux/init.h>
4#include <linux/bootmem.h> 6#include <linux/bootmem.h>
5#include <asm/numa.h> 7#include <linux/memblock.h>
8#include <linux/mmzone.h>
9#include <linux/ctype.h>
10#include <linux/module.h>
11#include <linux/nodemask.h>
12#include <linux/sched.h>
13#include <linux/topology.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
6#include <asm/acpi.h> 18#include <asm/acpi.h>
19#include <asm/amd_nb.h>
20
21#include "numa_internal.h"
7 22
8int __initdata numa_off; 23int __initdata numa_off;
24nodemask_t numa_nodes_parsed __initdata;
25
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29static struct numa_meminfo numa_meminfo
30#ifndef CONFIG_MEMORY_HOTPLUG
31__initdata
32#endif
33;
34
35static int numa_distance_cnt;
36static u8 *numa_distance;
9 37
10static __init int numa_setup(char *opt) 38static __init int numa_setup(char *opt)
11{ 39{
@@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33}; 61};
34 62
63int __cpuinit numa_cpu_node(int cpu)
64{
65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
66
67 if (apicid != BAD_APICID)
68 return __apicid_to_node[apicid];
69 return NUMA_NO_NODE;
70}
71
35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
36EXPORT_SYMBOL(node_to_cpumask_map); 73EXPORT_SYMBOL(node_to_cpumask_map);
37 74
@@ -95,6 +132,407 @@ void __init setup_node_to_cpumask_map(void)
95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
96} 133}
97 134
135static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
136 struct numa_meminfo *mi)
137{
138 /* ignore zero length blks */
139 if (start == end)
140 return 0;
141
142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
145 nid, start, end);
146 return 0;
147 }
148
149 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
150 pr_err("NUMA: too many memblk ranges\n");
151 return -EINVAL;
152 }
153
154 mi->blk[mi->nr_blks].start = start;
155 mi->blk[mi->nr_blks].end = end;
156 mi->blk[mi->nr_blks].nid = nid;
157 mi->nr_blks++;
158 return 0;
159}
160
161/**
162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
163 * @idx: Index of memblk to remove
164 * @mi: numa_meminfo to remove memblk from
165 *
166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
167 * decrementing @mi->nr_blks.
168 */
169void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
170{
171 mi->nr_blks--;
172 memmove(&mi->blk[idx], &mi->blk[idx + 1],
173 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
174}
175
176/**
177 * numa_add_memblk - Add one numa_memblk to numa_meminfo
178 * @nid: NUMA node ID of the new memblk
179 * @start: Start address of the new memblk
180 * @end: End address of the new memblk
181 *
182 * Add a new memblk to the default numa_meminfo.
183 *
184 * RETURNS:
185 * 0 on success, -errno on failure.
186 */
187int __init numa_add_memblk(int nid, u64 start, u64 end)
188{
189 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
190}
191
192/* Initialize NODE_DATA for a node on the local memory */
193static void __init setup_node_data(int nid, u64 start, u64 end)
194{
195 const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
196 const u64 nd_high = PFN_PHYS(max_pfn_mapped);
197 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
198 bool remapped = false;
199 u64 nd_pa;
200 void *nd;
201 int tnid;
202
203 /*
204 * Don't confuse VM with a node that doesn't have the
205 * minimum amount of memory:
206 */
207 if (end && (end - start) < NODE_MIN_SIZE)
208 return;
209
210 /* initialize remap allocator before aligning to ZONE_ALIGN */
211 init_alloc_remap(nid, start, end);
212
213 start = roundup(start, ZONE_ALIGN);
214
215 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
216 nid, start, end);
217
218 /*
219 * Allocate node data. Try remap allocator first, node-local
220 * memory and then any node. Never allocate in DMA zone.
221 */
222 nd = alloc_remap(nid, nd_size);
223 if (nd) {
224 nd_pa = __pa(nd);
225 remapped = true;
226 } else {
227 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
228 nd_size, SMP_CACHE_BYTES);
229 if (nd_pa == MEMBLOCK_ERROR)
230 nd_pa = memblock_find_in_range(nd_low, nd_high,
231 nd_size, SMP_CACHE_BYTES);
232 if (nd_pa == MEMBLOCK_ERROR) {
233 pr_err("Cannot find %zu bytes in node %d\n",
234 nd_size, nid);
235 return;
236 }
237 memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
238 nd = __va(nd_pa);
239 }
240
241 /* report and initialize */
242 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
243 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
244 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
245 if (!remapped && tnid != nid)
246 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
247
248 node_data[nid] = nd;
249 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
250 NODE_DATA(nid)->node_id = nid;
251 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
252 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
253
254 node_set_online(nid);
255}
256
257/**
258 * numa_cleanup_meminfo - Cleanup a numa_meminfo
259 * @mi: numa_meminfo to clean up
260 *
 261 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
262 * conflicts and clear unused memblks.
263 *
264 * RETURNS:
265 * 0 on success, -errno on failure.
266 */
267int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
268{
269 const u64 low = 0;
270 const u64 high = PFN_PHYS(max_pfn);
271 int i, j, k;
272
273 /* first, trim all entries */
274 for (i = 0; i < mi->nr_blks; i++) {
275 struct numa_memblk *bi = &mi->blk[i];
276
277 /* make sure all blocks are inside the limits */
278 bi->start = max(bi->start, low);
279 bi->end = min(bi->end, high);
280
281 /* and there's no empty block */
282 if (bi->start >= bi->end)
283 numa_remove_memblk_from(i--, mi);
284 }
285
286 /* merge neighboring / overlapping entries */
287 for (i = 0; i < mi->nr_blks; i++) {
288 struct numa_memblk *bi = &mi->blk[i];
289
290 for (j = i + 1; j < mi->nr_blks; j++) {
291 struct numa_memblk *bj = &mi->blk[j];
292 u64 start, end;
293
294 /*
295 * See whether there are overlapping blocks. Whine
296 * about but allow overlaps of the same nid. They
297 * will be merged below.
298 */
299 if (bi->end > bj->start && bi->start < bj->end) {
300 if (bi->nid != bj->nid) {
301 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
302 bi->nid, bi->start, bi->end,
303 bj->nid, bj->start, bj->end);
304 return -EINVAL;
305 }
306 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
307 bi->nid, bi->start, bi->end,
308 bj->start, bj->end);
309 }
310
311 /*
312 * Join together blocks on the same node, holes
313 * between which don't overlap with memory on other
314 * nodes.
315 */
316 if (bi->nid != bj->nid)
317 continue;
318 start = min(bi->start, bj->start);
319 end = max(bi->end, bj->end);
320 for (k = 0; k < mi->nr_blks; k++) {
321 struct numa_memblk *bk = &mi->blk[k];
322
323 if (bi->nid == bk->nid)
324 continue;
325 if (start < bk->end && end > bk->start)
326 break;
327 }
328 if (k < mi->nr_blks)
329 continue;
330 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
331 bi->nid, bi->start, bi->end, bj->start, bj->end,
332 start, end);
333 bi->start = start;
334 bi->end = end;
335 numa_remove_memblk_from(j--, mi);
336 }
337 }
338
339 /* clear unused ones */
340 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
341 mi->blk[i].start = mi->blk[i].end = 0;
342 mi->blk[i].nid = NUMA_NO_NODE;
343 }
344
345 return 0;
346}
347
348/*
349 * Set nodes, which have memory in @mi, in *@nodemask.
350 */
351static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
352 const struct numa_meminfo *mi)
353{
354 int i;
355
356 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
357 if (mi->blk[i].start != mi->blk[i].end &&
358 mi->blk[i].nid != NUMA_NO_NODE)
359 node_set(mi->blk[i].nid, *nodemask);
360}
361
362/**
363 * numa_reset_distance - Reset NUMA distance table
364 *
365 * The current table is freed. The next numa_set_distance() call will
366 * create a new one.
367 */
368void __init numa_reset_distance(void)
369{
370 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
371
372 /* numa_distance could be 1LU marking allocation failure, test cnt */
373 if (numa_distance_cnt)
374 memblock_x86_free_range(__pa(numa_distance),
375 __pa(numa_distance) + size);
376 numa_distance_cnt = 0;
377 numa_distance = NULL; /* enable table creation */
378}
379
380static int __init numa_alloc_distance(void)
381{
382 nodemask_t nodes_parsed;
383 size_t size;
384 int i, j, cnt = 0;
385 u64 phys;
386
387 /* size the new table and allocate it */
388 nodes_parsed = numa_nodes_parsed;
389 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
390
391 for_each_node_mask(i, nodes_parsed)
392 cnt = i;
393 cnt++;
394 size = cnt * cnt * sizeof(numa_distance[0]);
395
396 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
397 size, PAGE_SIZE);
398 if (phys == MEMBLOCK_ERROR) {
399 pr_warning("NUMA: Warning: can't allocate distance table!\n");
400 /* don't retry until explicitly reset */
401 numa_distance = (void *)1LU;
402 return -ENOMEM;
403 }
404 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
405
406 numa_distance = __va(phys);
407 numa_distance_cnt = cnt;
408
409 /* fill with the default distances */
410 for (i = 0; i < cnt; i++)
411 for (j = 0; j < cnt; j++)
412 numa_distance[i * cnt + j] = i == j ?
413 LOCAL_DISTANCE : REMOTE_DISTANCE;
414 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
415
416 return 0;
417}
418
419/**
420 * numa_set_distance - Set NUMA distance from one NUMA to another
421 * @from: the 'from' node to set distance
422 * @to: the 'to' node to set distance
423 * @distance: NUMA distance
424 *
425 * Set the distance from node @from to @to to @distance. If distance table
426 * doesn't exist, one which is large enough to accommodate all the currently
427 * known nodes will be created.
428 *
429 * If such table cannot be allocated, a warning is printed and further
430 * calls are ignored until the distance table is reset with
431 * numa_reset_distance().
432 *
433 * If @from or @to is higher than the highest known node at the time of
434 * table creation or @distance doesn't make sense, the call is ignored.
435 * This is to allow simplification of specific NUMA config implementations.
436 */
437void __init numa_set_distance(int from, int to, int distance)
438{
439 if (!numa_distance && numa_alloc_distance() < 0)
440 return;
441
442 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
443 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
444 from, to, distance);
445 return;
446 }
447
448 if ((u8)distance != distance ||
449 (from == to && distance != LOCAL_DISTANCE)) {
450 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
451 from, to, distance);
452 return;
453 }
454
455 numa_distance[from * numa_distance_cnt + to] = distance;
456}
457
458int __node_distance(int from, int to)
459{
460 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
461 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
462 return numa_distance[from * numa_distance_cnt + to];
463}
464EXPORT_SYMBOL(__node_distance);
465
466/*
467 * Sanity check to catch more bad NUMA configurations (they are amazingly
468 * common). Make sure the nodes cover all memory.
469 */
470static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
471{
472 u64 numaram, e820ram;
473 int i;
474
475 numaram = 0;
476 for (i = 0; i < mi->nr_blks; i++) {
477 u64 s = mi->blk[i].start >> PAGE_SHIFT;
478 u64 e = mi->blk[i].end >> PAGE_SHIFT;
479 numaram += e - s;
480 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
481 if ((s64)numaram < 0)
482 numaram = 0;
483 }
484
485 e820ram = max_pfn - (memblock_x86_hole_size(0,
486 PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
487 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
488 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
489 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
490 (numaram << PAGE_SHIFT) >> 20,
491 (e820ram << PAGE_SHIFT) >> 20);
492 return false;
493 }
494 return true;
495}
496
497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{
499 int i, nid;
500
501 /* Account for nodes with cpus and no memory */
502 node_possible_map = numa_nodes_parsed;
503 numa_nodemask_from_meminfo(&node_possible_map, mi);
504 if (WARN_ON(nodes_empty(node_possible_map)))
505 return -EINVAL;
506
507 for (i = 0; i < mi->nr_blks; i++)
508 memblock_x86_register_active_regions(mi->blk[i].nid,
509 mi->blk[i].start >> PAGE_SHIFT,
510 mi->blk[i].end >> PAGE_SHIFT);
511
512 /* for out of order entries */
513 sort_node_map();
514 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL;
516
517 /* Finally register nodes. */
518 for_each_node_mask(nid, node_possible_map) {
519 u64 start = PFN_PHYS(max_pfn);
520 u64 end = 0;
521
522 for (i = 0; i < mi->nr_blks; i++) {
523 if (nid != mi->blk[i].nid)
524 continue;
525 start = min(mi->blk[i].start, start);
526 end = max(mi->blk[i].end, end);
527 }
528
529 if (start < end)
530 setup_node_data(nid, start, end);
531 }
532
533 return 0;
534}
535
98/* 536/*
99 * There are unfortunately some poorly designed mainboards around that 537 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 538 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
@@ -102,7 +540,7 @@ void __init setup_node_to_cpumask_map(void)
102 * as the number of CPUs is not known yet. We round robin the existing 540 * as the number of CPUs is not known yet. We round robin the existing
103 * nodes. 541 * nodes.
104 */ 542 */
105void __init numa_init_array(void) 543static void __init numa_init_array(void)
106{ 544{
107 int rr, i; 545 int rr, i;
108 546
@@ -117,6 +555,95 @@ void __init numa_init_array(void)
117 } 555 }
118} 556}
119 557
558static int __init numa_init(int (*init_func)(void))
559{
560 int i;
561 int ret;
562
563 for (i = 0; i < MAX_LOCAL_APIC; i++)
564 set_apicid_to_node(i, NUMA_NO_NODE);
565
566 nodes_clear(numa_nodes_parsed);
567 nodes_clear(node_possible_map);
568 nodes_clear(node_online_map);
569 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
570 remove_all_active_ranges();
571 numa_reset_distance();
572
573 ret = init_func();
574 if (ret < 0)
575 return ret;
576 ret = numa_cleanup_meminfo(&numa_meminfo);
577 if (ret < 0)
578 return ret;
579
580 numa_emulation(&numa_meminfo, numa_distance_cnt);
581
582 ret = numa_register_memblks(&numa_meminfo);
583 if (ret < 0)
584 return ret;
585
586 for (i = 0; i < nr_cpu_ids; i++) {
587 int nid = early_cpu_to_node(i);
588
589 if (nid == NUMA_NO_NODE)
590 continue;
591 if (!node_online(nid))
592 numa_clear_node(i);
593 }
594 numa_init_array();
595 return 0;
596}
597
598/**
599 * dummy_numa_init - Fallback dummy NUMA init
600 *
601 * Used if there's no underlying NUMA architecture, NUMA initialization
602 * fails, or NUMA is disabled on the command line.
603 *
604 * Must online at least one node and add memory blocks that cover all
605 * allowed memory. This function must not fail.
606 */
607static int __init dummy_numa_init(void)
608{
609 printk(KERN_INFO "%s\n",
610 numa_off ? "NUMA turned off" : "No NUMA configuration found");
611 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
612 0LLU, PFN_PHYS(max_pfn));
613
614 node_set(0, numa_nodes_parsed);
615 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
616
617 return 0;
618}
619
620/**
621 * x86_numa_init - Initialize NUMA
622 *
623 * Try each configured NUMA initialization method until one succeeds. The
624 * last fallback is dummy single node config encomapssing whole memory and
625 * never fails.
626 */
627void __init x86_numa_init(void)
628{
629 if (!numa_off) {
630#ifdef CONFIG_X86_NUMAQ
631 if (!numa_init(numaq_numa_init))
632 return;
633#endif
634#ifdef CONFIG_ACPI_NUMA
635 if (!numa_init(x86_acpi_numa_init))
636 return;
637#endif
638#ifdef CONFIG_AMD_NUMA
639 if (!numa_init(amd_numa_init))
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645}
646
120static __init int find_near_online_node(int node) 647static __init int find_near_online_node(int node)
121{ 648{
122 int n, val; 649 int n, val;
@@ -282,3 +809,18 @@ const struct cpumask *cpumask_of_node(int node)
282EXPORT_SYMBOL(cpumask_of_node); 809EXPORT_SYMBOL(cpumask_of_node);
283 810
284#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 811#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
812
813#ifdef CONFIG_MEMORY_HOTPLUG
814int memory_add_physaddr_to_nid(u64 start)
815{
816 struct numa_meminfo *mi = &numa_meminfo;
817 int nid = mi->blk[0].nid;
818 int i;
819
820 for (i = 0; i < mi->nr_blks; i++)
821 if (mi->blk[i].start <= start && mi->blk[i].end > start)
822 nid = mi->blk[i].nid;
823 return nid;
824}
825EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
826#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420df..849a975d3fa0 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,39 +22,11 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/mm.h>
26#include <linux/bootmem.h> 25#include <linux/bootmem.h>
27#include <linux/memblock.h> 26#include <linux/memblock.h>
28#include <linux/mmzone.h>
29#include <linux/highmem.h>
30#include <linux/initrd.h>
31#include <linux/nodemask.h>
32#include <linux/module.h> 27#include <linux/module.h>
33#include <linux/kexec.h>
34#include <linux/pfn.h>
35#include <linux/swap.h>
36#include <linux/acpi.h>
37
38#include <asm/e820.h>
39#include <asm/setup.h>
40#include <asm/mmzone.h>
41#include <asm/bios_ebda.h>
42#include <asm/proto.h>
43
44struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
45EXPORT_SYMBOL(node_data);
46
47/*
48 * numa interface - we expect the numa architecture specific code to have
49 * populated the following initialisation.
50 *
51 * 1) node_online_map - the map of all nodes configured (online) in the system
52 * 2) node_start_pfn - the starting page frame number for a node
53 * 3) node_end_pfn - the ending page fram number for a node
54 */
55unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
56unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
57 28
29#include "numa_internal.h"
58 30
59#ifdef CONFIG_DISCONTIGMEM 31#ifdef CONFIG_DISCONTIGMEM
60/* 32/*
@@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99} 71}
100#endif 72#endif
101 73
102extern unsigned long find_max_low_pfn(void);
103extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
104 75
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
106 77
107unsigned long node_remap_size[MAX_NUMNODES];
108static void *node_remap_start_vaddr[MAX_NUMNODES]; 78static void *node_remap_start_vaddr[MAX_NUMNODES];
109void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 80
111static unsigned long kva_start_pfn;
112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
119/*
120 * FLAT - support for basic PC memory model with discontig enabled, essentially
121 * a single node with all available processors in it with a flat
122 * memory map.
123 */
124int __init get_memcfg_numa_flat(void)
125{
126 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
127
128 node_start_pfn[0] = 0;
129 node_end_pfn[0] = max_pfn;
130 memblock_x86_register_active_regions(0, 0, max_pfn);
131 memory_present(0, 0, max_pfn);
132 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
133
134 /* Indicate there is one node available. */
135 nodes_clear(node_online_map);
136 node_set_online(0);
137 return 1;
138}
139
140/*
141 * Find the highest page frame number we have available for the node
142 */
143static void __init propagate_e820_map_node(int nid)
144{
145 if (node_end_pfn[nid] > max_pfn)
146 node_end_pfn[nid] = max_pfn;
147 /*
148 * if a user has given mem=XXXX, then we need to make sure
149 * that the node _starts_ before that, too, not just ends
150 */
151 if (node_start_pfn[nid] > max_pfn)
152 node_start_pfn[nid] = max_pfn;
153 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
154}
155
156/*
157 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
158 * method. For node zero take this from the bottom of memory, for
159 * subsequent nodes place them at node_remap_start_vaddr which contains
160 * node local data in physically node local memory. See setup_memory()
161 * for details.
162 */
163static void __init allocate_pgdat(int nid)
164{
165 char buf[16];
166
167 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
168 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
169 else {
170 unsigned long pgdat_phys;
171 pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
172 max_pfn_mapped<<PAGE_SHIFT,
173 sizeof(pg_data_t),
174 PAGE_SIZE);
175 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
176 memset(buf, 0, sizeof(buf));
177 sprintf(buf, "NODE_DATA %d", nid);
178 memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
179 }
180 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
181 nid, (unsigned long)NODE_DATA(nid));
182}
183
184/* 81/*
185 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel 82 * Remap memory allocator
186 * virtual address space (KVA) is reserved and portions of nodes are mapped
187 * using it. This is to allow node-local memory to be allocated for
188 * structures that would normally require ZONE_NORMAL. The memory is
189 * allocated with alloc_remap() and callers should be prepared to allocate
190 * from the bootmem allocator instead.
191 */ 83 */
192static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
193static void *node_remap_end_vaddr[MAX_NUMNODES]; 85static void *node_remap_end_vaddr[MAX_NUMNODES];
194static void *node_remap_alloc_vaddr[MAX_NUMNODES]; 86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
195static unsigned long node_remap_offset[MAX_NUMNODES];
196 87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
95 * only the callers considered there should call this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
197void *alloc_remap(int nid, unsigned long size) 107void *alloc_remap(int nid, unsigned long size)
198{ 108{
199 void *allocation = node_remap_alloc_vaddr[nid]; 109 void *allocation = node_remap_alloc_vaddr[nid];
200 110
201 size = ALIGN(size, L1_CACHE_BYTES); 111 size = ALIGN(size, L1_CACHE_BYTES);
202 112
203 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
204 return NULL; 114 return NULL;
205 115
206 node_remap_alloc_vaddr[nid] += size; 116 node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
209 return allocation; 119 return allocation;
210} 120}
211 121
212static void __init remap_numa_kva(void)
213{
214 void *vaddr;
215 unsigned long pfn;
216 int node;
217
218 for_each_online_node(node) {
219 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
220 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
221 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
222 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
223 (unsigned long)vaddr,
224 node_remap_start_pfn[node] + pfn);
225 set_pmd_pfn((ulong) vaddr,
226 node_remap_start_pfn[node] + pfn,
227 PAGE_KERNEL_LARGE);
228 }
229 }
230}
231
232#ifdef CONFIG_HIBERNATION 122#ifdef CONFIG_HIBERNATION
233/** 123/**
234 * resume_map_numa_kva - add KVA mapping to the temporary page tables created 124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
240 int node; 130 int node;
241 131
242 for_each_online_node(node) { 132 for_each_online_node(node) {
243 unsigned long start_va, start_pfn, size, pfn; 133 unsigned long start_va, start_pfn, nr_pages, pfn;
244 134
245 start_va = (unsigned long)node_remap_start_vaddr[node]; 135 start_va = (unsigned long)node_remap_start_vaddr[node];
246 start_pfn = node_remap_start_pfn[node]; 136 start_pfn = node_remap_start_pfn[node];
247 size = node_remap_size[node]; 137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
248 139
249 printk(KERN_DEBUG "%s: node %d\n", __func__, node); 140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
250 141
251 for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { 142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
252 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); 143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
253 pgd_t *pgd = pgd_base + pgd_index(vaddr); 144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
254 pud_t *pud = pud_offset(pgd, vaddr); 145 pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
264} 155}
265#endif 156#endif
266 157
267static __init unsigned long calculate_numa_remap_pages(void) 158/**
159 * init_alloc_remap - Initialize remap allocator for a NUMA node
160 * @nid: NUMA node to initizlie remap allocator for
161 *
162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
163 * memmap on a different node with lowmem is inefficient, a special
164 * remap allocator is implemented which can be used by alloc_remap().
165 *
166 * For each node, the amount of memory which will be necessary for
167 * pgdat and memmap is calculated and two memory areas of the size are
168 * allocated - one in the node and the other in lowmem; then, the area
169 * in the node is remapped to the lowmem area.
170 *
171 * As pgdat and memmap must be allocated in lowmem anyway, this
172 * doesn't waste lowmem address space; however, the actual lowmem
173 * which gets remapped over is wasted. The amount shouldn't be
174 * problematic on machines this feature will be used.
175 *
176 * Initialization failure isn't fatal. alloc_remap() is used
177 * opportunistically and the callers will fall back to other memory
178 * allocation mechanisms on failure.
179 */
180void __init init_alloc_remap(int nid, u64 start, u64 end)
268{ 181{
269 int nid; 182 unsigned long start_pfn = start >> PAGE_SHIFT;
270 unsigned long size, reserve_pages = 0; 183 unsigned long end_pfn = end >> PAGE_SHIFT;
271 184 unsigned long size, pfn;
272 for_each_online_node(nid) { 185 u64 node_pa, remap_pa;
273 u64 node_kva_target; 186 void *remap_va;
274 u64 node_kva_final;
275
276 /*
277 * The acpi/srat node info can show hot-add memroy zones
278 * where memory could be added but not currently present.
279 */
280 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
281 nid, node_start_pfn[nid], node_end_pfn[nid]);
282 if (node_start_pfn[nid] > max_pfn)
283 continue;
284 if (!node_end_pfn[nid])
285 continue;
286 if (node_end_pfn[nid] > max_pfn)
287 node_end_pfn[nid] = max_pfn;
288
289 /* ensure the remap includes space for the pgdat. */
290 size = node_remap_size[nid] + sizeof(pg_data_t);
291
292 /* convert size to large (pmd size) pages, rounding up */
293 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
294 /* now the roundup is correct, convert to PAGE_SIZE pages */
295 size = size * PTRS_PER_PTE;
296
297 node_kva_target = round_down(node_end_pfn[nid] - size,
298 PTRS_PER_PTE);
299 node_kva_target <<= PAGE_SHIFT;
300 do {
301 node_kva_final = memblock_find_in_range(node_kva_target,
302 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
303 ((u64)size)<<PAGE_SHIFT,
304 LARGE_PAGE_BYTES);
305 node_kva_target -= LARGE_PAGE_BYTES;
306 } while (node_kva_final == MEMBLOCK_ERROR &&
307 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
308
309 if (node_kva_final == MEMBLOCK_ERROR)
310 panic("Can not get kva ram\n");
311
312 node_remap_size[nid] = size;
313 node_remap_offset[nid] = reserve_pages;
314 reserve_pages += size;
315 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
316 " node %d at %llx\n",
317 size, nid, node_kva_final>>PAGE_SHIFT);
318
319 /*
320 * prevent kva address below max_low_pfn want it on system
321 * with less memory later.
322 * layout will be: KVA address , KVA RAM
323 *
324 * we are supposed to only record the one less then max_low_pfn
325 * but we could have some hole in high memory, and it will only
326 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
327 * to use it as free.
328 * So memblock_x86_reserve_range here, hope we don't run out of that array
329 */
330 memblock_x86_reserve_range(node_kva_final,
331 node_kva_final+(((u64)size)<<PAGE_SHIFT),
332 "KVA RAM");
333
334 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
335 }
336 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
337 reserve_pages);
338 return reserve_pages;
339}
340 187
341static void init_remap_allocator(int nid) 188 /*
342{ 189 * The acpi/srat node info can show hot-add memroy zones where
343 node_remap_start_vaddr[nid] = pfn_to_kaddr( 190 * memory could be added but not currently present.
344 kva_start_pfn + node_remap_offset[nid]); 191 */
345 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + 192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
346 (node_remap_size[nid] * PAGE_SIZE); 193 nid, start_pfn, end_pfn);
347 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 194
348 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 195 /* calculate the necessary space aligned to large page size */
349 196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
350 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, 197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
351 (ulong) node_remap_start_vaddr[nid], 198 size = ALIGN(size, LARGE_PAGE_BYTES);
352 (ulong) node_remap_end_vaddr[nid]); 199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (node_pa == MEMBLOCK_ERROR) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (remap_pa == MEMBLOCK_ERROR) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_x86_free_range(node_pa, node_pa + size);
216 return;
217 }
218 memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
353} 235}
354 236
355void __init initmem_init(void) 237void __init initmem_init(void)
356{ 238{
357 int nid; 239 x86_numa_init();
358 long kva_target_pfn;
359
360 /*
361 * When mapping a NUMA machine we allocate the node_mem_map arrays
362 * from node local memory. They are then mapped directly into KVA
363 * between zone normal and vmalloc space. Calculate the size of
364 * this space and use it to adjust the boundary between ZONE_NORMAL
365 * and ZONE_HIGHMEM.
366 */
367
368 get_memcfg_numa();
369 numa_init_array();
370
371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
372 240
373 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
374 do {
375 kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
376 max_low_pfn<<PAGE_SHIFT,
377 kva_pages<<PAGE_SHIFT,
378 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
379 kva_target_pfn -= PTRS_PER_PTE;
380 } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
381
382 if (kva_start_pfn == MEMBLOCK_ERROR)
383 panic("Can not get kva space\n");
384
385 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
386 kva_start_pfn, max_low_pfn);
387 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
388
389 /* avoid clash with initrd */
390 memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
391 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
392 "KVA PG");
393#ifdef CONFIG_HIGHMEM 241#ifdef CONFIG_HIGHMEM
394 highstart_pfn = highend_pfn = max_pfn; 242 highstart_pfn = highend_pfn = max_pfn;
395 if (max_pfn > max_low_pfn) 243 if (max_pfn > max_low_pfn)
@@ -409,51 +257,9 @@ void __init initmem_init(void)
409 257
410 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", 258 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
411 (ulong) pfn_to_kaddr(max_low_pfn)); 259 (ulong) pfn_to_kaddr(max_low_pfn));
412 for_each_online_node(nid) {
413 init_remap_allocator(nid);
414
415 allocate_pgdat(nid);
416 }
417 remap_numa_kva();
418 260
419 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 261 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
420 (ulong) pfn_to_kaddr(highstart_pfn)); 262 (ulong) pfn_to_kaddr(highstart_pfn));
421 for_each_online_node(nid)
422 propagate_e820_map_node(nid);
423
424 for_each_online_node(nid) {
425 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
426 NODE_DATA(nid)->node_id = nid;
427 }
428 263
429 setup_bootmem_allocator(); 264 setup_bootmem_allocator();
430} 265}
431
432#ifdef CONFIG_MEMORY_HOTPLUG
433static int paddr_to_nid(u64 addr)
434{
435 int nid;
436 unsigned long pfn = PFN_DOWN(addr);
437
438 for_each_node(nid)
439 if (node_start_pfn[nid] <= pfn &&
440 pfn < node_end_pfn[nid])
441 return nid;
442
443 return -1;
444}
445
446/*
447 * This function is used to ask node id BEFORE memmap and mem_section's
448 * initialization (pfn_to_nid() can't be used yet).
449 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
450 */
451int memory_add_physaddr_to_nid(u64 addr)
452{
453 int nid = paddr_to_nid(addr);
454 return (nid >= 0) ? nid : 0;
455}
456
457EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
458#endif
459
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 85b52fc03084..dd27f401f0a0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,646 +2,13 @@
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h> 5#include <linux/bootmem.h>
10#include <linux/memblock.h>
11#include <linux/mmzone.h>
12#include <linux/ctype.h>
13#include <linux/module.h>
14#include <linux/nodemask.h>
15#include <linux/sched.h>
16#include <linux/acpi.h>
17
18#include <asm/e820.h>
19#include <asm/proto.h>
20#include <asm/dma.h>
21#include <asm/acpi.h>
22#include <asm/amd_nb.h>
23 6
24#include "numa_internal.h" 7#include "numa_internal.h"
25 8
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29nodemask_t numa_nodes_parsed __initdata;
30
31struct memnode memnode;
32
33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size;
35
36static struct numa_meminfo numa_meminfo __initdata;
37
38static int numa_distance_cnt;
39static u8 *numa_distance;
40
41/*
42 * Given a shift value, try to populate memnodemap[]
43 * Returns :
44 * 1 if OK
45 * 0 if memnodmap[] too small (of shift too small)
46 * -1 if node overlap or lost ram (shift too big)
47 */
48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
49{
50 unsigned long addr, end;
51 int i, res = -1;
52
53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
54 for (i = 0; i < mi->nr_blks; i++) {
55 addr = mi->blk[i].start;
56 end = mi->blk[i].end;
57 if (addr >= end)
58 continue;
59 if ((end >> shift) >= memnodemapsize)
60 return 0;
61 do {
62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
63 return -1;
64 memnodemap[addr >> shift] = mi->blk[i].nid;
65 addr += (1UL << shift);
66 } while (addr < end);
67 res = 1;
68 }
69 return res;
70}
71
72static int __init allocate_cachealigned_memnodemap(void)
73{
74 unsigned long addr;
75
76 memnodemap = memnode.embedded_map;
77 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
78 return 0;
79
80 addr = 0x8000;
81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
83 nodemap_size, L1_CACHE_BYTES);
84 if (nodemap_addr == MEMBLOCK_ERROR) {
85 printk(KERN_ERR
86 "NUMA: Unable to allocate Memory to Node hash map\n");
87 nodemap_addr = nodemap_size = 0;
88 return -1;
89 }
90 memnodemap = phys_to_virt(nodemap_addr);
91 memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
92
93 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
94 nodemap_addr, nodemap_addr + nodemap_size);
95 return 0;
96}
97
98/*
99 * The LSB of all start and end addresses in the node map is the value of the
100 * maximum possible shift.
101 */
102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
103{
104 int i, nodes_used = 0;
105 unsigned long start, end;
106 unsigned long bitfield = 0, memtop = 0;
107
108 for (i = 0; i < mi->nr_blks; i++) {
109 start = mi->blk[i].start;
110 end = mi->blk[i].end;
111 if (start >= end)
112 continue;
113 bitfield |= start;
114 nodes_used++;
115 if (end > memtop)
116 memtop = end;
117 }
118 if (nodes_used <= 1)
119 i = 63;
120 else
121 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
122 memnodemapsize = (memtop >> i)+1;
123 return i;
124}
125
126static int __init compute_hash_shift(const struct numa_meminfo *mi)
127{
128 int shift;
129
130 shift = extract_lsb_from_nodes(mi);
131 if (allocate_cachealigned_memnodemap())
132 return -1;
133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
134 shift);
135
136 if (populate_memnodemap(mi, shift) != 1) {
137 printk(KERN_INFO "Your memory is not aligned you need to "
138 "rebuild your kernel with a bigger NODEMAPSIZE "
139 "shift=%d\n", shift);
140 return -1;
141 }
142 return shift;
143}
144
145int __meminit __early_pfn_to_nid(unsigned long pfn)
146{
147 return phys_to_nid(pfn << PAGE_SHIFT);
148}
149
150static void * __init early_node_mem(int nodeid, unsigned long start,
151 unsigned long end, unsigned long size,
152 unsigned long align)
153{
154 unsigned long mem;
155
156 /*
157 * put it on high as possible
158 * something will go with NODE_DATA
159 */
160 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
161 start = MAX_DMA_PFN<<PAGE_SHIFT;
162 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
163 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
164 start = MAX_DMA32_PFN<<PAGE_SHIFT;
165 mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
166 if (mem != MEMBLOCK_ERROR)
167 return __va(mem);
168
169 /* extend the search scope */
170 end = max_pfn_mapped << PAGE_SHIFT;
171 start = MAX_DMA_PFN << PAGE_SHIFT;
172 mem = memblock_find_in_range(start, end, size, align);
173 if (mem != MEMBLOCK_ERROR)
174 return __va(mem);
175
176 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
177 size, nodeid);
178
179 return NULL;
180}
181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
239/* Initialize bootmem allocator for a node */
240void __init
241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
242{
243 unsigned long start_pfn, last_pfn, nodedata_phys;
244 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
245 int nid;
246
247 if (!end)
248 return;
249
250 /*
251 * Don't confuse VM with a node that doesn't have the
252 * minimum amount of memory:
253 */
254 if (end && (end - start) < NODE_MIN_SIZE)
255 return;
256
257 start = roundup(start, ZONE_ALIGN);
258
259 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
260 start, end);
261
262 start_pfn = start >> PAGE_SHIFT;
263 last_pfn = end >> PAGE_SHIFT;
264
265 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
266 SMP_CACHE_BYTES);
267 if (node_data[nodeid] == NULL)
268 return;
269 nodedata_phys = __pa(node_data[nodeid]);
270 memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
271 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
272 nodedata_phys + pgdat_size - 1);
273 nid = phys_to_nid(nodedata_phys);
274 if (nid != nodeid)
275 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
276
277 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
278 NODE_DATA(nodeid)->node_id = nodeid;
279 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
280 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
281
282 node_set_online(nodeid);
283}
284
285/**
286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
287 * @mi: numa_meminfo to clean up
288 *
289 * Sanitize @mi by merging and removing unncessary memblks. Also check for
290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
294 */
295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
296{
297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
300
301 for (i = 0; i < mi->nr_blks; i++) {
302 struct numa_memblk *bi = &mi->blk[i];
303
304 /* make sure all blocks are inside the limits */
305 bi->start = max(bi->start, low);
306 bi->end = min(bi->end, high);
307
308 /* and there's no empty block */
309 if (bi->start >= bi->end) {
310 numa_remove_memblk_from(i--, mi);
311 continue;
312 }
313
314 for (j = i + 1; j < mi->nr_blks; j++) {
315 struct numa_memblk *bj = &mi->blk[j];
316 unsigned long start, end;
317
318 /*
319 * See whether there are overlapping blocks. Whine
320 * about but allow overlaps of the same nid. They
321 * will be merged below.
322 */
323 if (bi->end > bj->start && bi->start < bj->end) {
324 if (bi->nid != bj->nid) {
325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
326 bi->nid, bi->start, bi->end,
327 bj->nid, bj->start, bj->end);
328 return -EINVAL;
329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
333 }
334
335 /*
336 * Join together blocks on the same node, holes
337 * between which don't overlap with memory on other
338 * nodes.
339 */
340 if (bi->nid != bj->nid)
341 continue;
342 start = max(min(bi->start, bj->start), low);
343 end = min(max(bi->end, bj->end), high);
344 for (k = 0; k < mi->nr_blks; k++) {
345 struct numa_memblk *bk = &mi->blk[k];
346
347 if (bi->nid == bk->nid)
348 continue;
349 if (start < bk->end && end > bk->start)
350 break;
351 }
352 if (k < mi->nr_blks)
353 continue;
354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
355 bi->nid, bi->start, bi->end, bj->start, bj->end,
356 start, end);
357 bi->start = start;
358 bi->end = end;
359 numa_remove_memblk_from(j--, mi);
360 }
361 }
362
363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
364 mi->blk[i].start = mi->blk[i].end = 0;
365 mi->blk[i].nid = NUMA_NO_NODE;
366 }
367
368 return 0;
369}
370
371/*
372 * Set nodes, which have memory in @mi, in *@nodemask.
373 */
374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
376{
377 int i;
378
379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
380 if (mi->blk[i].start != mi->blk[i].end &&
381 mi->blk[i].nid != NUMA_NO_NODE)
382 node_set(mi->blk[i].nid, *nodemask);
383}
384
385/**
386 * numa_reset_distance - Reset NUMA distance table
387 *
388 * The current table is freed. The next numa_set_distance() call will
389 * create a new one.
390 */
391void __init numa_reset_distance(void)
392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
394
395 /* numa_distance could be 1LU marking allocation failure, test cnt */
396 if (numa_distance_cnt)
397 memblock_x86_free_range(__pa(numa_distance),
398 __pa(numa_distance) + size);
399 numa_distance_cnt = 0;
400 numa_distance = NULL; /* enable table creation */
401}
402
403static int __init numa_alloc_distance(void)
404{
405 nodemask_t nodes_parsed;
406 size_t size;
407 int i, j, cnt = 0;
408 u64 phys;
409
410 /* size the new table and allocate it */
411 nodes_parsed = numa_nodes_parsed;
412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
413
414 for_each_node_mask(i, nodes_parsed)
415 cnt = i;
416 cnt++;
417 size = cnt * cnt * sizeof(numa_distance[0]);
418
419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
428
429 numa_distance = __va(phys);
430 numa_distance_cnt = cnt;
431
432 /* fill with the default distances */
433 for (i = 0; i < cnt; i++)
434 for (j = 0; j < cnt; j++)
435 numa_distance[i * cnt + j] = i == j ?
436 LOCAL_DISTANCE : REMOTE_DISTANCE;
437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
438
439 return 0;
440}
441
442/**
443 * numa_set_distance - Set NUMA distance from one NUMA to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
448 * Set the distance from node @from to @to to @distance. If distance table
449 * doesn't exist, one which is large enough to accommodate all the currently
450 * known nodes will be created.
451 *
452 * If such table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
461{
462 if (!numa_distance && numa_alloc_distance() < 0)
463 return;
464
465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
467 from, to, distance);
468 return;
469 }
470
471 if ((u8)distance != distance ||
472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
475 return;
476 }
477
478 numa_distance[from * numa_distance_cnt + to] = distance;
479}
480
481int __node_distance(int from, int to)
482{
483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
488
489/*
490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
497
498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
507
508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
518}
519
520static int __init numa_register_memblks(struct numa_meminfo *mi)
521{
522 int i, nid;
523
524 /* Account for nodes with cpus and no memory */
525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
528 return -EINVAL;
529
530 memnode_shift = compute_hash_shift(mi);
531 if (memnode_shift < 0) {
532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
535
536 for (i = 0; i < mi->nr_blks; i++)
537 memblock_x86_register_active_regions(mi->blk[i].nid,
538 mi->blk[i].start >> PAGE_SHIFT,
539 mi->blk[i].end >> PAGE_SHIFT);
540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
545
546 /* Finally register nodes. */
547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
550
551 for (i = 0; i < mi->nr_blks; i++) {
552 if (nid != mi->blk[i].nid)
553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
560 }
561
562 return 0;
563}
564
565/**
 * dummy_numa_init - Fallback dummy NUMA init
567 *
568 * Used if there's no underlying NUMA architecture, NUMA initialization
569 * fails, or NUMA is disabled on the command line.
570 *
571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
573 */
574static int __init dummy_numa_init(void)
575{
576 printk(KERN_INFO "%s\n",
577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
579 0LU, max_pfn << PAGE_SHIFT);
580
581 node_set(0, numa_nodes_parsed);
582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
583
584 return 0;
585}
586
/*
 * Reset all NUMA state, then build the configuration via @init_func
 * (one of the ACPI/AMD/dummy parsers).  Returns 0 on success, -errno
 * if @init_func or subsequent meminfo processing fails, in which case
 * the caller tries the next parser.
 */
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	/* start from a clean slate: no apicid->node mappings */
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	/* clear every table a previous (failed) parser may have touched */
	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	remove_all_active_ranges();
	numa_reset_distance();

	/* let the backend fill numa_nodes_parsed and numa_meminfo */
	ret = init_func();
	if (ret < 0)
		return ret;
	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	/* optional fake-NUMA layering on top of the parsed config */
	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	/* drop node links of CPUs whose node ended up offline */
	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();
	return 0;
}
626
627void __init initmem_init(void) 9void __init initmem_init(void)
628{ 10{
629 int ret; 11 x86_numa_init();
630
631 if (!numa_off) {
632#ifdef CONFIG_ACPI_NUMA
633 ret = numa_init(x86_acpi_numa_init);
634 if (!ret)
635 return;
636#endif
637#ifdef CONFIG_AMD_NUMA
638 ret = numa_init(amd_numa_init);
639 if (!ret)
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645} 12}
646 13
647unsigned long __init numa_free_all_bootmem(void) 14unsigned long __init numa_free_all_bootmem(void)
@@ -656,12 +23,3 @@ unsigned long __init numa_free_all_bootmem(void)
656 23
657 return pages; 24 return pages;
658} 25}
659
/* Return the node of @cpu as parsed from firmware, or NUMA_NO_NODE. */
int __cpuinit numa_cpu_node(int cpu)
{
	/* map the CPU's local APIC id to the node recorded for it */
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	/* APIC id unknown - no node affinity can be determined */
	return NUMA_NO_NODE;
}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index de84cc140379..d0ed086b6247 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -5,6 +5,7 @@
5#include <linux/errno.h> 5#include <linux/errno.h>
6#include <linux/topology.h> 6#include <linux/topology.h>
7#include <linux/memblock.h> 7#include <linux/memblock.h>
8#include <linux/bootmem.h>
8#include <asm/dma.h> 9#include <asm/dma.h>
9 10
10#include "numa_internal.h" 11#include "numa_internal.h"
@@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
84 nr_nodes = MAX_NUMNODES; 85 nr_nodes = MAX_NUMNODES;
85 } 86 }
86 87
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; 88 /*
89 * Calculate target node size. x86_32 freaks on __udivdi3() so do
90 * the division in ulong number of pages and convert back.
91 */
92 size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
93 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
94
88 /* 95 /*
89 * Calculate the number of big nodes that can be allocated as a result 96 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder. 97 * of consolidating the remainder.
@@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
226 */ 233 */
227 while (nodes_weight(physnode_mask)) { 234 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) { 235 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; 236 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
230 u64 start, limit, end; 237 u64 start, limit, end;
231 int phys_blk; 238 int phys_blk;
232 239
@@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{ 305{
299 static struct numa_meminfo ei __initdata; 306 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata; 307 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT; 308 const u64 max_addr = PFN_PHYS(max_pfn);
302 u8 *phys_dist = NULL; 309 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 310 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid; 311 int max_emu_nid, dfl_phys_nid;
@@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
342 if (numa_dist_cnt) { 349 if (numa_dist_cnt) {
343 u64 phys; 350 u64 phys;
344 351
345 phys = memblock_find_in_range(0, 352 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE); 353 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) { 354 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 355 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ef2d97377d7c..7178c3afe05e 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -19,6 +19,14 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi); 19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void); 20void __init numa_reset_distance(void);
21 21
22void __init x86_numa_init(void);
23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
22#ifdef CONFIG_NUMA_EMU 30#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo, 31void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt); 32 int numa_dist_cnt);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat.c
index 8e9d3394f6d4..81dbfdeb080d 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat.c
@@ -26,8 +26,6 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct bootnode nodes_add[MAX_NUMNODES];
30
31static __init int setup_node(int pxm) 29static __init int setup_node(int pxm)
32{ 30{
33 return acpi_map_pxm_to_node(pxm); 31 return acpi_map_pxm_to_node(pxm);
@@ -37,7 +35,6 @@ static __init void bad_srat(void)
37{ 35{
38 printk(KERN_ERR "SRAT: SRAT not used.\n"); 36 printk(KERN_ERR "SRAT: SRAT not used.\n");
39 acpi_numa = -1; 37 acpi_numa = -1;
40 memset(nodes_add, 0, sizeof(nodes_add));
41} 38}
42 39
43static __init inline int srat_disabled(void) 40static __init inline int srat_disabled(void)
@@ -131,73 +128,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131 pxm, apic_id, node); 128 pxm, apic_id, node);
132} 129}
133 130
134#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 131#ifdef CONFIG_MEMORY_HOTPLUG
135static inline int save_add_info(void) {return 1;} 132static inline int save_add_info(void) {return 1;}
136#else 133#else
137static inline int save_add_info(void) {return 0;} 134static inline int save_add_info(void) {return 0;}
138#endif 135#endif
139/*
140 * Update nodes_add[]
141 * This code supports one contiguous hot add area per node
142 */
143static void __init
144update_nodes_add(int node, unsigned long start, unsigned long end)
145{
146 unsigned long s_pfn = start >> PAGE_SHIFT;
147 unsigned long e_pfn = end >> PAGE_SHIFT;
148 int changed = 0;
149 struct bootnode *nd = &nodes_add[node];
150
151 /* I had some trouble with strange memory hotadd regions breaking
152 the boot. Be very strict here and reject anything unexpected.
153 If you want working memory hotadd write correct SRATs.
154
155 The node size check is a basic sanity check to guard against
156 mistakes */
157 if ((signed long)(end - start) < NODE_MIN_SIZE) {
158 printk(KERN_ERR "SRAT: Hotplug area too small\n");
159 return;
160 }
161
162 /* This check might be a bit too strict, but I'm keeping it for now. */
163 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
164 printk(KERN_ERR
165 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
166 s_pfn, e_pfn);
167 return;
168 }
169
170 /* Looks good */
171
172 if (nd->start == nd->end) {
173 nd->start = start;
174 nd->end = end;
175 changed = 1;
176 } else {
177 if (nd->start == end) {
178 nd->start = start;
179 changed = 1;
180 }
181 if (nd->end == start) {
182 nd->end = end;
183 changed = 1;
184 }
185 if (!changed)
186 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
187 }
188
189 if (changed) {
190 node_set(node, numa_nodes_parsed);
191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
192 nd->start, nd->end);
193 }
194}
195 136
196/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 137/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
197void __init 138void __init
198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 139acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
199{ 140{
200 unsigned long start, end; 141 u64 start, end;
201 int node, pxm; 142 int node, pxm;
202 143
203 if (srat_disabled()) 144 if (srat_disabled())
@@ -226,11 +167,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
226 return; 167 return;
227 } 168 }
228 169
229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 170 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
230 start, end); 171 start, end);
231
232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
233 update_nodes_add(node, start, end);
234} 172}
235 173
236void __init acpi_numa_arch_fixup(void) {} 174void __init acpi_numa_arch_fixup(void) {}
@@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void)
244 return ret; 182 return ret;
245 return srat_disabled() ? -EINVAL : 0; 183 return srat_disabled() ? -EINVAL : 0;
246} 184}
247
248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
249int memory_add_physaddr_to_nid(u64 start)
250{
251 int i, ret = 0;
252
253 for_each_node(i)
254 if (nodes_add[i].start <= start && nodes_add[i].end > start)
255 ret = i;
256
257 return ret;
258}
259EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
260#endif
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 364f36bdfad8..000000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/memblock.h>
29#include <linux/mmzone.h>
30#include <linux/acpi.h>
31#include <linux/nodemask.h>
32#include <asm/srat.h>
33#include <asm/topology.h>
34#include <asm/smp.h>
35#include <asm/e820.h>
36
37/*
38 * proximity macros and definitions
39 */
40#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
41#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
42#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
43#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
44/* bitmap length; _PXM is at most 255 */
45#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
46static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
47
48#define MAX_CHUNKS_PER_NODE 3
49#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
50struct node_memory_chunk_s {
51 unsigned long start_pfn;
52 unsigned long end_pfn;
53 u8 pxm; // proximity domain of node
54 u8 nid; // which cnode contains this chunk?
55 u8 bank; // which mem bank on this node
56};
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58
59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
/* Identify CPU proximity domains */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
{
	if (srat_disabled())
		return;
	/* a malformed entry invalidates the whole SRAT */
	if (cpu_affinity->header.length !=
	    sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}

	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
		return;		/* empty entry */

	/* mark this node as "seen" in node bitmap */
	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);

	/* don't need to check apic_id here, because it is always 8 bits */
	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;

	printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
		 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
}
100
101/*
102 * Identify memory proximity domains and hot-remove capabilities.
103 * Fill node memory chunk list structure.
104 */
105void __init
106acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
107{
108 unsigned long long paddr, size;
109 unsigned long start_pfn, end_pfn;
110 u8 pxm;
111 struct node_memory_chunk_s *p, *q, *pend;
112
113 if (srat_disabled())
114 return;
115 if (memory_affinity->header.length !=
116 sizeof(struct acpi_srat_mem_affinity)) {
117 bad_srat();
118 return;
119 }
120
121 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
122 return; /* empty entry */
123
124 pxm = memory_affinity->proximity_domain & 0xff;
125
126 /* mark this node as "seen" in node bitmap */
127 BMAP_SET(pxm_bitmap, pxm);
128
129 /* calculate info for memory chunk structure */
130 paddr = memory_affinity->base_address;
131 size = memory_affinity->length;
132
133 start_pfn = paddr >> PAGE_SHIFT;
134 end_pfn = (paddr + size) >> PAGE_SHIFT;
135
136
137 if (num_memory_chunks >= MAXCHUNKS) {
138 printk(KERN_WARNING "Too many mem chunks in SRAT."
139 " Ignoring %lld MBytes at %llx\n",
140 size/(1024*1024), paddr);
141 return;
142 }
143
144 /* Insertion sort based on base address */
145 pend = &node_memory_chunk[num_memory_chunks];
146 for (p = &node_memory_chunk[0]; p < pend; p++) {
147 if (start_pfn < p->start_pfn)
148 break;
149 }
150 if (p < pend) {
151 for (q = pend; q >= p; q--)
152 *(q + 1) = *q;
153 }
154 p->start_pfn = start_pfn;
155 p->end_pfn = end_pfn;
156 p->pxm = pxm;
157
158 num_memory_chunks++;
159
160 printk(KERN_DEBUG "Memory range %08lx to %08lx"
161 " in proximity domain %02x %s\n",
162 start_pfn, end_pfn,
163 pxm,
164 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
165 "enabled and removable" : "enabled" ) );
166}
167
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	/* intentionally empty: SLIT distance data is not used here */
}
172
/* ACPI NUMA callback; nothing to fix up on this configuration. */
void acpi_numa_arch_fixup(void)
{
}
/*
 * The SRAT table always lists ascending addresses, so can always
 * assume that the first "start" address that you see is the real
 * start of the node, and that the current "end" address is after
 * the previous one.
 *
 * Fold @memory_chunk into node @nid's [node_start_pfn, node_end_pfn]
 * span.  Returns 0 if the chunk was used, -1 if it was skipped.
 */
static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
{
	/*
	 * Only add present memory as told by the e820.
	 * There is no guarantee from the SRAT that the memory it
	 * enumerates is present at boot time because it represents
	 * *possible* memory hotplug areas the same as normal RAM.
	 */
	if (memory_chunk->start_pfn >= max_pfn) {
		printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
			memory_chunk->start_pfn, memory_chunk->end_pfn);
		return -1;	/* chunk lies entirely above usable memory */
	}
	if (memory_chunk->nid != nid)
		return -1;	/* chunk belongs to a different node */

	/* first memory seen for this node: anchor its start pfn */
	if (!node_has_online_mem(nid))
		node_start_pfn[nid] = memory_chunk->start_pfn;

	/* grow the node span downward/upward to include this chunk */
	if (node_start_pfn[nid] > memory_chunk->start_pfn)
		node_start_pfn[nid] = memory_chunk->start_pfn;

	if (node_end_pfn[nid] < memory_chunk->end_pfn)
		node_end_pfn[nid] = memory_chunk->end_pfn;

	return 0;
}
209
/*
 * Build the NUMA memory configuration from the ACPI SRAT.
 * Returns 1 on success, 0 if the SRAT is unusable (caller falls back
 * to another configuration method).
 */
int __init get_memcfg_from_srat(void)
{
	int i, j, nid;

	if (srat_disabled())
		goto out_fail;

	if (acpi_numa_init() < 0)
		goto out_fail;

	if (num_memory_chunks == 0) {
		printk(KERN_DEBUG
			 "could not find any ACPI SRAT memory areas.\n");
		goto out_fail;
	}

	/* Calculate total number of nodes in system from PXM bitmap and create
	 * a set of sequential node IDs starting at zero.  (ACPI doesn't seem
	 * to specify the range of _PXM values.)
	 */
	/*
	 * MCD - we no longer HAVE to number nodes sequentially.  PXM domain
	 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
	 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
	 * approaches MAX_PXM_DOMAINS for i386.
	 */
	nodes_clear(node_online_map);
	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
		if (BMAP_TEST(pxm_bitmap, i)) {
			/* NOTE(review): this 'nid' shadows the outer one */
			int nid = acpi_map_pxm_to_node(i);
			node_set_online(nid);
		}
	}
	BUG_ON(num_online_nodes() == 0);

	/* set cnode id in memory chunk structure */
	for (i = 0; i < num_memory_chunks; i++)
		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);

	printk(KERN_DEBUG "pxm bitmap: ");
	for (i = 0; i < sizeof(pxm_bitmap); i++) {
		printk(KERN_CONT "%02x ", pxm_bitmap[i]);
	}
	printk(KERN_CONT "\n");
	printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
			num_online_nodes());
	printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
			 num_memory_chunks);

	/* apicid -> node, via the pxm recorded for each apicid */
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));

	/* register each chunk's present pages with memblock */
	for (j = 0; j < num_memory_chunks; j++){
		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
		printk(KERN_DEBUG
			"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
			 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
		/* skip chunks above max_pfn or with a mismatched nid */
		if (node_read_chunk(chunk->nid, chunk))
			continue;

		memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
						     min(chunk->end_pfn, max_pfn));
	}
	/* for out of order entries in SRAT */
	sort_node_map();

	/* clamp node spans to usable memory and size the remap areas */
	for_each_online_node(nid) {
		unsigned long start = node_start_pfn[nid];
		unsigned long end = min(node_end_pfn[nid], max_pfn);

		memory_present(nid, start, end);
		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
	}
	return 1;
out_fail:
	printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
			" table\n");
	return 0;
}