x86: Interleave emulated nodes over physical nodes

Add interleaved NUMA emulation support. This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There are a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on one physical node than on another. This patch implements the second option; we accept the possibility that we may have slightly more emulated nodes on a particular physical node compared to another, rather than accept node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. 
The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: David Rientjes <rientjes@google.com> 2009-09-25 18:20:09 -0400
committer: Ingo Molnar <mingo@elte.hu> 2009-10-12 16:56:46 -0400
commit: adc1938994f7f1112d335d998b5218b0aa680ad6 (patch)
tree: 66b15981e346145fba39e3560ef8b192e2c7e10d /arch/x86/mm/numa_64.c
parent: 8716273caef7f55f39fe4fc6c69c5f9f197f41f1 (diff)
1 files changed, 184 insertions, 27 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d1a3d94efc8e..086f98a66d80 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,8 +306,71 @@ void __init numa_init_array(void)
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+                                        int acpi, int k8)
+{
+        int nr_nodes = 0;
+        int ret = 0;
+        int i;
+#ifdef CONFIG_ACPI_NUMA
+        if (acpi)
+                nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+        if (k8)
+                nr_nodes = k8_get_nodes(physnodes);
+#endif
+        /*
+         * Clamp each reported node to [start, end]; a node wholly outside it
+         * is emptied (start == end).  The SRAT or K8 may have misreported the
+         * topology, or the mem= kernel parameter may be in use.
+         */
+        for (i = 0; i < nr_nodes; i++) {
+                if (physnodes[i].start == physnodes[i].end)
+                        continue;
+                if (physnodes[i].start > end) {
+                        physnodes[i].end = physnodes[i].start;
+                        continue;
+                }
+                if (physnodes[i].end < start) {
+                        physnodes[i].start = physnodes[i].end;
+                        continue;
+                }
+                if (physnodes[i].start < start)
+                        physnodes[i].start = start;
+                if (physnodes[i].end > end)
+                        physnodes[i].end = end;
+        }
+        /*
+         * Compact physnodes[] in place: empty entries are skipped so the first
+         * ret slots hold the nodes that still have memory in the range.
+         */
+        for (i = 0; i < nr_nodes; i++) {
+                if (physnodes[i].start == physnodes[i].end)
+                        continue;
+                physnodes[ret].start = physnodes[i].start;
+                physnodes[ret].end = physnodes[i].end;
+                ret++;
+        }
+        /*
+         * If no node survived (or none was reported), a single node is faked
+         * to cover the whole start..end range.
+         */
+        if (!ret) {
+                physnodes[ret].start = start;
+                physnodes[ret].end = end;
+                ret = 1;
+        }
+        return ret;
+}
 /*
 * Setups up nid to range from addr to addr + size.  If the end
 * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +378,9 @@ static char *cmdline __initdata;
 * allocation past addr and -1 otherwise.  addr is adjusted to be at
 * the end of the node.
 */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
-                                   u64 size, u64 max_addr)
 {
        int ret = 0;
        nodes[nid].start = *addr;
        *addr += size;
        if (*addr >= max_addr) {
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 /*
+ * Interleaves nr_nodes fake nodes over physnodes[0..nr_phys_nodes), sized from
+ * the non-hole memory in addr..max_addr.  Returns the node count, -1 on error.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+                                                int nr_phys_nodes, int nr_nodes)
+{
+        nodemask_t physnode_mask = NODE_MASK_NONE;
+        u64 size;
+        int big;
+        int ret = 0;
+        int i;
+        if (nr_nodes <= 0)
+                return -1;
+        if (nr_nodes > MAX_NUMNODES) {
+                pr_info("numa=fake=%d too large, reducing to %d\n",
+                        nr_nodes, MAX_NUMNODES);
+                nr_nodes = MAX_NUMNODES;
+        }
+        size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+        /*
+         * Rounding size down (below) strands a remainder on each node.  Summed
+         * over nr_nodes, it yields this many FAKE_NODE_MIN_SIZE-larger nodes.
+         */
+        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+                FAKE_NODE_MIN_SIZE;
+        size &= FAKE_NODE_MIN_HASH_MASK;
+        if (!size) {
+                pr_err("Not enough memory for each node.  "
+                        "NUMA emulation disabled.\n");
+                return -1;
+        }
+        for (i = 0; i < nr_phys_nodes; i++)
+                if (physnodes[i].start != physnodes[i].end)
+                        node_set(i, physnode_mask);
+        /*
+         * Continue to fill physical nodes with fake nodes until there is no
+         * memory left on any of them.
+         */
+        while (nodes_weight(physnode_mask)) {
+                for_each_node_mask(i, physnode_mask) {
+                        u64 end = physnodes[i].start + size;
+                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+                        if (ret < big)
+                                end += FAKE_NODE_MIN_SIZE;
+                        /*
+                         * Continue to add memory to this fake node if its
+                         * non-reserved memory is less than the per-node size.
+                         */
+                        while (end - physnodes[i].start -
+                                e820_hole_size(physnodes[i].start, end) < size) {
+                                end += FAKE_NODE_MIN_SIZE;
+                                if (end > physnodes[i].end) {
+                                        end = physnodes[i].end;
+                                        break;
+                                }
+                        }
+                        /*
+                         * If there won't be at least FAKE_NODE_MIN_SIZE of
+                         * non-reserved memory in ZONE_DMA32 for the next node,
+                         * this one must extend to the boundary.
+                         */
+                        if (end < dma32_end && dma32_end - end -
+                            e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+                                end = dma32_end;
+                        /*
+                         * If there won't be enough non-reserved memory for the
+                         * next node, this one must extend to the end of the
+                         * physical node.
+                         */
+                        if (physnodes[i].end - end -
+                            e820_hole_size(end, physnodes[i].end) < size)
+                                end = physnodes[i].end;
+                        /*
+                         * Avoid allocating more nodes than requested, which can
+                         * happen as a result of rounding down each node's size
+                         * to FAKE_NODE_MIN_SIZE.
+                         */
+                        if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+                                end = physnodes[i].end;
+                        if (setup_node_range(ret++, &physnodes[i].start,
+                                                end - physnodes[i].start,
+                                                physnodes[i].end) < 0)
+                                node_clear(i, physnode_mask);
+                }
+        }
+        return ret;
+}
+/*
 * Splits num_nodes nodes up equally starting at node_start.  The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
-                                      u64 max_addr, int node_start,
                                      int num_nodes)
 {
        unsigned int big;
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
                                        break;
                                }
                        }
-                if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+                if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
                        break;
        }
        return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 * always assigned to a final node and can be asymmetric.  Returns the number of
 * nodes split.
 */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
-                                      u64 max_addr, int node_start, u64 size)
+                                      u64 size)
 {
        int i = node_start;
        size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-        while (!setup_node_range(i++, nodes, addr, size, max_addr))
+        while (!setup_node_range(i++, addr, size, max_addr))
                ;
        return i - node_start;
 }
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static int __init numa_emulation(unsigned long start_pfn,
+                        unsigned long last_pfn, int acpi, int k8)
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
 {
        u64 size, addr = start_pfn << PAGE_SHIFT;
        u64 max_addr = last_pfn << PAGE_SHIFT;
        int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+        int num_phys_nodes;
-        memset(&nodes, 0, sizeof(nodes));
+        num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
        /*
         * If the numa=fake command-line is just a single number N, split the
         * system RAM into N fake nodes.
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
        if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
                long n = simple_strtol(cmdline, NULL, 0);
-                num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+                num_nodes = split_nodes_interleave(addr, max_addr,
+                                                        num_phys_nodes, n);
                if (num_nodes < 0)
                        return num_nodes;
                goto out;
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
                        size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
                        if (size)
                                for (i = 0; i < coeff; i++, num_nodes++)
-                                        if (setup_node_range(num_nodes, nodes,
+                                        if (setup_node_range(num_nodes, &addr,
-                                                &addr, size, max_addr) < 0)
+                                                size, max_addr) < 0)
                                                goto done;
                        if (!*cmdline)
                                break;
@@ -473,7 +634,7 @@ done:
        if (addr < max_addr) {
                if (coeff_flag && coeff < 0) {
                        /* Split remaining nodes into num-sized chunks */
-                        num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+                        num_nodes += split_nodes_by_size(&addr, max_addr,
                                                         num_nodes, num);
                        goto out;
                }
@@ -482,7 +643,7 @@ done:
                        /* Split remaining nodes into coeff chunks */
                        if (coeff <= 0)
                                break;
-                        num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+                        num_nodes += split_nodes_equally(&addr, max_addr,
                                                         num_nodes, coeff);
                        break;
                case ',':
@@ -490,8 +651,8 @@ done:
                        break;
                default:
                        /* Give one final node */
-                        setup_node_range(num_nodes, nodes, &addr,
+                        setup_node_range(num_nodes, &addr, max_addr - addr,
-                                         max_addr - addr, max_addr);
+                                         max_addr);
                        num_nodes++;
                }
        }
@@ -505,14 +666,10 @@ out:
        }
        /*
-         * We need to vacate all active ranges that may have been registered by
+         * We need to vacate all active ranges that may have been registered for
-         * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+         * the e820 memory map.
-         * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
         */
        remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-        acpi_numa = -1;
-#endif
        for_each_node_mask(i, node_possible_map) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                                nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
        nodes_clear(node_online_map);
 #ifdef CONFIG_NUMA_EMU
-        if (cmdline && !numa_emulation(start_pfn, last_pfn))
+        if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
                return;
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
author	David Rientjes <rientjes@google.com>	2009-09-25 18:20:09 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-10-12 16:56:46 -0400
commit	adc1938994f7f1112d335d998b5218b0aa680ad6 (patch)
tree	66b15981e346145fba39e3560ef8b192e2c7e10d /arch/x86/mm/numa_64.c
parent	8716273caef7f55f39fe4fc6c69c5f9f197f41f1 (diff)

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d1a3d94efc8e..086f98a66d80 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,8 +306,71 @@ void __init numa_init_array(void)
306		306
307	#ifdef CONFIG_NUMA_EMU	307	#ifdef CONFIG_NUMA_EMU
308	/* Numa emulation */	308	/* Numa emulation */
		309	static struct bootnode nodes[MAX_NUMNODES] __initdata;
		310	static struct bootnode physnodes[MAX_NUMNODES] __initdata;
309	static char *cmdline __initdata;	311	static char *cmdline __initdata;
310		312
		313	static int __init setup_physnodes(unsigned long start, unsigned long end,
		314	int acpi, int k8)
		315	{
		316	int nr_nodes = 0;
		317	int ret = 0;
		318	int i;
		319
		320	#ifdef CONFIG_ACPI_NUMA
		321	if (acpi)
		322	nr_nodes = acpi_get_nodes(physnodes);
		323	#endif
		324	#ifdef CONFIG_K8_NUMA
		325	if (k8)
		326	nr_nodes = k8_get_nodes(physnodes);
		327	#endif
		328	/*
		329	* Basic sanity checking on the physical node map: there may be errors
		330	* if the SRAT or K8 incorrectly reported the topology or the mem=
		331	* kernel parameter is used.
		332	*/
		333	for (i = 0; i < nr_nodes; i++) {
		334	if (physnodes[i].start == physnodes[i].end)
		335	continue;
		336	if (physnodes[i].start > end) {
		337	physnodes[i].end = physnodes[i].start;
		338	continue;
		339	}
		340	if (physnodes[i].end < start) {
		341	physnodes[i].start = physnodes[i].end;
		342	continue;
		343	}
		344	if (physnodes[i].start < start)
		345	physnodes[i].start = start;
		346	if (physnodes[i].end > end)
		347	physnodes[i].end = end;
		348	}
		349
		350	/*
		351	* Remove all nodes that have no memory or were truncated because of the
		352	* limited address range.
		353	*/
		354	for (i = 0; i < nr_nodes; i++) {
		355	if (physnodes[i].start == physnodes[i].end)
		356	continue;
		357	physnodes[ret].start = physnodes[i].start;
		358	physnodes[ret].end = physnodes[i].end;
		359	ret++;
		360	}
		361
		362	/*
		363	* If no physical topology was detected, a single node is faked to cover
		364	* the entire address space.
		365	*/
		366	if (!ret) {
		367	physnodes[ret].start = start;
		368	physnodes[ret].end = end;
		369	ret = 1;
		370	}
		371	return ret;
		372	}
		373
311	/*	374	/*
312	* Setups up nid to range from addr to addr + size. If the end	375	* Setups up nid to range from addr to addr + size. If the end
313	* boundary is greater than max_addr, then max_addr is used instead.	376	* boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +378,9 @@ static char *cmdline __initdata;
315	* allocation past addr and -1 otherwise. addr is adjusted to be at	378	* allocation past addr and -1 otherwise. addr is adjusted to be at
316	* the end of the node.	379	* the end of the node.
317	*/	380	*/
318	static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,	381	static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
319	u64 size, u64 max_addr)
320	{	382	{
321	int ret = 0;	383	int ret = 0;
322
323	nodes[nid].start = *addr;	384	nodes[nid].start = *addr;
324	*addr += size;	385	*addr += size;
325	if (*addr >= max_addr) {	386	if (*addr >= max_addr) {
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
335	}	396	}
336		397
337	/*	398	/*
		399	* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
		400	* to max_addr. The return value is the number of nodes allocated.
		401	*/
		402	static int __init split_nodes_interleave(u64 addr, u64 max_addr,
		403	int nr_phys_nodes, int nr_nodes)
		404	{
		405	nodemask_t physnode_mask = NODE_MASK_NONE;
		406	u64 size;
		407	int big;
		408	int ret = 0;
		409	int i;
		410
		411	if (nr_nodes <= 0)
		412	return -1;
		413	if (nr_nodes > MAX_NUMNODES) {
		414	pr_info("numa=fake=%d too large, reducing to %d\n",
		415	nr_nodes, MAX_NUMNODES);
		416	nr_nodes = MAX_NUMNODES;
		417	}
		418
		419	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
		420	/*
		421	* Calculate the number of big nodes that can be allocated as a result
		422	* of consolidating the remainder.
		423	*/
		424	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
		425	FAKE_NODE_MIN_SIZE;
		426
		427	size &= FAKE_NODE_MIN_HASH_MASK;
		428	if (!size) {
		429	pr_err("Not enough memory for each node. "
		430	"NUMA emulation disabled.\n");
		431	return -1;
		432	}
		433
		434	for (i = 0; i < nr_phys_nodes; i++)
		435	if (physnodes[i].start != physnodes[i].end)
		436	node_set(i, physnode_mask);
		437
		438	/*
		439	* Continue to fill physical nodes with fake nodes until there is no
		440	* memory left on any of them.
		441	*/
		442	while (nodes_weight(physnode_mask)) {
		443	for_each_node_mask(i, physnode_mask) {
		444	u64 end = physnodes[i].start + size;
		445	u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
		446
		447	if (ret < big)
		448	end += FAKE_NODE_MIN_SIZE;
		449
		450	/*
		451	* Continue to add memory to this fake node if its
		452	* non-reserved memory is less than the per-node size.
		453	*/
		454	while (end - physnodes[i].start -
		455	e820_hole_size(physnodes[i].start, end) < size) {
		456	end += FAKE_NODE_MIN_SIZE;
		457	if (end > physnodes[i].end) {
		458	end = physnodes[i].end;
		459	break;
		460	}
		461	}
		462
		463	/*
		464	* If there won't be at least FAKE_NODE_MIN_SIZE of
		465	* non-reserved memory in ZONE_DMA32 for the next node,
		466	* this one must extend to the boundary.
		467	*/
		468	if (end < dma32_end && dma32_end - end -
		469	e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
		470	end = dma32_end;
		471
		472	/*
		473	* If there won't be enough non-reserved memory for the
		474	* next node, this one must extend to the end of the
		475	* physical node.
		476	*/
		477	if (physnodes[i].end - end -
		478	e820_hole_size(end, physnodes[i].end) < size)
		479	end = physnodes[i].end;
		480
		481	/*
		482	* Avoid allocating more nodes than requested, which can
		483	* happen as a result of rounding down each node's size
		484	* to FAKE_NODE_MIN_SIZE.
		485	*/
		486	if (nodes_weight(physnode_mask) + ret >= nr_nodes)
		487	end = physnodes[i].end;
		488
		489	if (setup_node_range(ret++, &physnodes[i].start,
		490	end - physnodes[i].start,
		491	physnodes[i].end) < 0)
		492	node_clear(i, physnode_mask);
		493	}
		494	}
		495	return ret;
		496	}
		497
		498	/*
338	* Splits num_nodes nodes up equally starting at node_start. The return value	499	* Splits num_nodes nodes up equally starting at node_start. The return value
339	* is the number of nodes split up and addr is adjusted to be at the end of the	500	* is the number of nodes split up and addr is adjusted to be at the end of the
340	* last node allocated.	501	* last node allocated.
341	*/	502	*/
342	static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,	503	static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
343	u64 max_addr, int node_start,
344	int num_nodes)	504	int num_nodes)
345	{	505	{
346	unsigned int big;	506	unsigned int big;
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
388	break;	548	break;
389	}	549	}
390	}	550	}
391	if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)	551	if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
392	break;	552	break;
393	}	553	}
394	return i - node_start + 1;	554	return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
399	* always assigned to a final node and can be asymmetric. Returns the number of	559	* always assigned to a final node and can be asymmetric. Returns the number of
400	* nodes split.	560	* nodes split.
401	*/	561	*/
402	static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,	562	static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
403	u64 max_addr, int node_start, u64 size)	563	u64 size)
404	{	564	{
405	int i = node_start;	565	int i = node_start;
406	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;	566	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
407	while (!setup_node_range(i++, nodes, addr, size, max_addr))	567	while (!setup_node_range(i++, addr, size, max_addr))
408	;	568	;
409	return i - node_start;	569	return i - node_start;
410	}	570	}
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413	* Sets up the system RAM area from start_pfn to last_pfn according to the	573	* Sets up the system RAM area from start_pfn to last_pfn according to the
414	* numa=fake command-line option.	574	* numa=fake command-line option.
415	*/	575	*/
416	static struct bootnode nodes[MAX_NUMNODES] __initdata;	576	static int __init numa_emulation(unsigned long start_pfn,
417		577	unsigned long last_pfn, int acpi, int k8)
418	static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
419	{	578	{
420	u64 size, addr = start_pfn << PAGE_SHIFT;	579	u64 size, addr = start_pfn << PAGE_SHIFT;
421	u64 max_addr = last_pfn << PAGE_SHIFT;	580	u64 max_addr = last_pfn << PAGE_SHIFT;
422	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;	581	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
		582	int num_phys_nodes;
423		583
424	memset(&nodes, 0, sizeof(nodes));	584	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
425	/*	585	/*
426	* If the numa=fake command-line is just a single number N, split the	586	* If the numa=fake command-line is just a single number N, split the
427	* system RAM into N fake nodes.	587	* system RAM into N fake nodes.
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
429	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {	589	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
430	long n = simple_strtol(cmdline, NULL, 0);	590	long n = simple_strtol(cmdline, NULL, 0);
431		591
432	num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);	592	num_nodes = split_nodes_interleave(addr, max_addr,
		593	num_phys_nodes, n);
433	if (num_nodes < 0)	594	if (num_nodes < 0)
434	return num_nodes;	595	return num_nodes;
435	goto out;	596	goto out;
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
456	size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;	617	size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
457	if (size)	618	if (size)
458	for (i = 0; i < coeff; i++, num_nodes++)	619	for (i = 0; i < coeff; i++, num_nodes++)
459	if (setup_node_range(num_nodes, nodes,	620	if (setup_node_range(num_nodes, &addr,
460	&addr, size, max_addr) < 0)	621	size, max_addr) < 0)
461	goto done;	622	goto done;
462	if (!*cmdline)	623	if (!*cmdline)
463	break;	624	break;
@@ -473,7 +634,7 @@ done:
473	if (addr < max_addr) {	634	if (addr < max_addr) {
474	if (coeff_flag && coeff < 0) {	635	if (coeff_flag && coeff < 0) {
475	/* Split remaining nodes into num-sized chunks */	636	/* Split remaining nodes into num-sized chunks */
476	num_nodes += split_nodes_by_size(nodes, &addr, max_addr,	637	num_nodes += split_nodes_by_size(&addr, max_addr,
477	num_nodes, num);	638	num_nodes, num);
478	goto out;	639	goto out;
479	}	640	}
@@ -482,7 +643,7 @@ done:
482	/* Split remaining nodes into coeff chunks */	643	/* Split remaining nodes into coeff chunks */
483	if (coeff <= 0)	644	if (coeff <= 0)
484	break;	645	break;
485	num_nodes += split_nodes_equally(nodes, &addr, max_addr,	646	num_nodes += split_nodes_equally(&addr, max_addr,
486	num_nodes, coeff);	647	num_nodes, coeff);
487	break;	648	break;
488	case ',':	649	case ',':
@@ -490,8 +651,8 @@ done:
490	break;	651	break;
491	default:	652	default:
492	/* Give one final node */	653	/* Give one final node */
493	setup_node_range(num_nodes, nodes, &addr,	654	setup_node_range(num_nodes, &addr, max_addr - addr,
494	max_addr - addr, max_addr);	655	max_addr);
495	num_nodes++;	656	num_nodes++;
496	}	657	}
497	}	658	}
@@ -505,14 +666,10 @@ out:
505	}	666	}
506		667
507	/*	668	/*
508	* We need to vacate all active ranges that may have been registered by	669	* We need to vacate all active ranges that may have been registered for
509	* SRAT and set acpi_numa to -1 so that srat_disabled() always returns	670	* the e820 memory map.
510	* true. NUMA emulation has succeeded so we will not scan ACPI nodes.
511	*/	671	*/
512	remove_all_active_ranges();	672	remove_all_active_ranges();
513	#ifdef CONFIG_ACPI_NUMA
514	acpi_numa = -1;
515	#endif
516	for_each_node_mask(i, node_possible_map) {	673	for_each_node_mask(i, node_possible_map) {
517	e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,	674	e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
518	nodes[i].end >> PAGE_SHIFT);	675	nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
533	nodes_clear(node_online_map);	690	nodes_clear(node_online_map);
534		691
535	#ifdef CONFIG_NUMA_EMU	692	#ifdef CONFIG_NUMA_EMU
536	if (cmdline && !numa_emulation(start_pfn, last_pfn))	693	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
537	return;	694	return;
538	nodes_clear(node_possible_map);	695	nodes_clear(node_possible_map);
539	nodes_clear(node_online_map);	696	nodes_clear(node_online_map);