aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Amul Shah <amul.shah@unisys.com> 2007-02-13 07:26:19 -0500
committer Andi Kleen <andi@basil.nowhere.org> 2007-02-13 07:26:19 -0500
commit 076422d2af7e3d8e72c6e70843f6ea377714b082 (patch)
tree 942ec9d2e7f3f74d6694af99a745ee74ef851268
parent 0812a579c92fefa57506821fa08e90f47cb6dbdd (diff)
[PATCH] x86-64: Allocate the NUMA hash function nodemap dynamically
Remove the statically allocated memory-to-NUMA-node hash map in favor of a dynamically allocated, cache-aligned memory-to-node hash map. This patch has the nice side effect of allowing the hash map to grow on systems that have large amounts of memory (256GB - 1TB) but suffer from having a small PCI space tacked onto the boot node (somewhere between 192MB and 512MB on the ES7000). Signed-off-by: Amul Shah <amul.shah@unisys.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Andi Kleen <ak@suse.de> Cc: Rohit Seth <rohitseth@google.com> Signed-off-by: Andrew Morton <akpm@osdl.org>
-rw-r--r-- arch/x86_64/kernel/e820.c 7
-rw-r--r-- arch/x86_64/kernel/setup.c 5
-rw-r--r-- arch/x86_64/mm/numa.c 74
-rw-r--r-- include/asm-x86_64/e820.h 1
-rw-r--r-- include/asm-x86_64/mmzone.h 13
5 files changed, 85 insertions, 15 deletions
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 6fe191c58084..9d67955bbc31 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
83 return 1; 83 return 1;
84 } 84 }
85 85
86#ifdef CONFIG_NUMA
87 /* NUMA memory to node map */
88 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
89 *addrp = nodemap_addr + nodemap_size;
90 return 1;
91 }
92#endif
86 /* XXX ramdisk image here? */ 93 /* XXX ramdisk image here? */
87 return 0; 94 return 0;
88} 95}
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 60477244d1a3..f330f8285499 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -444,6 +444,11 @@ void __init setup_arch(char **cmdline_p)
444 /* reserve ebda region */ 444 /* reserve ebda region */
445 if (ebda_addr) 445 if (ebda_addr)
446 reserve_bootmem_generic(ebda_addr, ebda_size); 446 reserve_bootmem_generic(ebda_addr, ebda_size);
447#ifdef CONFIG_NUMA
448 /* reserve nodemap region */
449 if (nodemap_addr)
450 reserve_bootmem_generic(nodemap_addr, nodemap_size);
451#endif
447 452
448#ifdef CONFIG_SMP 453#ifdef CONFIG_SMP
449 /* 454 /*
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 2ee2e003606c..7d9c428f4094 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
37 37
38int numa_off __initdata; 38int numa_off __initdata;
39unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size;
39 41
40 42
41/* 43/*
@@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
52 int res = -1; 54 int res = -1;
53 unsigned long addr, end; 55 unsigned long addr, end;
54 56
55 if (shift >= 64) 57 memset(memnodemap, 0xff, memnodemapsize);
56 return -1;
57 memset(memnodemap, 0xff, sizeof(memnodemap));
58 for (i = 0; i < numnodes; i++) { 58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start; 59 addr = nodes[i].start;
60 end = nodes[i].end; 60 end = nodes[i].end;
61 if (addr >= end) 61 if (addr >= end)
62 continue; 62 continue;
63 if ((end >> shift) >= NODEMAPSIZE) 63 if ((end >> shift) >= memnodemapsize)
64 return 0; 64 return 0;
65 do { 65 do {
66 if (memnodemap[addr >> shift] != 0xff) 66 if (memnodemap[addr >> shift] != 0xff)
67 return -1; 67 return -1;
68 memnodemap[addr >> shift] = i; 68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift); 69 addr += (1UL << shift);
70 } while (addr < end); 70 } while (addr < end);
71 res = 1; 71 res = 1;
72 } 72 }
73 return res; 73 return res;
74} 74}
75 75
76int __init compute_hash_shift(struct bootnode *nodes, int numnodes) 76static int __init allocate_cachealigned_memnodemap(void)
77{
78 unsigned long pad, pad_addr;
79
80 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48) {
82 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
83 nodemap_addr, nodemap_addr + nodemap_size);
84 return 0;
85 }
86
87 pad = L1_CACHE_BYTES - 1;
88 pad_addr = 0x8000;
89 nodemap_size = pad + memnodemapsize;
90 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
91 nodemap_size);
92 if (nodemap_addr == -1UL) {
93 printk(KERN_ERR
94 "NUMA: Unable to allocate Memory to Node hash map\n");
95 nodemap_addr = nodemap_size = 0;
96 return -1;
97 }
98 pad_addr = (nodemap_addr + pad) & ~pad;
99 memnodemap = phys_to_virt(pad_addr);
100
101 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
102 nodemap_addr, nodemap_addr + nodemap_size);
103 return 0;
104}
105
106/*
107 * The LSB of all start and end addresses in the node map is the value of the
108 * maximum possible shift.
109 */
110static int __init
111extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
77{ 112{
78 int shift = 20; 113 int i;
114 unsigned long start, end;
115 unsigned long bitfield = 0, memtop = 0;
79 116
80 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) 117 for (i = 0; i < numnodes; i++) {
81 shift++; 118 start = nodes[i].start;
119 end = nodes[i].end;
120 if (start >= end)
121 continue;
122 bitfield |= start | end;
123 if (end > memtop)
124 memtop = end;
125 }
126 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
127 memnodemapsize = (memtop >> i)+1;
128 return i;
129}
130
131int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
132{
133 int shift;
82 134
135 shift = extract_lsb_from_nodes(nodes, numnodes);
136 if (allocate_cachealigned_memnodemap())
137 return -1;
83 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 138 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
84 shift); 139 shift);
85 140
@@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
290 end_pfn << PAGE_SHIFT); 345 end_pfn << PAGE_SHIFT);
291 /* setup dummy node covering all memory */ 346 /* setup dummy node covering all memory */
292 memnode_shift = 63; 347 memnode_shift = 63;
348 memnodemap = memnode.embedded_map;
293 memnodemap[0] = 0; 349 memnodemap[0] = 0;
294 nodes_clear(node_online_map); 350 nodes_clear(node_online_map);
295 node_set_online(0); 351 node_set_online(0);
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index fa2086774105..855fb4a454b6 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -56,6 +56,7 @@ extern void finish_e820_parsing(void);
56extern struct e820map e820; 56extern struct e820map e820;
57 57
58extern unsigned ebda_addr, ebda_size; 58extern unsigned ebda_addr, ebda_size;
59extern unsigned long nodemap_addr, nodemap_size;
59#endif/*!__ASSEMBLY__*/ 60#endif/*!__ASSEMBLY__*/
60 61
61#endif/*__E820_HEADER*/ 62#endif/*__E820_HEADER*/
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index c38ebdf6f426..39ef106986eb 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -11,24 +11,25 @@
11 11
12#include <asm/smp.h> 12#include <asm/smp.h>
13 13
14/* Should really switch to dynamic allocation at some point */
15#define NODEMAPSIZE 0x4fff
16
17/* Simple perfect hash to map physical addresses to node numbers */ 14/* Simple perfect hash to map physical addresses to node numbers */
18struct memnode { 15struct memnode {
19 int shift; 16 int shift;
20 u8 map[NODEMAPSIZE]; 17 unsigned int mapsize;
21} ____cacheline_aligned; 18 u8 *map;
19 u8 embedded_map[64-16];
20} ____cacheline_aligned; /* total size = 64 bytes */
22extern struct memnode memnode; 21extern struct memnode memnode;
23#define memnode_shift memnode.shift 22#define memnode_shift memnode.shift
24#define memnodemap memnode.map 23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25 25
26extern struct pglist_data *node_data[]; 26extern struct pglist_data *node_data[];
27 27
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{ 29{
30 unsigned nid; 30 unsigned nid;
31 VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); 31 VIRTUAL_BUG_ON(!memnodemap);
32 VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
32 nid = memnodemap[addr >> memnode_shift]; 33 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 34 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid; 35 return nid;