aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2005-08-26 21:34:10 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-08-26 22:37:12 -0400
commit485761bd6a72d33b3d4fa884927b2b0d983b701e (patch)
treec75562513489f62c8dcfd41acd467bca3d3202cc /arch
parentbebf4688e9dbbfdd421736685d607bced91a3c91 (diff)
[PATCH] x86_64: Tell VM about holes in nodes
Some nodes can have large holes on x86-64. This fixes problems with the VM allowing too many dirty pages because it overestimates the number of available RAM in a node. In extreme cases you can end up with all RAM filled with dirty pages which can lead to deadlocks and other nasty behaviour. This patch just tells the VM about the known holes from e820. Reserved (like the kernel text or mem_map) is still not taken into account, but that should be only a few percent error now. Small detail is that the flat setup uses the NUMA free_area_init_node() now too because it offers more flexibility. (akpm: lotsa thanks to Martin for working this problem out) Cc: Martin Bligh <mbligh@mbligh.org> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86_64/kernel/e820.c34
-rw-r--r--arch/x86_64/mm/init.c16
-rw-r--r--arch/x86_64/mm/numa.c8
3 files changed, 53 insertions, 5 deletions
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 6ded3a50dfe6..b548dea4e5b9 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -185,6 +185,40 @@ unsigned long __init e820_end_of_ram(void)
185} 185}
186 186
187/* 187/*
188 * Compute how much memory is missing in a range.
189 * Unlike the other functions in this file the arguments are in page numbers.
190 */
191unsigned long __init
192e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
193{
194 unsigned long ram = 0;
195 unsigned long start = start_pfn << PAGE_SHIFT;
196 unsigned long end = end_pfn << PAGE_SHIFT;
197 int i;
198 for (i = 0; i < e820.nr_map; i++) {
199 struct e820entry *ei = &e820.map[i];
200 unsigned long last, addr;
201
202 if (ei->type != E820_RAM ||
203 ei->addr+ei->size <= start ||
204 ei->addr >= end)
205 continue;
206
207 addr = round_up(ei->addr, PAGE_SIZE);
208 if (addr < start)
209 addr = start;
210
211 last = round_down(ei->addr + ei->size, PAGE_SIZE);
212 if (last >= end)
213 last = end;
214
215 if (last > addr)
216 ram += last - addr;
217 }
218 return ((end - start) - ram) >> PAGE_SHIFT;
219}
220
221/*
188 * Mark e820 reserved areas as busy for the resource manager. 222 * Mark e820 reserved areas as busy for the resource manager.
189 */ 223 */
190void __init e820_reserve_resources(void) 224void __init e820_reserve_resources(void)
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 72e4b364ed73..aa4a5189ecee 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -322,18 +322,26 @@ void zap_low_mappings(void)
322void __init paging_init(void) 322void __init paging_init(void)
323{ 323{
324 { 324 {
325 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; 325 unsigned long zones_size[MAX_NR_ZONES];
326 unsigned long holes[MAX_NR_ZONES];
326 unsigned int max_dma; 327 unsigned int max_dma;
327 328
329 memset(zones_size, 0, sizeof(zones_size));
330 memset(holes, 0, sizeof(holes));
331
328 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 332 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
329 333
330 if (end_pfn < max_dma) 334 if (end_pfn < max_dma) {
331 zones_size[ZONE_DMA] = end_pfn; 335 zones_size[ZONE_DMA] = end_pfn;
332 else { 336 holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
337 } else {
333 zones_size[ZONE_DMA] = max_dma; 338 zones_size[ZONE_DMA] = max_dma;
339 holes[ZONE_DMA] = e820_hole_size(0, max_dma);
334 zones_size[ZONE_NORMAL] = end_pfn - max_dma; 340 zones_size[ZONE_NORMAL] = end_pfn - max_dma;
341 holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
335 } 342 }
336 free_area_init(zones_size); 343 free_area_init_node(0, NODE_DATA(0), zones_size,
344 __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
337 } 345 }
338 return; 346 return;
339} 347}
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 70cb2904a90f..6a156f5692ae 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -126,9 +126,11 @@ void __init setup_node_zones(int nodeid)
126{ 126{
127 unsigned long start_pfn, end_pfn; 127 unsigned long start_pfn, end_pfn;
128 unsigned long zones[MAX_NR_ZONES]; 128 unsigned long zones[MAX_NR_ZONES];
129 unsigned long holes[MAX_NR_ZONES];
129 unsigned long dma_end_pfn; 130 unsigned long dma_end_pfn;
130 131
131 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 132 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
133 memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
132 134
133 start_pfn = node_start_pfn(nodeid); 135 start_pfn = node_start_pfn(nodeid);
134 end_pfn = node_end_pfn(nodeid); 136 end_pfn = node_end_pfn(nodeid);
@@ -139,13 +141,17 @@ void __init setup_node_zones(int nodeid)
139 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 141 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
140 if (start_pfn < dma_end_pfn) { 142 if (start_pfn < dma_end_pfn) {
141 zones[ZONE_DMA] = dma_end_pfn - start_pfn; 143 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
144 holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
142 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 145 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
146 holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
147
143 } else { 148 } else {
144 zones[ZONE_NORMAL] = end_pfn - start_pfn; 149 zones[ZONE_NORMAL] = end_pfn - start_pfn;
150 holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
145 } 151 }
146 152
147 free_area_init_node(nodeid, NODE_DATA(nodeid), zones, 153 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
148 start_pfn, NULL); 154 start_pfn, holes);
149} 155}
150 156
151void __init numa_init_array(void) 157void __init numa_init_array(void)