author    Tejun Heo <tj@kernel.org>  2011-05-02 08:08:43 -0400
committer Tejun Heo <tj@kernel.org>  2011-05-02 08:08:47 -0400
commit    aff364860aa105b2deacc6f21ec8ef524460e3fc (patch)
tree      18409ebe16b25b141598da9b6386d69416c06afa /arch/x86/mm
parent    c7a7b814c9dca9ee01b38e63b4a46de87156d3b6 (diff)
parent    993ba1585cbb03fab012e41d1a5d24330a283b31 (diff)
Merge branch 'x86/numa' into x86-mm
Merge reason: Pick up x86-32 remap allocator cleanup changes - 14
commits, 3fe14ab541^..993ba1585c.
3fe14ab541: x86-32, numa: Fix failure condition check in alloc_remap()
993ba1585c: x86-32, numa: Update remap allocator comments
Scheduled NUMA init 32/64bit unification changes depend on them.
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/numa_32.c | 268
-rw-r--r--  arch/x86/mm/srat_32.c |   1
2 files changed, 111 insertions, 158 deletions
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420df..c757c0a3b529 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -104,13 +104,9 @@ extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
 
-unsigned long node_remap_size[MAX_NUMNODES];
 static void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-
 int __cpuinit numa_cpu_node(int cpu)
 {
         return apic->x86_32_numa_cpu_node(cpu);
@@ -129,7 +125,6 @@ int __init get_memcfg_numa_flat(void)
         node_end_pfn[0] = max_pfn;
         memblock_x86_register_active_regions(0, 0, max_pfn);
         memory_present(0, 0, max_pfn);
-        node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
         /* Indicate there is one node available. */
         nodes_clear(node_online_map);
@@ -164,9 +159,8 @@ static void __init allocate_pgdat(int nid)
 {
         char buf[16];
 
-        if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
-                NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
-        else {
+        NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE));
+        if (!NODE_DATA(nid)) {
                 unsigned long pgdat_phys;
                 pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
                                         max_pfn_mapped<<PAGE_SHIFT,
@@ -182,25 +176,38 @@ static void __init allocate_pgdat(int nid)
 }
 
 /*
- * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
- * virtual address space (KVA) is reserved and portions of nodes are mapped
- * using it. This is to allow node-local memory to be allocated for
- * structures that would normally require ZONE_NORMAL. The memory is
- * allocated with alloc_remap() and callers should be prepared to allocate
- * from the bootmem allocator instead.
+ * Remap memory allocator
  */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
 static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
 
+/**
+ * alloc_remap - Allocate remapped memory
+ * @nid: NUMA node to allocate memory from
+ * @size: The size of allocation
+ *
+ * Allocate @size bytes from the remap area of NUMA node @nid.  The
+ * size of the remap area is predetermined by init_alloc_remap() and
+ * only the callers considered there should call this function.  For
+ * more info, please read the comment on top of init_alloc_remap().
+ *
+ * The caller must be ready to handle allocation failure from this
+ * function and fall back to regular memory allocator in such cases.
+ *
+ * CONTEXT:
+ * Single CPU early boot context.
+ *
+ * RETURNS:
+ * Pointer to the allocated memory on success, %NULL on failure.
+ */
 void *alloc_remap(int nid, unsigned long size)
 {
         void *allocation = node_remap_alloc_vaddr[nid];
 
         size = ALIGN(size, L1_CACHE_BYTES);
 
-        if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+        if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
                 return NULL;
 
         node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +216,6 @@ void *alloc_remap(int nid, unsigned long size)
         return allocation;
 }
 
-static void __init remap_numa_kva(void)
-{
-        void *vaddr;
-        unsigned long pfn;
-        int node;
-
-        for_each_online_node(node) {
-                printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
-                for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
-                        vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
-                        printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
-                                (unsigned long)vaddr,
-                                node_remap_start_pfn[node] + pfn);
-                        set_pmd_pfn((ulong) vaddr,
-                                node_remap_start_pfn[node] + pfn,
-                                PAGE_KERNEL_LARGE);
-                }
-        }
-}
-
 #ifdef CONFIG_HIBERNATION
 /**
  * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +227,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
         int node;
 
         for_each_online_node(node) {
-                unsigned long start_va, start_pfn, size, pfn;
+                unsigned long start_va, start_pfn, nr_pages, pfn;
 
                 start_va = (unsigned long)node_remap_start_vaddr[node];
                 start_pfn = node_remap_start_pfn[node];
-                size = node_remap_size[node];
+                nr_pages = (node_remap_end_vaddr[node] -
+                            node_remap_start_vaddr[node]) >> PAGE_SHIFT;
 
                 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
 
-                for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+                for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
                         unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
                         pgd_t *pgd = pgd_base + pgd_index(vaddr);
                         pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +252,102 @@ void resume_map_numa_kva(pgd_t *pgd_base)
 }
 #endif
 
-static __init unsigned long calculate_numa_remap_pages(void)
+/**
+ * init_alloc_remap - Initialize remap allocator for a NUMA node
+ * @nid: NUMA node to initizlie remap allocator for
+ *
+ * NUMA nodes may end up without any lowmem.  As allocating pgdat and
+ * memmap on a different node with lowmem is inefficient, a special
+ * remap allocator is implemented which can be used by alloc_remap().
+ *
+ * For each node, the amount of memory which will be necessary for
+ * pgdat and memmap is calculated and two memory areas of the size are
+ * allocated - one in the node and the other in lowmem; then, the area
+ * in the node is remapped to the lowmem area.
+ *
+ * As pgdat and memmap must be allocated in lowmem anyway, this
+ * doesn't waste lowmem address space; however, the actual lowmem
+ * which gets remapped over is wasted.  The amount shouldn't be
+ * problematic on machines this feature will be used.
+ *
+ * Initialization failure isn't fatal.  alloc_remap() is used
+ * opportunistically and the callers will fall back to other memory
+ * allocation mechanisms on failure.
+ */
+static __init void init_alloc_remap(int nid)
 {
-        int nid;
-        unsigned long size, reserve_pages = 0;
+        unsigned long size, pfn;
+        u64 node_pa, remap_pa;
+        void *remap_va;
 
-        for_each_online_node(nid) {
-                u64 node_kva_target;
-                u64 node_kva_final;
-
-                /*
-                 * The acpi/srat node info can show hot-add memroy zones
-                 * where memory could be added but not currently present.
-                 */
-                printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
-                        nid, node_start_pfn[nid], node_end_pfn[nid]);
-                if (node_start_pfn[nid] > max_pfn)
-                        continue;
-                if (!node_end_pfn[nid])
-                        continue;
-                if (node_end_pfn[nid] > max_pfn)
-                        node_end_pfn[nid] = max_pfn;
-
-                /* ensure the remap includes space for the pgdat. */
-                size = node_remap_size[nid] + sizeof(pg_data_t);
-
-                /* convert size to large (pmd size) pages, rounding up */
-                size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
-                /* now the roundup is correct, convert to PAGE_SIZE pages */
-                size = size * PTRS_PER_PTE;
-
-                node_kva_target = round_down(node_end_pfn[nid] - size,
-                                             PTRS_PER_PTE);
-                node_kva_target <<= PAGE_SHIFT;
-                do {
-                        node_kva_final = memblock_find_in_range(node_kva_target,
-                                        ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
-                                        ((u64)size)<<PAGE_SHIFT,
-                                        LARGE_PAGE_BYTES);
-                        node_kva_target -= LARGE_PAGE_BYTES;
-                } while (node_kva_final == MEMBLOCK_ERROR &&
-                         (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
-
-                if (node_kva_final == MEMBLOCK_ERROR)
-                        panic("Can not get kva ram\n");
-
-                node_remap_size[nid] = size;
-                node_remap_offset[nid] = reserve_pages;
-                reserve_pages += size;
-                printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
-                       " node %d at %llx\n",
-                       size, nid, node_kva_final>>PAGE_SHIFT);
-
-                /*
-                 * prevent kva address below max_low_pfn want it on system
-                 * with less memory later.
-                 * layout will be: KVA address , KVA RAM
-                 *
-                 * we are supposed to only record the one less then max_low_pfn
-                 * but we could have some hole in high memory, and it will only
-                 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
-                 * to use it as free.
-                 * So memblock_x86_reserve_range here, hope we don't run out of that array
-                 */
-                memblock_x86_reserve_range(node_kva_final,
-                                node_kva_final+(((u64)size)<<PAGE_SHIFT),
-                                "KVA RAM");
-
-                node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
-        }
-        printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
-               reserve_pages);
-        return reserve_pages;
-}
+        /*
+         * The acpi/srat node info can show hot-add memroy zones where
+         * memory could be added but not currently present.
+         */
+        printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+               nid, node_start_pfn[nid], node_end_pfn[nid]);
+        if (node_start_pfn[nid] > max_pfn)
+                return;
+        if (!node_end_pfn[nid])
+                return;
+        if (node_end_pfn[nid] > max_pfn)
+                node_end_pfn[nid] = max_pfn;
 
-static void init_remap_allocator(int nid)
-{
-        node_remap_start_vaddr[nid] = pfn_to_kaddr(
-                        kva_start_pfn + node_remap_offset[nid]);
-        node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-                (node_remap_size[nid] * PAGE_SIZE);
-        node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-                ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
-        printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
-                (ulong) node_remap_start_vaddr[nid],
-                (ulong) node_remap_end_vaddr[nid]);
+        /* calculate the necessary space aligned to large page size */
+        size = node_memmap_size_bytes(nid, node_start_pfn[nid],
+                                      min(node_end_pfn[nid], max_pfn));
+        size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+        size = ALIGN(size, LARGE_PAGE_BYTES);
+
+        /* allocate node memory and the lowmem remap area */
+        node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT,
+                                         (u64)node_end_pfn[nid] << PAGE_SHIFT,
+                                         size, LARGE_PAGE_BYTES);
+        if (node_pa == MEMBLOCK_ERROR) {
+                pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
+                           size, nid);
+                return;
+        }
+        memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+
+        remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
+                                          max_low_pfn << PAGE_SHIFT,
+                                          size, LARGE_PAGE_BYTES);
+        if (remap_pa == MEMBLOCK_ERROR) {
+                pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
+                           size, nid);
+                memblock_x86_free_range(node_pa, node_pa + size);
+                return;
+        }
+        memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+        remap_va = phys_to_virt(remap_pa);
+
+        /* perform actual remap */
+        for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
+                set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
+                            (node_pa >> PAGE_SHIFT) + pfn,
+                            PAGE_KERNEL_LARGE);
+
+        /* initialize remap allocator parameters */
+        node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
+        node_remap_start_vaddr[nid] = remap_va;
+        node_remap_end_vaddr[nid] = remap_va + size;
+        node_remap_alloc_vaddr[nid] = remap_va;
+
+        printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
+               nid, node_pa, node_pa + size, remap_va, remap_va + size);
 }
 
 void __init initmem_init(void)
 {
         int nid;
-        long kva_target_pfn;
-
-        /*
-         * When mapping a NUMA machine we allocate the node_mem_map arrays
-         * from node local memory. They are then mapped directly into KVA
-         * between zone normal and vmalloc space. Calculate the size of
-         * this space and use it to adjust the boundary between ZONE_NORMAL
-         * and ZONE_HIGHMEM.
-         */
 
         get_memcfg_numa();
         numa_init_array();
 
-        kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
-
-        kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
-        do {
-                kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
-                                        max_low_pfn<<PAGE_SHIFT,
-                                        kva_pages<<PAGE_SHIFT,
-                                        PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
-                kva_target_pfn -= PTRS_PER_PTE;
-        } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
-
-        if (kva_start_pfn == MEMBLOCK_ERROR)
-                panic("Can not get kva space\n");
-
-        printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
-               kva_start_pfn, max_low_pfn);
-        printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+        for_each_online_node(nid)
+                init_alloc_remap(nid);
 
-        /* avoid clash with initrd */
-        memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
-                                (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
-                                "KVA PG");
 #ifdef CONFIG_HIGHMEM
         highstart_pfn = highend_pfn = max_pfn;
         if (max_pfn > max_low_pfn)
@@ -409,12 +367,8 @@ void __init initmem_init(void)
 
         printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
                         (ulong) pfn_to_kaddr(max_low_pfn));
-        for_each_online_node(nid) {
-                init_remap_allocator(nid);
-
+        for_each_online_node(nid)
                 allocate_pgdat(nid);
-        }
-        remap_numa_kva();
 
         printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
                         (ulong) pfn_to_kaddr(highstart_pfn));
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 48651c6f657d..1b9e82c96dc5 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -276,7 +276,6 @@ int __init get_memcfg_from_srat(void)
                 unsigned long end = min(node_end_pfn[nid], max_pfn);
 
                 memory_present(nid, start, end);
-                node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
         }
         return 1;
 out_fail:
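
For context on the new interface: the kernel-doc added for alloc_remap() above says callers must treat it as opportunistic and fall back to another early allocator when it returns NULL, which is what the reworked allocate_pgdat() does. Below is a minimal standalone sketch of that caller-side pattern; it is not part of this commit, the helper name node_alloc_early is hypothetical, and the memblock-based fallback simply mirrors the one allocate_pgdat() uses.

/*
 * Illustration only (not in this commit): try the node's remap area
 * first, then fall back to a plain lowmem memblock allocation.
 */
static void * __init node_alloc_early(int nid, unsigned long size)
{
        void *ptr;
        u64 phys;

        /* Opportunistic: NULL if init_alloc_remap() failed for this
         * node or the node's remap area is exhausted. */
        ptr = alloc_remap(nid, size);
        if (ptr)
                return ptr;

        /* Fallback path, same style as allocate_pgdat() above. */
        phys = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
                                      max_pfn_mapped << PAGE_SHIFT,
                                      size, PAGE_SIZE);
        if (phys == MEMBLOCK_ERROR)
                return NULL;
        memblock_x86_reserve_range(phys, phys + size, "EXAMPLE");
        return phys_to_virt(phys);
}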