From 3fe14ab541cd9b0d1f243afb7556046f12c8743c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:47 +0200 Subject: x86-32, numa: Fix failure condition check in alloc_remap() node_remap_{start|end}_vaddr[] describe [start, end) ranges; however, alloc_remap() incorrectly failed when the current allocation + size equaled the end but it should fail only when it goes over. Fix it. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-2-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index bde3906420d..84aac47c388 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -200,7 +200,7 @@ void *alloc_remap(int nid, unsigned long size) size = ALIGN(size, L1_CACHE_BYTES); - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) return NULL; node_remap_alloc_vaddr[nid] += size; -- cgit v1.2.2 From a6c24f7a705d939ddd2fcaa443fa3d8e852b933d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:48 +0200 Subject: x86-32, numa: Align pgdat size while initializing alloc_remap When pgdat is reserved in init_remap_allocator(), PAGE_SIZE aligned size will be used. Match the size alignment in initialization to avoid allocation failure down the road. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-3-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84aac47c388..50e82507eab 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -287,7 +287,8 @@ static __init unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] = max_pfn; /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); + size = node_remap_size[nid]; + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); /* convert size to large (pmd size) pages, rounding up */ size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; -- cgit v1.2.2 From 5b8443b25c0f323ec190d094e4b441957b02664e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:49 +0200 Subject: x86-32, numa: Remove redundant top-down alloc code from remap initialization memblock_find_in_range() now does top-down allocation by default, so there's no reason for its callers to explicitly implement it by gradually lowering the start address. Remove redundant top-down allocation logic from init_meminit() and calculate_numa_remap_pages(). Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-4-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 50e82507eab..60701a5e0de 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -270,8 +270,7 @@ static __init unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_kva_target; - u64 node_kva_final; + u64 node_kva; /* * The acpi/srat node info can show hot-add memroy zones @@ -295,19 +294,11 @@ static __init unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = memblock_find_in_range(node_kva_target, + node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == MEMBLOCK_ERROR) + ((u64)size)<>PAGE_SHIFT); + size, nid, node_kva >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -328,11 +319,11 @@ static __init unsigned long calculate_numa_remap_pages(void) * to use it as free. * So memblock_x86_reserve_range here, hope we don't run out of that array */ - memblock_x86_reserve_range(node_kva_final, - node_kva_final+(((u64)size)<>PAGE_SHIFT; + node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; } printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); @@ -356,7 +347,6 @@ static void init_remap_allocator(int nid) void __init initmem_init(void) { int nid; - long kva_target_pfn; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -371,15 +361,10 @@ void __init initmem_init(void) kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = memblock_find_in_range(kva_target_pfn<> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - + kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + kva_pages << PAGE_SHIFT, + PTRS_PER_PTE << PAGE_SHIFT) >> PAGE_SHIFT; if (kva_start_pfn == MEMBLOCK_ERROR) panic("Can not get kva space\n"); -- cgit v1.2.2 From 5510db9c1be111528ce46c57f0bec1c9dce258f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:50 +0200 Subject: x86-32, numa: Reorganize calculate_numa_remap_page() Separate the outer node walking loop and per-node logic from calculate_numa_remap_pages(). The outer loop is collapsed into initmem_init() and the per-node logic is moved into a new function - init_alloc_remap(). The new function name is confusing with the existing init_remap_allocator() and the behavior is the function isn't very clean either at this point, but this is to prepare for further cleanups and it will become prettier. This function doesn't introduce any behavior change. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-5-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 125 +++++++++++++++++++++++++------------------------- 1 file changed, 62 insertions(+), 63 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 60701a5e0de..5039e9b21d9 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -264,70 +264,64 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long calculate_numa_remap_pages(void) +static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { - int nid; - unsigned long size, reserve_pages = 0; + unsigned long size; + u64 node_kva; - for_each_online_node(nid) { - u64 node_kva; - - /* - * The acpi/srat node info can show hot-add memroy zones - * where memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid]; - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - ((u64)node_end_pfn[nid])<> PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then max_low_pfn - * but we could have some hole in high memory, and it will only - * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide - * to use it as free. - * So memblock_x86_reserve_range here, hope we don't run out of that array - */ - memblock_x86_reserve_range(node_kva, - node_kva + (((u64)size)<> PAGE_SHIFT; - } - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; + /* + * The acpi/srat node info can show hot-add memroy zones where + * memory could be added but not currently present. + */ + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); + if (node_start_pfn[nid] > max_pfn) + return 0; + if (!node_end_pfn[nid]) + return 0; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. */ + size = node_remap_size[nid]; + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + + node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, + (u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)size << PAGE_SHIFT, + LARGE_PAGE_BYTES); + if (node_kva == MEMBLOCK_ERROR) + panic("Can not get kva ram\n"); + + node_remap_size[nid] = size; + node_remap_offset[nid] = offset; + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva >> PAGE_SHIFT); + + /* + * prevent kva address below max_low_pfn want it on system + * with less memory later. 
+ * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then + * max_low_pfn but we could have some hole in high memory, + * and it will only check page_is_ram(pfn) && + * !page_is_reserved_early(pfn) to decide to use it as free. + * So memblock_x86_reserve_range here, hope we don't run out + * of that array + */ + memblock_x86_reserve_range(node_kva, + node_kva + ((u64)size << PAGE_SHIFT), + "KVA RAM"); + + node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; + + return size; } static void init_remap_allocator(int nid) @@ -346,6 +340,7 @@ static void init_remap_allocator(int nid) void __init initmem_init(void) { + unsigned long reserve_pages = 0; int nid; /* @@ -359,7 +354,11 @@ void __init initmem_init(void) get_memcfg_numa(); numa_init_array(); - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); + for_each_online_node(nid) + reserve_pages += init_alloc_remap(nid, reserve_pages); + kva_pages = roundup(reserve_pages, PTRS_PER_PTE); + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", + reserve_pages); kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT, -- cgit v1.2.2 From c4d4f577d49c441ab4f1bb6068247dafb366e635 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:51 +0200 Subject: x86-32, numa: Rename @node_kva to @node_pa in init_alloc_remap() init_alloc_remap() is about to do more and using _kva suffix for physical address becomes confusing because the function will be handling both physical and virtual addresses. Rename @node_kva to @node_pa. This is trivial rename and doesn't cause any behavior difference. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-6-git-send-email-tj@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 5039e9b21d9..30933fec8f7 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -267,7 +267,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { unsigned long size; - u64 node_kva; + u64 node_pa; /* * The acpi/srat node info can show hot-add memroy zones where @@ -291,17 +291,17 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - (u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)size << PAGE_SHIFT, - LARGE_PAGE_BYTES); - if (node_kva == MEMBLOCK_ERROR) + node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, + (u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)size << PAGE_SHIFT, + LARGE_PAGE_BYTES); + if (node_pa == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); node_remap_size[nid] = size; node_remap_offset[nid] = offset; printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_kva >> PAGE_SHIFT); + size, nid, node_pa >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -315,11 +315,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) * So memblock_x86_reserve_range here, hope we don't run out * of that array */ - memblock_x86_reserve_range(node_kva, - node_kva + ((u64)size << PAGE_SHIFT), + memblock_x86_reserve_range(node_pa, node_pa + ((u64)size << PAGE_SHIFT), "KVA RAM"); - node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; return size; } -- cgit v1.2.2 From af7c1a6e8374e05aab4a98ce4d2fb07b66506a02 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:52 +0200 Subject: x86-32, numa: Make @size in init_aloc_remap() represent bytes @size variable in init_alloc_remap() is confusing in that it starts as number of bytes as its name implies and then becomes number of pages. Make it consistently represent bytes. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-7-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 30933fec8f7..99310d26fe3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -286,22 +286,19 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size = node_remap_size[nid]; size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; + /* align to large page */ + size = ALIGN(size, LARGE_PAGE_BYTES); node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)size << PAGE_SHIFT, - LARGE_PAGE_BYTES); + size, LARGE_PAGE_BYTES); if (node_pa == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); - node_remap_size[nid] = size; + node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_pa >> PAGE_SHIFT); + size >> PAGE_SHIFT, nid, node_pa >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -315,12 +312,11 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) * So memblock_x86_reserve_range here, hope we don't run out * of that array */ - memblock_x86_reserve_range(node_pa, node_pa + ((u64)size << PAGE_SHIFT), - "KVA RAM"); + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - return size; + return size >> PAGE_SHIFT; } static void init_remap_allocator(int nid) -- cgit v1.2.2 From 7210cf9217937e470a9acbc113a590f476b9c047 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:53 +0200 Subject: x86-32, numa: Calculate remap size in common code Only pgdat and memmap use remap area and there isn't much benefit in allowing per-node override. In addition, the use of node_remap_size[] is confusing in that it contains number of bytes before remap initialization and then number of pages afterwards. Move remap size calculation for memap from specific NUMA config implementations to init_alloc_remap() and make node_remap_size[] static. The only behavior difference is that, before this patch, numaq_32 didn't consider max_pfn when calculating the memmap size but it's enforced after this patch, which is the right thing to do. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-8-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 10 ++++------ arch/x86/mm/srat_32.c | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 99310d26fe3..9a7336550f0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,7 +104,7 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; +static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -129,7 +129,6 @@ int __init get_memcfg_numa_flat(void) node_end_pfn[0] = max_pfn; memblock_x86_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); @@ -282,11 +281,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid]; + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, node_start_pfn[nid], + min(node_end_pfn[nid], max_pfn)); size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - /* align to large page */ size = ALIGN(size, LARGE_PAGE_BYTES); node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 48651c6f657..1b9e82c96dc 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -276,7 +276,6 @@ int __init get_memcfg_from_srat(void) unsigned long end = min(node_end_pfn[nid], max_pfn); memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); } return 1; out_fail: -- cgit v1.2.2 From 82044c328d6f6b22882c2a936e487e6d2240817a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:54 +0200 Subject: x86-32, numa: Make init_alloc_remap() less panicky Remap allocator failure isn't fatal. The callers are required to fall back to regular early memory allocation mechanisms on failure anyway, so there's no reason to panic on remap init failure. Whining and returning are enough. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-9-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 9a7336550f0..c127543372f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -290,8 +290,11 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, size, LARGE_PAGE_BYTES); - if (node_pa == MEMBLOCK_ERROR) - panic("Can not get kva ram\n"); + if (node_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", + size, nid); + return 0; + } node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; -- cgit v1.2.2 From 0e9f93c1c04c8ab10cc564df54a7ad0f83c67796 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:55 +0200 Subject: x86-32, numa: Move lowmem address space reservation to init_alloc_remap() Remap alloc init is done in the following stages. 1. init_alloc_remap() calculates how much memory is necessary for each node and reserves node local memory. 2. initmem_init() collects how much each node needs and reserves a single contiguous lowmem area which can contain all. 3. init_remap_allocator() initializes allocator parameters from the determined lowmem address and per-node offsets. 4. Actual remap happens. There is no reason for the lowmem remap area to be reserved as a single contiguous area at one go. They don't interact with each other and the memblock allocator will put them side-by-side anyway. This patch breaks up the single lowmem address reservation and put per-node lowmem address reservation into init_alloc_remap() and initializes allocator parameters directly in the function as all the addresses are determined there. This merges steps 2 and 3 into 1. While at it, remove now largely irrelevant comments in init_alloc_remap(). This change causes the following behavior changes. * Remap lowmem areas are allocated in smaller per-node chunks. * Remap lowmem area reservation failure fail future remap allocations instead of panicking. * Remap allocator initialization is less verbose. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-10-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 82 ++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 57 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index c127543372f..12bb34c434e 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -108,9 +108,6 @@ static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static unsigned long kva_start_pfn; -static unsigned long kva_pages; - int __cpuinit numa_cpu_node(int cpu) { return apic->x86_32_numa_cpu_node(cpu); @@ -266,7 +263,8 @@ void resume_map_numa_kva(pgd_t *pgd_base) static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { unsigned long size; - u64 node_pa; + u64 node_pa, remap_pa; + void *remap_va; /* * The acpi/srat node info can show hot-add memroy zones where @@ -287,6 +285,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); size = ALIGN(size, LARGE_PAGE_BYTES); + /* allocate node memory and the lowmem remap area */ node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, size, LARGE_PAGE_BYTES); @@ -295,45 +294,35 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size, nid); return 0; } + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + + remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (remap_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", + size, nid); + memblock_x86_free_range(node_pa, node_pa + size); + return 0; + } + memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + remap_va = phys_to_virt(remap_pa); + /* initialize remap allocator parameters */ + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size >> PAGE_SHIFT, nid, node_pa >> PAGE_SHIFT); - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then - * max_low_pfn but we could have some hole in high memory, - * and it will only check page_is_ram(pfn) && - * !page_is_reserved_early(pfn) to decide to use it as free. 
- * So memblock_x86_reserve_range here, hope we don't run out - * of that array - */ - memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + node_remap_start_vaddr[nid] = remap_va; + node_remap_end_vaddr[nid] = remap_va + size; + node_remap_alloc_vaddr[nid] = remap_va + ALIGN(sizeof(pg_data_t), PAGE_SIZE); - node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; + printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", + nid, node_pa, node_pa + size, remap_va, remap_va + size); return size >> PAGE_SHIFT; } -static void init_remap_allocator(int nid) -{ - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) node_remap_end_vaddr[nid]); -} - void __init initmem_init(void) { unsigned long reserve_pages = 0; @@ -352,25 +341,7 @@ void __init initmem_init(void) for_each_online_node(nid) reserve_pages += init_alloc_remap(nid, reserve_pages); - kva_pages = roundup(reserve_pages, PTRS_PER_PTE); - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - - kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - kva_pages << PAGE_SHIFT, - PTRS_PER_PTE << PAGE_SHIFT) >> PAGE_SHIFT; - if (kva_start_pfn == MEMBLOCK_ERROR) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); - - /* avoid clash with initrd */ - memblock_x86_reserve_range(kva_start_pfn< max_low_pfn) @@ -390,11 +361,8 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - + for_each_online_node(nid) allocate_pgdat(nid); - } remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", -- cgit v1.2.2 From 2a286344f06d6341740b284494379373e87648f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:56 +0200 Subject: x86-32, numa: Move remapping for remap allocator into init_alloc_remap() There's no reason to perform the actual remapping separately. Collapse remap_numa_kva() into init_alloc_remap() and, while at it, make it less verbose. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-11-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 12bb34c434e..53ec13a17b9 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -205,26 +205,6 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn<> PAGE_SHIFT; pfn += PTRS_PER_PTE) + set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), + (node_pa >> PAGE_SHIFT) + pfn, + PAGE_KERNEL_LARGE); + /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; @@ -363,7 +349,6 @@ void __init initmem_init(void) (ulong) pfn_to_kaddr(max_low_pfn)); for_each_online_node(nid) allocate_pgdat(nid); - remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); -- cgit v1.2.2 From b2e3e4fa3eee752b893687783f2a427106c93423 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:57 +0200 Subject: x86-32, numa: Make pgdat allocation use alloc_remap() pgdat allocation is handled differnetly from other remap allocations - it's reserved during initialization. There's no reason to handle this any differnetly. Remap allocator is initialized for every node and if init failed the allocation will fail and pgdat allocation can fall back to generic code like anyone else. Remove special init-time pgdat reservation and make allocate_pgdat() use alloc_remap() like everyone else. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-12-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 53ec13a17b9..0184a9f5a34 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -160,9 +160,8 @@ static void __init allocate_pgdat(int nid) { char buf[16]; - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { + NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE)); + if (!NODE_DATA(nid)) { unsigned long pgdat_phys; pgdat_phys = memblock_find_in_range(min_low_pfn< [%p-%p)\n", nid, node_pa, node_pa + size, remap_va, remap_va + size); -- cgit v1.2.2 From 1d85b61baf0334dd6bb88261bec42b808204d694 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:58 +0200 Subject: x86-32, numa: Remove now useless node_remap_offset[] With lowmem address reservation moved into init_alloc_remap(), node_remap_offset[] is no longer useful. Remove it and related offset handling code. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-13-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 0184a9f5a34..960ea7bc0ac 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -187,7 +187,6 @@ static void __init allocate_pgdat(int nid) static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; void *alloc_remap(int nid, unsigned long size) { @@ -239,7 +238,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long init_alloc_remap(int nid, unsigned long offset) +static __init void init_alloc_remap(int nid) { unsigned long size, pfn; u64 node_pa, remap_pa; @@ -252,9 +251,9 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) - return 0; + return; if (!node_end_pfn[nid]) - return 0; + return; if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; @@ -271,7 +270,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) if (node_pa == MEMBLOCK_ERROR) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); - return 0; + return; } memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); @@ -282,7 +281,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", size, nid); memblock_x86_free_range(node_pa, node_pa + size); - return 0; + return; } memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); remap_va = phys_to_virt(remap_pa); @@ -296,7 +295,6 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; - node_remap_offset[nid] = offset; node_remap_start_vaddr[nid] = remap_va; node_remap_end_vaddr[nid] = remap_va + size; @@ -304,13 +302,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", nid, node_pa, node_pa + size, remap_va, remap_va + size); - - return size >> PAGE_SHIFT; } void __init initmem_init(void) { - unsigned long reserve_pages = 0; int nid; /* @@ -325,7 +320,7 @@ void __init initmem_init(void) numa_init_array(); for_each_online_node(nid) - reserve_pages += init_alloc_remap(nid, reserve_pages); + init_alloc_remap(nid); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; -- cgit v1.2.2 From 198bd06bbfde2984027e91f64c55eb19a7034a27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:59 +0200 Subject: x86-32, numa: Remove redundant node_remap_size[] Remap area size can be determined from node_remap_start_vaddr[] and node_remap_end_vaddr[] making node_remap_size[] redundant. Remove it. While at it, make resume_map_numa_kva() use @nr_pages for number of pages instead of @size. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-14-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 960ea7bc0ac..f325e6fab75 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,7 +104,6 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -214,15 +213,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) int node; for_each_online_node(node) { - unsigned long start_va, start_pfn, size, pfn; + unsigned long start_va, start_pfn, nr_pages, pfn; start_va = (unsigned long)node_remap_start_vaddr[node]; start_pfn = node_remap_start_pfn[node]; - size = node_remap_size[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; printk(KERN_DEBUG "%s: node %d\n", __func__, node); - for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); pgd_t *pgd = pgd_base + pgd_index(vaddr); pud_t *pud = pud_offset(pgd, vaddr); @@ -294,8 +294,6 @@ static __init void init_alloc_remap(int nid) /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_size[nid] = size >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; node_remap_end_vaddr[nid] = remap_va + size; node_remap_alloc_vaddr[nid] = remap_va; -- cgit v1.2.2 From 993ba1585cbb03fab012e41d1a5d24330a283b31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:24:00 +0200 Subject: x86-32, numa: Update remap allocator comments Now that remap allocator is cleaned up, update comments such that they are in docbook function description format and reflect the actual implementation. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-15-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 56 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f325e6fab75..c757c0a3b52 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -176,17 +176,31 @@ static void __init allocate_pgdat(int nid) } /* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. + * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). 
+ * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. + */ void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; @@ -238,6 +252,28 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif +/** + * init_alloc_remap - Initialize remap allocator for a NUMA node + * @nid: NUMA node to initizlie remap allocator for + * + * NUMA nodes may end up without any lowmem. As allocating pgdat and + * memmap on a different node with lowmem is inefficient, a special + * remap allocator is implemented which can be used by alloc_remap(). + * + * For each node, the amount of memory which will be necessary for + * pgdat and memmap is calculated and two memory areas of the size are + * allocated - one in the node and the other in lowmem; then, the area + * in the node is remapped to the lowmem area. + * + * As pgdat and memmap must be allocated in lowmem anyway, this + * doesn't waste lowmem address space; however, the actual lowmem + * which gets remapped over is wasted. The amount shouldn't be + * problematic on machines this feature will be used. + * + * Initialization failure isn't fatal. alloc_remap() is used + * opportunistically and the callers will fall back to other memory + * allocation mechanisms on failure. + */ static __init void init_alloc_remap(int nid) { unsigned long size, pfn; @@ -306,14 +342,6 @@ void __init initmem_init(void) { int nid; - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundary between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ - get_memcfg_numa(); numa_init_array(); -- cgit v1.2.2