From e123dd3f0ec1664576456ea1ea045591a0a95f0c Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sun, 13 Apr 2008 11:51:06 -0700
Subject: mm: make mem_map allocation continuous

vmemmap allocation currently has this layout:

 [ffffe20000000000-ffffe200001fffff] PMD ->ffff810001400000 on node 0
 [ffffe20000200000-ffffe200003fffff] PMD ->ffff810001800000 on node 0
 [ffffe20000400000-ffffe200005fffff] PMD ->ffff810001c00000 on node 0
 [ffffe20000600000-ffffe200007fffff] PMD ->ffff810002000000 on node 0
 [ffffe20000800000-ffffe200009fffff] PMD ->ffff810002400000 on node 0
 ...

Note the 2M hole between the backing allocations - not optimal.

The root cause is that the usemap (24 bytes) is allocated right after each
2M mem_map, which pushes the next vmemmap allocation (2M, 2M-aligned) up to
the next 2M boundary.

Solution: allocate all the usemaps in a first pass and all the mem_maps in a
second pass, so the mem_maps end up back to back.

After the patch we get:

 [ffffe20000000000-ffffe200001fffff] PMD ->ffff810001400000 on node 0
 [ffffe20000200000-ffffe200003fffff] PMD ->ffff810001600000 on node 0
 [ffffe20000400000-ffffe200005fffff] PMD ->ffff810001800000 on node 0
 [ffffe20000600000-ffffe200007fffff] PMD ->ffff810001a00000 on node 0
 [ffffe20000800000-ffffe200009fffff] PMD ->ffff810001c00000 on node 0
 ...

which is the ideal layout.

The usemaps now share pages as well, because they too are allocated
contiguously:

 sparse_early_usemap_alloc: usemap = ffff810024e00000 size = 24
 sparse_early_usemap_alloc: usemap = ffff810024e00080 size = 24
 sparse_early_usemap_alloc: usemap = ffff810024e00100 size = 24
 sparse_early_usemap_alloc: usemap = ffff810024e00180 size = 24
 ...

So the bootmem allocations become more compact and less memory is used for
the usemaps => mission accomplished ;-)

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 mm/sparse.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

(limited to 'mm/sparse.c')

diff --git a/mm/sparse.c b/mm/sparse.c
index 98d6b39c3472..458109b99e61 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -304,22 +304,48 @@ void __init sparse_init(void)
 	unsigned long pnum;
 	struct page *map;
 	unsigned long *usemap;
+	unsigned long **usemap_map;
+	int size;
+
+	/*
+	 * map is using big page (aka 2M in x86 64 bit)
+	 * usemap is less one page (aka 24 bytes)
+	 * so alloc 2M (with 2M align) and 24 bytes in turn will
+	 * make next 2M slip to one more 2M later.
+	 * then in big system, the memory will have a lot of holes...
+	 * here try to allocate 2M pages continously.
+	 *
+	 * powerpc need to call sparse_init_one_section right after each
+	 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
+	 */
+	size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
+	usemap_map = alloc_bootmem(size);
+	if (!usemap_map)
+		panic("can not allocate usemap_map\n");
 
 	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
 		if (!present_section_nr(pnum))
 			continue;
+		usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
+	}
 
-		map = sparse_early_mem_map_alloc(pnum);
-		if (!map)
+	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+		if (!present_section_nr(pnum))
 			continue;
 
-		usemap = sparse_early_usemap_alloc(pnum);
+		usemap = usemap_map[pnum];
 		if (!usemap)
 			continue;
 
+		map = sparse_early_mem_map_alloc(pnum);
+		if (!map)
+			continue;
+
 		sparse_init_one_section(__nr_to_section(pnum), pnum, map,
 								usemap);
 	}
+
+	free_bootmem(__pa(usemap_map), size);
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-- cgit v1.2.2
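To see the 2M slip concretely, here is a small userspace model of a
bootmem-style bump allocator. bump_alloc(), the sizes and the three-section
loop are illustrative only, not kernel code. Interleaving a 24-byte usemap
after every 2M-aligned, 2M-sized mem_map wastes almost 2M per section;
doing all the small allocations first keeps the mem_maps back to back,
which is what the two-pass sparse_init() above achieves:

#include <stdio.h>

#define MB (1UL << 20)

static unsigned long cursor;	/* next free offset in the model "bootmem" */

static unsigned long bump_alloc(unsigned long size, unsigned long align)
{
	unsigned long addr = (cursor + align - 1) & ~(align - 1);

	cursor = addr + size;
	return addr;
}

int main(void)
{
	int i;

	/* Interleaved, as before the patch: 2M mem_map, then 24 B usemap. */
	cursor = 0;
	for (i = 0; i < 3; i++) {
		unsigned long map = bump_alloc(2 * MB, 2 * MB);
		unsigned long use = bump_alloc(24, 8);

		printf("interleaved: mem_map=%#9lx usemap=%#9lx\n", map, use);
	}
	printf("interleaved footprint: %lu bytes\n\n", cursor);

	/* Batched, as after the patch: all usemaps first, then all mem_maps. */
	cursor = 0;
	for (i = 0; i < 3; i++)
		bump_alloc(24, 8);
	for (i = 0; i < 3; i++)
		printf("batched:     mem_map=%#9lx\n", bump_alloc(2 * MB, 2 * MB));
	printf("batched footprint:     %lu bytes\n", cursor);
	return 0;
}

With three sections the interleaved variant ends just past 10M while the
batched one stays at 8M; the gap grows by roughly 2M for every additional
present section.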
From c2b91e2eec9678dbda274e906cc32ea8f711da3b Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sat, 12 Apr 2008 01:19:24 -0700
Subject: x86_64/mm: check and print vmemmap allocation continuous

On big systems with lots of memory, don't print so much during bootup, and
make it easy to see whether the vmemmap allocation is continuous.

On a 256G, 8-socket system we now get:

 [ffffe20000000000-ffffe20002bfffff] PMD -> [ffff810001400000-ffff810003ffffff] on node 0
 [ffffe2001c700000-ffffe2001c7fffff] potential offnode page_structs
 [ffffe20002c00000-ffffe2001c7fffff] PMD -> [ffff81000c000000-ffff8100255fffff] on node 0
 [ffffe20038700000-ffffe200387fffff] potential offnode page_structs
 [ffffe2001c800000-ffffe200387fffff] PMD -> [ffff810820200000-ffff81083c1fffff] on node 1
 [ffffe20040000000-ffffe2007fffffff] PUD ->ffff811027a00000 on node 2
 [ffffe20038800000-ffffe2003fffffff] PMD -> [ffff811020200000-ffff8110279fffff] on node 2
 [ffffe20054700000-ffffe200547fffff] potential offnode page_structs
 [ffffe20040000000-ffffe200547fffff] PMD -> [ffff811027c00000-ffff81103c3fffff] on node 2
 [ffffe20070700000-ffffe200707fffff] potential offnode page_structs
 [ffffe20054800000-ffffe200707fffff] PMD -> [ffff811820200000-ffff81183c1fffff] on node 3
 [ffffe20080000000-ffffe200bfffffff] PUD ->ffff81202fa00000 on node 4
 [ffffe20070800000-ffffe2007fffffff] PMD -> [ffff812020200000-ffff81202f9fffff] on node 4
 [ffffe2008c700000-ffffe2008c7fffff] potential offnode page_structs
 [ffffe20080000000-ffffe2008c7fffff] PMD -> [ffff81202fc00000-ffff81203c3fffff] on node 4
 [ffffe200a8700000-ffffe200a87fffff] potential offnode page_structs
 [ffffe2008c800000-ffffe200a87fffff] PMD -> [ffff812820200000-ffff81283c1fffff] on node 5
 [ffffe200c0000000-ffffe200ffffffff] PUD ->ffff813037a00000 on node 6
 [ffffe200a8800000-ffffe200bfffffff] PMD -> [ffff813020200000-ffff8130379fffff] on node 6
 [ffffe200c4700000-ffffe200c47fffff] potential offnode page_structs
 [ffffe200c0000000-ffffe200c47fffff] PMD -> [ffff813037c00000-ffff81303c3fffff] on node 6
 [ffffe200c4800000-ffffe200e07fffff] PMD -> [ffff813820200000-ffff81383c1fffff] on node 7

instead of a very long print out...

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 mm/sparse.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm/sparse.c')

diff --git a/mm/sparse.c b/mm/sparse.c
index 458109b99e61..7e9191381f86 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -295,6 +295,9 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
 	return NULL;
 }
 
+void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
+{
+}
 /*
  * Allocate the accumulated non-linear sections, allocate a mem_map
  * for each and record the physical to section mapping.
@@ -345,6 +348,8 @@ void __init sparse_init(void)
 							usemap);
 	}
 
+	vmemmap_populate_print_last();
+
 	free_bootmem(__pa(usemap_map), size);
 }
-- cgit v1.2.2
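What makes the compact printout above possible is range coalescing: the arch
code that populates the vmemmap remembers the contiguous physical range it is
currently extending and only prints a line when the next mapping does not
continue it. The weak vmemmap_populate_print_last() added here gives
sparse_init() a hook to flush the last pending range once all sections are
populated. A rough, self-contained sketch of that pattern - the function and
variable names are illustrative, not the actual x86_64 code:

#include <stdio.h>

#define PMD_SIZE (2UL << 20)

static unsigned long v_start, p_start, p_end;	/* range being merged */

static void flush_range(void)
{
	if (p_start)
		printf("[%#lx-%#lx] PMD -> [%#lx-%#lx]\n",
		       v_start, v_start + (p_end - p_start) - 1,
		       p_start, p_end - 1);
	p_start = p_end = 0;
}

/* Called for each 2M vmemmap mapping as it is created. */
static void note_pmd_mapping(unsigned long vaddr, unsigned long paddr)
{
	if (p_start && p_end == paddr) {
		p_end += PMD_SIZE;	/* extends the current range */
		return;
	}
	flush_range();			/* discontinuity: emit the old range */
	v_start = vaddr;
	p_start = paddr;
	p_end = paddr + PMD_SIZE;
}

int main(void)
{
	/* Two contiguous mappings, then a jump: prints two merged lines. */
	note_pmd_mapping(0xffffe20000000000UL, 0x1400000UL);
	note_pmd_mapping(0xffffe20000200000UL, 0x1600000UL);
	note_pmd_mapping(0xffffe20002c00000UL, 0xc000000UL);
	flush_range();	/* the role vmemmap_populate_print_last() plays */
	return 0;
}

Running the model with two adjacent 2M mappings followed by a distant one
prints two merged lines instead of three, mirroring the per-node ranges in
the log above.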
From ea01ea937dcae2caa146dea1918cccf2f16ed3c4 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty
Date: Mon, 28 Apr 2008 02:12:01 -0700
Subject: hotplug memory remove: generic __remove_pages() support

Add a generic helper function to remove section mappings and sysfs entries
for the section of memory we are removing. offline_pages() has already
adjusted the zone and marked the pages reserved.

TODO: Yasunori Goto is working on patches to free up allocations from
bootmem.

Signed-off-by: Badari Pulavarty
Acked-by: Yasunori Goto
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

(limited to 'mm/sparse.c')

diff --git a/mm/sparse.c b/mm/sparse.c
index 7e9191381f86..186a85bf7912 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -208,12 +208,13 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
 }
 
 /*
- * We need this if we ever free the mem_maps.  While not implemented yet,
- * this function is included for parity with its sibling.
+ * Decode mem_map from the coded memmap
  */
-static __attribute((unused))
+static
 struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
 {
+	/* mask off the extra low bits of information */
+	coded_mem_map &= SECTION_MAP_MASK;
 	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
 }
 
@@ -404,6 +405,28 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+{
+	if (!usemap)
+		return;
+
+	/*
+	 * Check to see if allocation came from hot-plug-add
+	 */
+	if (PageSlab(virt_to_page(usemap))) {
+		kfree(usemap);
+		if (memmap)
+			__kfree_section_memmap(memmap, PAGES_PER_SECTION);
+		return;
+	}
+
+	/*
+	 * TODO: Allocations came from bootmem - how do I free up ?
+	 */
+	printk(KERN_WARNING "Not freeing up allocations from bootmem "
+			"- leaking memory\n");
+}
+
 /*
  * returns the number of sections whose mem_maps were properly
  * set.  If this is <=0, then that means that the passed-in
@@ -456,4 +479,20 @@ out:
 	}
 	return ret;
 }
+
+void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
+{
+	struct page *memmap = NULL;
+	unsigned long *usemap = NULL;
+
+	if (ms->section_mem_map) {
+		usemap = ms->pageblock_flags;
+		memmap = sparse_decode_mem_map(ms->section_mem_map,
+						__section_nr(ms));
+		ms->section_mem_map = 0;
+		ms->pageblock_flags = NULL;
+	}
+
+	free_section_usemap(memmap, usemap);
+}
 #endif
-- cgit v1.2.2
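The two added lines in sparse_decode_mem_map() only make sense together with
the encode side: sparse_encode_mem_map() stores the mem_map pointer biased by
the section's first pfn (so pfn_to_page() becomes a single add) and keeps
section state flags in the low bits, which the decoder has to mask off via
SECTION_MAP_MASK before re-adding the pfn offset. A standalone model of that
round trip - the shift value and the struct page stand-in are illustrative;
the real definitions live in include/linux/mmzone.h:

#include <stdio.h>

#define PFN_SECTION_SHIFT	15	/* example value only */
#define SECTION_MAP_MASK	(~3UL)	/* low bits hold section flags */
#define SECTION_MARKED_PRESENT	1UL

struct page { unsigned long flags; unsigned long pad[6]; };	/* 56-byte stand-in */

static unsigned long section_nr_to_pfn(unsigned long sec)
{
	return sec << PFN_SECTION_SHIFT;
}

static unsigned long encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	/* Bias by the section's first pfn; low bits stay free for flags.
	 * (The pointer arithmetic here is only a model.) */
	return (unsigned long)(mem_map - section_nr_to_pfn(pnum));
}

static struct page *decode_mem_map(unsigned long coded, unsigned long pnum)
{
	coded &= SECTION_MAP_MASK;	/* strip the flag bits */
	return (struct page *)coded + section_nr_to_pfn(pnum);
}

int main(void)
{
	static struct page fake_mem_map[4];
	unsigned long pnum = 3;
	unsigned long coded = encode_mem_map(fake_mem_map, pnum)
			      | SECTION_MARKED_PRESENT;

	printf("round trip ok: %d\n",
	       decode_mem_map(coded, pnum) == fake_mem_map);
	return 0;
}

sparse_remove_one_section() relies on exactly this decode to recover the
memmap pointer from ms->section_mem_map before clearing the section.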
From 04753278769f3b6c3b79a080edb52f21d83bf6e2 Mon Sep 17 00:00:00 2001
From: Yasunori Goto
Date: Mon, 28 Apr 2008 02:13:31 -0700
Subject: memory hotplug: register section/node id to free

This patch set frees pages that were allocated by bootmem, for memory
hot-remove.

Some memory-management structures (the memmap, for example) are allocated by
bootmem. To remove memory physically, some of them must be freed, depending
on the circumstances. This patch set lays the groundwork for freeing those
pages, and frees the memmaps.

The basic idea is to use the remaining members of struct page to remember
which bootmem user (section number or node id) owns each page. When a
section is being removed, the kernel can check that information, and several
issues can be solved:

1) When the memmap of the section being removed was allocated on another
   section by bootmem, it can and should be freed.

2) When the memmap of the section being removed was allocated on the same
   section, it must not be freed: the section has to be logically offlined
   already, with all of its pages isolated from the page allocator. If the
   memmap were freed, the page allocator might hand out pages that are about
   to be removed physically.

3) When the section being removed hosts another section's memmap, the kernel
   will be able to tell the user which section should be removed first.
   (Not implemented yet.)

4) In case 2) above, page isolation will be able to detect and skip the
   memmap's pages during logical memory offline (offline_pages()). The
   current page isolation code fails here, because such a page is just a
   reserved page and isolation cannot tell whether it may be removed. This
   infrastructure makes that possible. (Not implemented yet.)

5) Node information such as the pgdat has similar issues, which can be
   solved the same way. (Not implemented yet, but the node id is remembered
   in the pages.)

Fortunately, the current bootmem allocator only keeps the PageReserved flag
and does not use any other members of struct page; neither do the users of
bootmem.

This patch:

Register the node or section id in the pages allocated by bootmem, so the
kernel can distinguish which node/section each of those pages belongs to.
This is the basis for hot-removing sections or nodes.

Signed-off-by: Yasunori Goto
Cc: Badari Pulavarty
Cc: Yinghai Lu
Cc: Yasunori Goto
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm/sparse.c')

diff --git a/mm/sparse.c b/mm/sparse.c
index 186a85bf7912..8903c484389a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -210,7 +210,6 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
 /*
  * Decode mem_map from the coded memmap
  */
-static
 struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
 {
 	/* mask off the extra low bits of information */
@@ -233,7 +232,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
 	return 1;
 }
 
-static unsigned long usemap_size(void)
+unsigned long usemap_size(void)
 {
 	unsigned long size_bytes;
 	size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
-- cgit v1.2.2
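The remove path added later in this series (free_map_bootmem() in the "free
memmaps allocated by bootmem" patch below) reads the registered information
back from page->_mapcount (the user type) and page->private (the owning
section number). A hedged, kernel-style sketch of the registration this patch
introduces - the helper name and the SECTION_INFO/NODE_INFO constants are
assumptions here, and the real code lives in mm/memory_hotplug.c:

/*
 * Sketch only: mark one bootmem-allocated page with its user type and
 * owner, using struct page members that bootmem otherwise leaves unused.
 */
static void register_page_bootmem_info(struct page *page,
				       unsigned long info, int type)
{
	atomic_set(&page->_mapcount, type);	/* e.g. SECTION_INFO, NODE_INFO */
	SetPagePrivate(page);
	set_page_private(page, info);		/* owning section nr or node id */
	atomic_inc(&page->_count);		/* one reference per registered user */
}

Whatever the exact helper looks like, the invariant it establishes is what
matters: every bootmem page that backs a memmap (or a pgdat) can later say
which section or node it serves.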
From 9d99217a02a06a7cc83f065b73e976970970c58c Mon Sep 17 00:00:00 2001
From: Yasunori Goto
Date: Mon, 28 Apr 2008 02:13:32 -0700
Subject: memory hotplug: align memmap to page size

To make the memmap easier to free, align it to page size. The bootmem
allocator may mix several objects into one page, which is bad for freeing
the memmap on memory hot-remove.

Signed-off-by: Yasunori Goto
Cc: Badari Pulavarty
Cc: Yinghai Lu
Cc: Yasunori Goto
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/sparse.c')

diff --git a/mm/sparse.c b/mm/sparse.c
index 8903c484389a..5398d48c360a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -273,8 +273,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
 	if (map)
 		return map;
 
-	map = alloc_bootmem_node(NODE_DATA(nid),
-			sizeof(struct page) * PAGES_PER_SECTION);
+	map = alloc_bootmem_pages_node(NODE_DATA(nid),
+			PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
 	return map;
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-- cgit v1.2.2

From 86f6dae1377523689bd8468fed2f2dd180fc0560 Mon Sep 17 00:00:00 2001
From: Yasunori Goto
Date: Mon, 28 Apr 2008 02:13:33 -0700
Subject: memory hotplug: allocate usemap on the section with pgdat

With this patch, usemaps are allocated on the section that holds the pgdat.
Because a usemap is very small, the usemaps of many sections end up on a
single page. As long as a section hosts a usemap, it cannot be removed until
the other sections using that usemap are removed; this dependency is
undesirable for memory removal.

The pgdat has a similar property: the section holding the pgdat area must be
the last section of its node to be removed. So if section A holds the pgdat
and section B holds a usemap for section A, neither section can be removed,
because each depends on the other.

To solve this, collect the usemaps on the same section as the pgdat. If no
other section then depends on it, that section can finally be removed.

Signed-off-by: Yasunori Goto
Cc: Badari Pulavarty
Cc: Yinghai Lu
Cc: Yasunori Goto
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'mm/sparse.c')

diff --git a/mm/sparse.c b/mm/sparse.c
index 5398d48c360a..08f053218ee8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -249,11 +249,22 @@ static unsigned long *__kmalloc_section_usemap(void)
 
 static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
 {
-	unsigned long *usemap;
+	unsigned long *usemap, section_nr;
 	struct mem_section *ms = __nr_to_section(pnum);
 	int nid = sparse_early_nid(ms);
+	struct pglist_data *pgdat = NODE_DATA(nid);
 
-	usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
+	/*
+	 * Usemap's page can't be freed until freeing other sections
+	 * which use it. And, Pgdat has same feature.
+	 * If section A has pgdat and section B has usemap for other
+	 * sections (includes section A), both sections can't be removed,
+	 * because there is the dependency each other.
+	 * To solve above issue, this collects all usemap on the same section
+	 * which has pgdat.
+	 */
+	section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+	usemap = alloc_bootmem_section(usemap_size(), section_nr);
 	if (usemap)
 		return usemap;
-- cgit v1.2.2
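The placement rule reduces to two shifts: take the physical address of the
node's pgdat, turn it into a pfn, turn the pfn into a section number, and ask
alloc_bootmem_section() to carve the usemap out of that very section. A
standalone model of the arithmetic - PAGE_SHIFT and PFN_SECTION_SHIFT are
x86_64-style example values and the pgdat address is made up:

#include <stdio.h>

#define PAGE_SHIFT		12
#define PFN_SECTION_SHIFT	15	/* 128M sections with 4K pages */

static unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> PFN_SECTION_SHIFT;
}

int main(void)
{
	/* Pretend the node's pgdat lives at this physical address. */
	unsigned long pgdat_phys = 0x820200000UL;
	unsigned long pgdat_pfn  = pgdat_phys >> PAGE_SHIFT;
	unsigned long section_nr = pfn_to_section_nr(pgdat_pfn);

	printf("pgdat at %#lx -> pfn %#lx -> section %lu;\n",
	       pgdat_phys, pgdat_pfn, section_nr);
	printf("allocate this node's usemaps from section %lu too\n",
	       section_nr);
	return 0;
}

Every usemap of the node then lives on the one section that already has to
be removed last, so no extra cross-section dependency is created.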
From 0c0a4a517a31e05efb38304668198a873bfec6ca Mon Sep 17 00:00:00 2001
From: Yasunori Goto
Date: Mon, 28 Apr 2008 02:13:34 -0700
Subject: memory hotplug: free memmaps allocated by bootmem

This patch frees memmaps that were allocated by bootmem.

Freeing the usemap is not necessary: its page may still be needed by other
sections. If the section being removed is the last section on the node, that
section is the final user of the usemap page (usemaps are allocated on its
section by the previous patch). Even then it must not be freed, because the
section is only logically offline, with all of its pages isolated from the
page allocator; if the usemap page were freed, the page allocator might use
it just before it is removed physically, which would be a disaster. So this
patch keeps the usemap as it is.

Signed-off-by: Yasunori Goto
Cc: Badari Pulavarty
Cc: Yinghai Lu
Cc: Yasunori Goto
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

(limited to 'mm/sparse.c')

diff --git a/mm/sparse.c b/mm/sparse.c
index 08f053218ee8..dff71f173ae9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include "internal.h"
 #include 
 #include 
 #include 
@@ -376,6 +377,9 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 {
 	return; /* XXX: Not implemented yet */
 }
+static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+{
+}
 #else
 static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
 {
@@ -413,17 +417,47 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 	free_pages((unsigned long)memmap,
 		   get_order(sizeof(struct page) * nr_pages));
 }
+
+static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+{
+	unsigned long maps_section_nr, removing_section_nr, i;
+	int magic;
+
+	for (i = 0; i < nr_pages; i++, page++) {
+		magic = atomic_read(&page->_mapcount);
+
+		BUG_ON(magic == NODE_INFO);
+
+		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
+		removing_section_nr = page->private;
+
+		/*
+		 * When this function is called, the removing section is
+		 * logical offlined state. This means all pages are isolated
+		 * from page allocator. If removing section's memmap is placed
+		 * on the same section, it must not be freed.
+		 * If it is freed, page allocator may allocate it which will
+		 * be removed physically soon.
+		 */
+		if (maps_section_nr != removing_section_nr)
+			put_page_bootmem(page);
+	}
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void free_section_usemap(struct page *memmap, unsigned long *usemap)
 {
+	struct page *usemap_page;
+	unsigned long nr_pages;
+
 	if (!usemap)
 		return;
 
+	usemap_page = virt_to_page(usemap);
 	/*
 	 * Check to see if allocation came from hot-plug-add
 	 */
-	if (PageSlab(virt_to_page(usemap))) {
+	if (PageSlab(usemap_page)) {
 		kfree(usemap);
 		if (memmap)
 			__kfree_section_memmap(memmap, PAGES_PER_SECTION);
@@ -431,10 +465,19 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
 	}
 
 	/*
-	 * TODO: Allocations came from bootmem - how do I free up ?
+	 * The usemap came from bootmem. This is packed with other usemaps
+	 * on the section which has pgdat at boot time. Just keep it as is now.
 	 */
-	printk(KERN_WARNING "Not freeing up allocations from bootmem "
-			"- leaking memory\n");
+
+	if (memmap) {
+		struct page *memmap_page;
+		memmap_page = virt_to_page(memmap);
+
+		nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+			>> PAGE_SHIFT;
+
+		free_map_bootmem(memmap_page, nr_pages);
+	}
 }
-- cgit v1.2.2
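The core of free_map_bootmem() is a single comparison per backing page: the
section that physically hosts the mem_map page versus the section being
removed. Off-section pages are handed back via put_page_bootmem(); pages on
the section that is going away stay reserved, since that memory is about to
vanish anyway. A standalone model of just that check (the section-size shift
is an illustrative x86_64-style value):

#include <stdio.h>
#include <stdbool.h>

#define PFN_SECTION_SHIFT 15	/* 128M sections with 4K pages, example only */

/* May this mem_map backing page be returned to the allocator? */
static bool may_free_memmap_page(unsigned long page_pfn,
				 unsigned long removing_section_nr)
{
	unsigned long maps_section_nr = page_pfn >> PFN_SECTION_SHIFT;

	/* Same section: keep it reserved, the frame itself is going away. */
	return maps_section_nr != removing_section_nr;
}

int main(void)
{
	unsigned long pfn = 0x820200;	/* some memmap backing page */

	/* memmap hosted on another section: freeable */
	printf("off-section:  %d\n", may_free_memmap_page(pfn, 65));
	/* memmap hosted on the section being removed: must stay */
	printf("same-section: %d\n",
	       may_free_memmap_page(pfn, pfn >> PFN_SECTION_SHIFT));
	return 0;
}

free_section_usemap() computes nr_pages from the page-aligned memmap size
introduced two patches earlier, so every backing page gets exactly one such
check.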