author    Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>      2018-10-26 18:10:15 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>   2018-10-26 19:38:15 -0400
commit    907ec5fca3dc38d37737de826f06f25b063aa08e (patch)
tree      6785f97d6ec9c701a4712836ce874e72116b285b
parent    3821b76c3cdb5f2c5ef1b082de79829e8ff50a7d (diff)
mm: zero remaining unavailable struct pages
Patch series "mm: Fix for movable_node boot option", v3. This patch series contains a fix for the movable_node boot option issue which was introduced by commit 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"). The commit breaks the option because it changed the memory gap range to reserved memblock. So, the node is marked as Normal zone even if the SRAT has Hot pluggable affinity. First and second patch fix the original issue which the commit tried to fix, then revert the commit. This patch (of 3): There is a kernel panic that is triggered when reading /proc/kpageflags on the kernel booted with kernel parameter 'memmap=nn[KMG]!ss[KMG]': BUG: unable to handle kernel paging request at fffffffffffffffe PGD 9b20e067 P4D 9b20e067 PUD 9b210067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 1728 Comm: page-types Not tainted 4.17.0-rc6-mm1-v4.17-rc6-180605-0816-00236-g2dfb086ef02c+ #160 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.fc28 04/01/2014 RIP: 0010:stable_page_flags+0x27/0x3c0 Code: 00 00 00 0f 1f 44 00 00 48 85 ff 0f 84 a0 03 00 00 41 54 55 49 89 fc 53 48 8b 57 08 48 8b 2f 48 8d 42 ff 83 e2 01 48 0f 44 c7 <48> 8b 00 f6 c4 01 0f 84 10 03 00 00 31 db 49 8b 54 24 08 4c 89 e7 RSP: 0018:ffffbbd44111fde0 EFLAGS: 00010202 RAX: fffffffffffffffe RBX: 00007fffffffeff9 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000202 RDI: ffffed1182fff5c0 RBP: ffffffffffffffff R08: 0000000000000001 R09: 0000000000000001 R10: ffffbbd44111fed8 R11: 0000000000000000 R12: ffffed1182fff5c0 R13: 00000000000bffd7 R14: 0000000002fff5c0 R15: ffffbbd44111ff10 FS: 00007efc4335a500(0000) GS:ffff93a5bfc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffffffffffffe CR3: 00000000b2a58000 CR4: 00000000001406e0 Call Trace: kpageflags_read+0xc7/0x120 proc_reg_read+0x3c/0x60 __vfs_read+0x36/0x170 vfs_read+0x89/0x130 ksys_pread64+0x71/0x90 do_syscall_64+0x5b/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7efc42e75e23 Code: 09 00 ba 9f 01 00 00 e8 ab 81 f4 ff 66 2e 0f 1f 84 00 00 00 00 00 90 83 3d 29 0a 2d 00 00 75 13 49 89 ca b8 11 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 db d3 01 00 48 89 04 24 According to kernel bisection, this problem became visible due to commit f7f99100d8d9 which changes how struct pages are initialized. Memblock layout affects the pfn ranges covered by node/zone. Consider that we have a VM with 2 NUMA nodes and each node has 4GB memory, and the default (no memmap= given) memblock layout is like below: MEMBLOCK configuration: memory size = 0x00000001fff75c00 reserved size = 0x000000000300c000 memory.cnt = 0x4 memory[0x0] [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0 memory[0x1] [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0 memory[0x2] [0x0000000100000000-0x000000013fffffff], 0x0000000040000000 bytes on node 0 flags: 0x0 memory[0x3] [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0 ... 
If you give memmap=1G!4G (so it just covers memory[0x2]), the range
[0x100000000-0x13fffffff] is gone:

  MEMBLOCK configuration:
   memory size = 0x00000001bff75c00 reserved size = 0x000000000300c000
   memory.cnt  = 0x3
   memory[0x0]     [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0
   memory[0x1]     [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0
   memory[0x2]     [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0
   ...

This shrinks node 0's pfn range, because that range is calculated from
the address ranges of memblock.memory, and so some of the struct pages
in the gap are left uninitialized.

We have a function zero_resv_unavail() which zeroes the struct pages
outside memblock.memory, but currently it covers only the reserved
unavailable range (i.e. memblock.reserved && !memblock.memory). This
patch extends it to cover all unavailable ranges, which fixes the
reported issue.

Link: http://lkml.kernel.org/r/20181002143821.5112-2-msys.mizuma@gmail.com
Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Tested-by: Oscar Salvador <osalvador@suse.de>
Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
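The fix in the diff below walks memblock.memory in address order and
zeroes the struct page of every pfn that falls in a hole between two
consecutive ranges, plus the tail between the last range and max_pfn.
A minimal, self-contained user-space sketch of that gap walk follows;
ranges[], zero_struct_page() and the hard-coded max_pfn are stand-ins
invented for illustration (they are not the kernel API), and the
pfn_valid() filtering done by the real code is omitted:

  /*
   * Minimal sketch of the gap walk (not kernel code): visit every pfn
   * not covered by any memory range, then the tail up to max_pfn.
   */
  #include <stdio.h>

  #define PAGE_SHIFT  12
  #define PAGE_SIZE   (1ULL << PAGE_SHIFT)
  #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
  #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

  /* [start, end) physical ranges, sorted by address; stands in for memblock.memory */
  struct range { unsigned long long start, end; };

  static unsigned long long pgcnt;

  static void zero_struct_page(unsigned long long pfn)
  {
      /* the kernel calls mm_zero_struct_page(pfn_to_page(pfn)) here */
      pgcnt++;
  }

  int main(void)
  {
      /* the memmap=1G!4G layout from above: [4G, 5G) is missing */
      struct range ranges[] = {
          { 0x0000000000001000ULL, 0x000000000009f000ULL },
          { 0x0000000000100000ULL, 0x00000000bffd7000ULL },
          { 0x0000000140000000ULL, 0x0000000240000000ULL },
      };
      unsigned long long next = 0, pfn;
      unsigned long long max_pfn = PFN_DOWN(0x0000000240000000ULL);
      unsigned int i;

      for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
          /* zero every struct page in the hole before this range */
          for (pfn = PFN_DOWN(next); pfn < PFN_UP(ranges[i].start); pfn++)
              zero_struct_page(pfn);
          next = ranges[i].end;
      }
      /* ... and in the tail after the last range, up to max_pfn */
      for (pfn = PFN_DOWN(next); pfn < max_pfn; pfn++)
          zero_struct_page(pfn);

      printf("zeroed %llu struct pages in unavailable ranges\n", pgcnt);
      return 0;
  }

Run against this example layout, the sketch counts the pfns in the three
holes [0, 0x1000), [0x9f000, 0x100000) and [0xbffd7000, 0x140000000);
the kernel does the same walk once at boot and reports the count via the
pr_info() changed at the end of the diff.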
-rw-r--r--  include/linux/memblock.h | 15
-rw-r--r--  mm/page_alloc.c          | 36
2 files changed, 25 insertions, 26 deletions
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 516920549378..2acdd046df2d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
         for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
                                nid, flags, p_start, p_end, p_nid)
 
-/**
- * for_each_resv_unavail_range - iterate through reserved and unavailable memory
- * @i: u64 used as loop variable
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- *
- * Walks over unavailable but reserved (reserved && !memory) areas of memblock.
- * Available as soon as memblock is initialized.
- * Note: because this memory does not belong to any physical node, flags and
- * nid arguments do not make sense and thus not exported as arguments.
- */
-#define for_each_resv_unavail_range(i, p_start, p_end) \
-        for_each_mem_range(i, &memblock.reserved, &memblock.memory, \
-                           NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
-
 static inline void memblock_set_region_flags(struct memblock_region *r,
                                              enum memblock_flags flags)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c26d3152f9ba..6d863c5afa08 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6515,29 +6515,42 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
  * struct pages which are reserved in memblock allocator and their fields
  * may be accessed (for example page_to_pfn() on some configuration accesses
  * flags). We must explicitly zero those struct pages.
+ *
+ * This function also addresses a similar issue where struct pages are left
+ * uninitialized because the physical address range is not covered by
+ * memblock.memory or memblock.reserved. That could happen when memblock
+ * layout is manually configured via memmap=.
  */
 void __init zero_resv_unavail(void)
 {
         phys_addr_t start, end;
         unsigned long pfn;
         u64 i, pgcnt;
+        phys_addr_t next = 0;
 
         /*
-         * Loop through ranges that are reserved, but do not have reported
-         * physical memory backing.
+         * Loop through unavailable ranges not covered by memblock.memory.
          */
         pgcnt = 0;
-        for_each_resv_unavail_range(i, &start, &end) {
-                for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
-                        if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
-                                pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
-                                        + pageblock_nr_pages - 1;
-                                continue;
+        for_each_mem_range(i, &memblock.memory, NULL,
+                        NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+                if (next < start) {
+                        for (pfn = PFN_DOWN(next); pfn < PFN_UP(start); pfn++) {
+                                if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+                                        continue;
+                                mm_zero_struct_page(pfn_to_page(pfn));
+                                pgcnt++;
                         }
-                        mm_zero_struct_page(pfn_to_page(pfn));
-                        pgcnt++;
                 }
+                next = end;
         }
+        for (pfn = PFN_DOWN(next); pfn < max_pfn; pfn++) {
+                if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+                        continue;
+                mm_zero_struct_page(pfn_to_page(pfn));
+                pgcnt++;
+        }
+
 
         /*
          * Struct pages that do not have backing memory. This could be because
@@ -6547,7 +6560,8 @@ void __init zero_resv_unavail(void)
          * this code can be removed.
          */
         if (pgcnt)
-                pr_info("Reserved but unavailable: %lld pages", pgcnt);
+                pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
+
 }
 #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
 