path: root/mm/page_alloc.c
author	Pavel Tatashin <pasha.tatashin@oracle.com>	2017-11-15 20:36:09 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-15 21:21:05 -0500
commit	2f47a91f4dab19aaaa05cdcfced9dfcaf3f5257e (patch)
tree	ad4e007288e6d670ceab935f0329766ad8d597d8 /mm/page_alloc.c
parent	783cb68ee2d25d621326366c0b615bf2ccf3b402 (diff)
mm: deferred_init_memmap improvements
Patch series "complete deferred page initialization", v12.

SMP machines can benefit from the DEFERRED_STRUCT_PAGE_INIT config option, which defers initializing struct pages until all CPUs have been started, so it can be done in parallel. However, this feature is sub-optimal because the deferred page initialization code expects that the struct pages have already been zeroed, and the zeroing is done early in boot with a single thread only. Also, we access that memory and set flags before struct pages are initialized. All of this is fixed in this patchset.

In this work we do the following:
- Never read-access a struct page until it has been initialized
- Never set any fields in struct pages before they are initialized
- Zero struct page at the beginning of struct page initialization

==========================================================================
Performance improvements on an x86 machine with 8 nodes:
Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz and 1T of memory:

                         TIME           SPEED UP
base no deferred:        95.796233s
fix no deferred:         79.978956s     19.77%

base deferred:           77.254713s
fix deferred:            55.050509s     40.34%

==========================================================================
SPARC M6 3600 MHz with 15T of memory:

                         TIME           SPEED UP
base no deferred:        358.335727s
fix no deferred:         302.320936s    18.52%

base deferred:           237.534603s
fix deferred:            182.103003s    30.44%

==========================================================================
Raw dmesg output with timestamps:

x86 base no deferred:    https://hastebin.com/ofunepurit.scala
x86 base deferred:       https://hastebin.com/ifazegeyas.scala
x86 fix no deferred:     https://hastebin.com/pegocohevo.scala
x86 fix deferred:        https://hastebin.com/ofupevikuk.scala
sparc base no deferred:  https://hastebin.com/ibobeteken.go
sparc base deferred:     https://hastebin.com/fariqimiyu.go
sparc fix no deferred:   https://hastebin.com/muhegoheyi.go
sparc fix deferred:      https://hastebin.com/xadinobutu.go

This patch (of 11):

deferred_init_memmap() is called when struct pages are initialized later in boot by slave CPUs. This patch simplifies and optimizes this function, and also fixes a couple of issues (described below).

The main change is that we now iterate through the free memblock areas instead of all configured memory. Thus, we do not have to check whether a struct page has already been initialized.

=====
In deferred_init_memmap(), where all deferred struct pages are initialized, we have a check like this:

	if (page->flags) {
		VM_BUG_ON(page_zone(page) != zone);
		goto free_range;
	}

This is how we check whether the current deferred page has already been initialized. It works because the memory for struct pages has been zeroed, and the only way the flags can be non-zero is if the page already went through __init_single_page(). But once we change the current behavior and no longer zero the memory in the memblock allocator, we cannot trust anything inside a struct page until it is initialized. This patch fixes this: deferred_init_memmap() is rewritten to loop only through the free memory ranges provided by memblock.

Note, this first issue is relevant only when the following change is merged:

=====
This patch fixes another existing issue on systems that have holes in zones, i.e. where CONFIG_HOLES_IN_ZONE is defined. In for_each_mem_pfn_range() we have code like this:

	if (!pfn_valid_within(pfn))
		goto free_range;

Note: 'page' is not set to NULL and is not incremented, but 'pfn' advances. This means that if deferred struct pages are enabled on systems with this kind of holes, Linux would get memory corruption.

I have fixed this issue by defining a new helper that performs all the necessary operations when we free the current set of pages.
[pasha.tatashin@oracle.com: buddy page accessed before initialized]
  Link: http://lkml.kernel.org/r/20171102170221.7401-2-pasha.tatashin@oracle.com
Link: http://lkml.kernel.org/r/20171013173214.27300-2-pasha.tatashin@oracle.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
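For illustration only (not part of the patch), the following stand-alone, user-space C sketch models the batch-and-flush pattern described above: contiguous valid pfns are counted into a batch, and any hole flushes the batch before a new one starts. All names and values here are hypothetical stand-ins for the kernel code in the diff below.

	/*
	 * Illustration only -- a simplified, user-space model of the
	 * __def_free()/deferred_init_range() batching scheme. All names
	 * are hypothetical stand-ins.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	#define NR_PFNS 16

	/* Hypothetical validity map: 0 marks a hole within the range. */
	static const bool pfn_is_valid[NR_PFNS] = {
		1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1
	};

	/* Stand-in for deferred_free_range(): report one contiguous batch. */
	static void free_batch(unsigned long base_pfn, unsigned long nr)
	{
		if (nr)
			printf("free %lu page(s) starting at pfn %lu\n", nr, base_pfn);
	}

	/* Stand-in for __def_free(): flush the batch, reset the counters. */
	static unsigned long flush_batch(unsigned long *nr_free,
					 unsigned long *free_base_pfn)
	{
		unsigned long nr = *nr_free;

		free_batch(*free_base_pfn, nr);
		*free_base_pfn = 0;
		*nr_free = 0;
		return nr;
	}

	int main(void)
	{
		unsigned long free_base_pfn = 0, nr_free = 0, nr_pages = 0;
		unsigned long pfn;

		for (pfn = 0; pfn < NR_PFNS; pfn++) {
			if (!pfn_is_valid[pfn]) {
				/* A hole ends the current batch. */
				nr_pages += flush_batch(&nr_free, &free_base_pfn);
			} else if (nr_free) {
				/* Extend the current batch. */
				nr_free++;
			} else {
				/* Start a new batch at this pfn. */
				free_base_pfn = pfn;
				nr_free = 1;
			}
		}
		/* Flush whatever is left at the end of the range. */
		nr_pages += flush_batch(&nr_free, &free_base_pfn);

		printf("total pages freed: %lu\n", nr_pages);
		return 0;
	}

The second loop of deferred_init_range() in the diff applies the same pattern, with additional flush conditions for pageblock-aligned holes and for pfns that belong to a different node.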
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	188
1 file changed, 105 insertions, 83 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 30a464b47366..4dee5082d3d7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1408,14 +1408,17 @@ void clear_zone_contiguous(struct zone *zone)
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(struct page *page,
-					unsigned long pfn, int nr_pages)
+static void __init deferred_free_range(unsigned long pfn,
+					unsigned long nr_pages)
 {
-	int i;
+	struct page *page;
+	unsigned long i;
 
-	if (!page)
+	if (!nr_pages)
 		return;
 
+	page = pfn_to_page(pfn);
+
 	/* Free a large naturally-aligned chunk if possible */
 	if (nr_pages == pageblock_nr_pages &&
 	    (pfn & (pageblock_nr_pages - 1)) == 0) {
@@ -1441,19 +1444,109 @@ static inline void __init pgdat_init_report_one_done(void)
 	complete(&pgdat_init_all_done_comp);
 }
 
+/*
+ * Helper for deferred_init_range, free the given range, reset the counters, and
+ * return number of pages freed.
+ */
+static inline unsigned long __init __def_free(unsigned long *nr_free,
+					      unsigned long *free_base_pfn,
+					      struct page **page)
+{
+	unsigned long nr = *nr_free;
+
+	deferred_free_range(*free_base_pfn, nr);
+	*free_base_pfn = 0;
+	*nr_free = 0;
+	*page = NULL;
+
+	return nr;
+}
+
+static unsigned long __init deferred_init_range(int nid, int zid,
+						unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	struct mminit_pfnnid_cache nid_init_state = { };
+	unsigned long nr_pgmask = pageblock_nr_pages - 1;
+	unsigned long free_base_pfn = 0;
+	unsigned long nr_pages = 0;
+	unsigned long nr_free = 0;
+	struct page *page = NULL;
+	unsigned long pfn;
+
+	/*
+	 * First we check if pfn is valid on architectures where it is possible
+	 * to have holes within pageblock_nr_pages. On systems where it is not
+	 * possible, this function is optimized out.
+	 *
+	 * Then, we check if a current large page is valid by only checking the
+	 * validity of the head pfn.
+	 *
+	 * meminit_pfn_in_nid is checked on systems where pfns can interleave
+	 * within a node: a pfn is between start and end of a node, but does not
+	 * belong to this memory node.
+	 *
+	 * Finally, we minimize pfn page lookups and scheduler checks by
+	 * performing it only once every pageblock_nr_pages.
+	 *
+	 * We do it in two loops: first we initialize struct page, than free to
+	 * buddy allocator, becuse while we are freeing pages we can access
+	 * pages that are ahead (computing buddy page in __free_one_page()).
+	 */
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!pfn_valid_within(pfn))
+			continue;
+		if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
+			if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+				if (page && (pfn & nr_pgmask))
+					page++;
+				else
+					page = pfn_to_page(pfn);
+				__init_single_page(page, pfn, zid, nid);
+				cond_resched();
+			}
+		}
+	}
+
+	page = NULL;
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!pfn_valid_within(pfn)) {
+			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+		} else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
+			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+		} else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+		} else if (page && (pfn & nr_pgmask)) {
+			page++;
+			nr_free++;
+		} else {
+			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+			page = pfn_to_page(pfn);
+			free_base_pfn = pfn;
+			nr_free = 1;
+			cond_resched();
+		}
+	}
+	/* Free the last block of pages to allocator */
+	nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+
+	return nr_pages;
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
 	pg_data_t *pgdat = data;
 	int nid = pgdat->node_id;
-	struct mminit_pfnnid_cache nid_init_state = { };
 	unsigned long start = jiffies;
 	unsigned long nr_pages = 0;
-	unsigned long walk_start, walk_end;
-	int i, zid;
+	unsigned long spfn, epfn;
+	phys_addr_t spa, epa;
+	int zid;
 	struct zone *zone;
 	unsigned long first_init_pfn = pgdat->first_deferred_pfn;
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+	u64 i;
 
 	if (first_init_pfn == ULONG_MAX) {
 		pgdat_init_report_one_done();
@@ -1475,83 +1568,12 @@ static int __init deferred_init_memmap(void *data)
 		if (first_init_pfn < zone_end_pfn(zone))
 			break;
 	}
+	first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
 
-	for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
-		unsigned long pfn, end_pfn;
-		struct page *page = NULL;
-		struct page *free_base_page = NULL;
-		unsigned long free_base_pfn = 0;
-		int nr_to_free = 0;
-
-		end_pfn = min(walk_end, zone_end_pfn(zone));
-		pfn = first_init_pfn;
-		if (pfn < walk_start)
-			pfn = walk_start;
-		if (pfn < zone->zone_start_pfn)
-			pfn = zone->zone_start_pfn;
-
-		for (; pfn < end_pfn; pfn++) {
-			if (!pfn_valid_within(pfn))
-				goto free_range;
-
-			/*
-			 * Ensure pfn_valid is checked every
-			 * pageblock_nr_pages for memory holes
-			 */
-			if ((pfn & (pageblock_nr_pages - 1)) == 0) {
-				if (!pfn_valid(pfn)) {
-					page = NULL;
-					goto free_range;
-				}
-			}
-
-			if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
-				page = NULL;
-				goto free_range;
-			}
-
-			/* Minimise pfn page lookups and scheduler checks */
-			if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
-				page++;
-			} else {
-				nr_pages += nr_to_free;
-				deferred_free_range(free_base_page,
-						free_base_pfn, nr_to_free);
-				free_base_page = NULL;
-				free_base_pfn = nr_to_free = 0;
-
-				page = pfn_to_page(pfn);
-				cond_resched();
-			}
-
-			if (page->flags) {
-				VM_BUG_ON(page_zone(page) != zone);
-				goto free_range;
-			}
-
-			__init_single_page(page, pfn, zid, nid);
-			if (!free_base_page) {
-				free_base_page = page;
-				free_base_pfn = pfn;
-				nr_to_free = 0;
-			}
-			nr_to_free++;
-
-			/* Where possible, batch up pages for a single free */
-			continue;
-free_range:
-			/* Free the current block of pages to allocator */
-			nr_pages += nr_to_free;
-			deferred_free_range(free_base_page, free_base_pfn,
-						nr_to_free);
-			free_base_page = NULL;
-			free_base_pfn = nr_to_free = 0;
-		}
-		/* Free the last block of pages to allocator */
-		nr_pages += nr_to_free;
-		deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
-
-		first_init_pfn = max(end_pfn, first_init_pfn);
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+		nr_pages += deferred_init_range(nid, zid, spfn, epfn);
 	}
 
 	/* Sanity check that the next zone really is unpopulated */
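
For illustration only, here is a stand-alone C sketch of the clamping done by the new loop above: each free memblock range [spa, epa) is given in physical addresses, so its start is rounded up and its end rounded down to whole pages before being limited to the zone's pfn window. PFN_UP() and PFN_DOWN() are re-defined here with their usual meanings, and the numeric values are made-up examples.

	/* Illustration only -- not kernel code. */
	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)	/* round start up */
	#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)			/* round end down */

	static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
	static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

	int main(void)
	{
		/* Hypothetical example values. */
		unsigned long first_init_pfn = 0x100;	/* first deferred pfn */
		unsigned long zone_end = 0x8000;	/* zone_end_pfn(zone) */
		unsigned long spa = 0x100800;		/* start of a free memblock range */
		unsigned long epa = 0x2000000;		/* end of that range */

		/* Clamp the range to [first_init_pfn, zone_end) in whole pages. */
		unsigned long spfn = max_ul(first_init_pfn, PFN_UP(spa));
		unsigned long epfn = min_ul(zone_end, PFN_DOWN(epa));

		if (spfn < epfn)
			printf("init pfns [%#lx, %#lx), %lu pages\n",
			       spfn, epfn, epfn - spfn);
		return 0;
	}

Rounding the start up and the end down means partially covered pages at either edge of a free range are skipped rather than initialized past the range's bounds.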