Diffstat (limited to 'mm/page_alloc.c')
| -rw-r--r-- | mm/page_alloc.c | 479 |
1 file changed, 278 insertions(+), 201 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c5c57b..73f5d4556b3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -24,7 +24,6 @@ | |||
| 24 | #include <linux/memblock.h> | 24 | #include <linux/memblock.h> |
| 25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
| 26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
| 27 | #include <linux/kmemcheck.h> | ||
| 28 | #include <linux/kasan.h> | 27 | #include <linux/kasan.h> |
| 29 | #include <linux/module.h> | 28 | #include <linux/module.h> |
| 30 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
| @@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node); | |||
| 83 | EXPORT_PER_CPU_SYMBOL(numa_node); | 82 | EXPORT_PER_CPU_SYMBOL(numa_node); |
| 84 | #endif | 83 | #endif |
| 85 | 84 | ||
| 85 | DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); | ||
| 86 | |||
| 86 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 87 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
| 87 | /* | 88 | /* |
| 88 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. | 89 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. |
| @@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
| 290 | int page_group_by_mobility_disabled __read_mostly; | 291 | int page_group_by_mobility_disabled __read_mostly; |
| 291 | 292 | ||
| 292 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 293 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| 294 | |||
| 295 | /* | ||
| 296 | * Determine how many pages need to be initialized during early boot | ||
| 297 | * (non-deferred initialization). | ||
| 298 | * The value of first_deferred_pfn will be set later, once non-deferred pages | ||
| 299 | * are initialized, but for now set it to ULONG_MAX. | ||
| 300 | */ | ||
| 293 | static inline void reset_deferred_meminit(pg_data_t *pgdat) | 301 | static inline void reset_deferred_meminit(pg_data_t *pgdat) |
| 294 | { | 302 | { |
| 295 | unsigned long max_initialise; | 303 | phys_addr_t start_addr, end_addr; |
| 296 | unsigned long reserved_lowmem; | 304 | unsigned long max_pgcnt; |
| 305 | unsigned long reserved; | ||
| 297 | 306 | ||
| 298 | /* | 307 | /* |
| 299 | * Initialise at least 2G of a node, but also take into account | 308 | * Initialise at least 2G of a node, but also take into account |
| 300 | * two large system hashes that can take up 1GB for 0.25TB/node. | 309 | * two large system hashes that can take up 1GB for 0.25TB/node. |
| 301 | */ | 310 | */ |
| 302 | max_initialise = max(2UL << (30 - PAGE_SHIFT), | 311 | max_pgcnt = max(2UL << (30 - PAGE_SHIFT), |
| 303 | (pgdat->node_spanned_pages >> 8)); | 312 | (pgdat->node_spanned_pages >> 8)); |
| 304 | 313 | ||
| 305 | /* | 314 | /* |
| 306 | * Compensate for all the memblock reservations (e.g. crash kernel) | 315 | * Compensate for all the memblock reservations (e.g. crash kernel) |
| 307 | * from the initial estimation to make sure we will initialize enough | 316 | * from the initial estimation to make sure we will initialize enough |
| 308 | * memory to boot. | 317 | * memory to boot. |
| 309 | */ | 318 | */ |
| 310 | reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, | 319 | start_addr = PFN_PHYS(pgdat->node_start_pfn); |
| 311 | pgdat->node_start_pfn + max_initialise); | 320 | end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); |
| 312 | max_initialise += reserved_lowmem; | 321 | reserved = memblock_reserved_memory_within(start_addr, end_addr); |
| 322 | max_pgcnt += PHYS_PFN(reserved); | ||
| 313 | 323 | ||
| 314 | pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); | 324 | pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); |
| 315 | pgdat->first_deferred_pfn = ULONG_MAX; | 325 | pgdat->first_deferred_pfn = ULONG_MAX; |
| 316 | } | 326 | } |
| 317 | 327 | ||
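The hunk above renames max_initialise to the page count max_pgcnt and converts the sizing window to physical addresses with PFN_PHYS()/PHYS_PFN() before asking memblock how much of it is already reserved. Below is a minimal userspace sketch of that sizing rule, not kernel code: PAGE_SHIFT is assumed to be 12 and reserved_in_range() is an invented stand-in for memblock_reserved_memory_within().

#include <stdio.h>

#define PAGE_SHIFT	12
#define PFN_PHYS(x)	((unsigned long long)(x) << PAGE_SHIFT)
#define PHYS_PFN(x)	((unsigned long)((x) >> PAGE_SHIFT))

static unsigned long long reserved_in_range(unsigned long long s,
					    unsigned long long e)
{
	(void)s; (void)e;
	return 64ULL << 20;	/* pretend 64 MiB is reserved (e.g. crashkernel) */
}

int main(void)
{
	unsigned long node_start_pfn = 0;
	unsigned long node_spanned_pages = 1UL << 26;	/* a 256 GiB node */
	unsigned long max_pgcnt;

	/* at least 2 GiB worth of pages, or 1/256th of the node, whichever is larger */
	max_pgcnt = 2UL << (30 - PAGE_SHIFT);
	if (node_spanned_pages >> 8 > max_pgcnt)
		max_pgcnt = node_spanned_pages >> 8;

	/* add back whatever memblock already reserved inside that window */
	max_pgcnt += PHYS_PFN(reserved_in_range(PFN_PHYS(node_start_pfn),
						PFN_PHYS(node_start_pfn + max_pgcnt)));

	printf("statically initialised pages: %lu\n",
	       max_pgcnt < node_spanned_pages ? max_pgcnt : node_spanned_pages);
	return 0;
}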
| @@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, | |||
| 338 | if (zone_end < pgdat_end_pfn(pgdat)) | 348 | if (zone_end < pgdat_end_pfn(pgdat)) |
| 339 | return true; | 349 | return true; |
| 340 | (*nr_initialised)++; | 350 | (*nr_initialised)++; |
| 341 | if ((*nr_initialised > pgdat->static_init_size) && | 351 | if ((*nr_initialised > pgdat->static_init_pgcnt) && |
| 342 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { | 352 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
| 343 | pgdat->first_deferred_pfn = pfn; | 353 | pgdat->first_deferred_pfn = pfn; |
| 344 | return false; | 354 | return false; |
| @@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
| 1013 | VM_BUG_ON_PAGE(PageTail(page), page); | 1023 | VM_BUG_ON_PAGE(PageTail(page), page); |
| 1014 | 1024 | ||
| 1015 | trace_mm_page_free(page, order); | 1025 | trace_mm_page_free(page, order); |
| 1016 | kmemcheck_free_shadow(page, order); | ||
| 1017 | 1026 | ||
| 1018 | /* | 1027 | /* |
| 1019 | * Check tail pages before head page information is cleared to | 1028 | * Check tail pages before head page information is cleared to |
| @@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone, | |||
| 1170 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, | 1179 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, |
| 1171 | unsigned long zone, int nid) | 1180 | unsigned long zone, int nid) |
| 1172 | { | 1181 | { |
| 1182 | mm_zero_struct_page(page); | ||
| 1173 | set_page_links(page, zone, nid, pfn); | 1183 | set_page_links(page, zone, nid, pfn); |
| 1174 | init_page_count(page); | 1184 | init_page_count(page); |
| 1175 | page_mapcount_reset(page); | 1185 | page_mapcount_reset(page); |
| @@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone) | |||
| 1410 | } | 1420 | } |
| 1411 | 1421 | ||
| 1412 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 1422 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| 1413 | static void __init deferred_free_range(struct page *page, | 1423 | static void __init deferred_free_range(unsigned long pfn, |
| 1414 | unsigned long pfn, int nr_pages) | 1424 | unsigned long nr_pages) |
| 1415 | { | 1425 | { |
| 1416 | int i; | 1426 | struct page *page; |
| 1427 | unsigned long i; | ||
| 1417 | 1428 | ||
| 1418 | if (!page) | 1429 | if (!nr_pages) |
| 1419 | return; | 1430 | return; |
| 1420 | 1431 | ||
| 1432 | page = pfn_to_page(pfn); | ||
| 1433 | |||
| 1421 | /* Free a large naturally-aligned chunk if possible */ | 1434 | /* Free a large naturally-aligned chunk if possible */ |
| 1422 | if (nr_pages == pageblock_nr_pages && | 1435 | if (nr_pages == pageblock_nr_pages && |
| 1423 | (pfn & (pageblock_nr_pages - 1)) == 0) { | 1436 | (pfn & (pageblock_nr_pages - 1)) == 0) { |
| @@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void) | |||
| 1443 | complete(&pgdat_init_all_done_comp); | 1456 | complete(&pgdat_init_all_done_comp); |
| 1444 | } | 1457 | } |
| 1445 | 1458 | ||
| 1459 | /* | ||
| 1460 | * Helper for deferred_init_range, free the given range, reset the counters, and | ||
| 1461 | * return number of pages freed. | ||
| 1462 | */ | ||
| 1463 | static inline unsigned long __init __def_free(unsigned long *nr_free, | ||
| 1464 | unsigned long *free_base_pfn, | ||
| 1465 | struct page **page) | ||
| 1466 | { | ||
| 1467 | unsigned long nr = *nr_free; | ||
| 1468 | |||
| 1469 | deferred_free_range(*free_base_pfn, nr); | ||
| 1470 | *free_base_pfn = 0; | ||
| 1471 | *nr_free = 0; | ||
| 1472 | *page = NULL; | ||
| 1473 | |||
| 1474 | return nr; | ||
| 1475 | } | ||
| 1476 | |||
| 1477 | static unsigned long __init deferred_init_range(int nid, int zid, | ||
| 1478 | unsigned long start_pfn, | ||
| 1479 | unsigned long end_pfn) | ||
| 1480 | { | ||
| 1481 | struct mminit_pfnnid_cache nid_init_state = { }; | ||
| 1482 | unsigned long nr_pgmask = pageblock_nr_pages - 1; | ||
| 1483 | unsigned long free_base_pfn = 0; | ||
| 1484 | unsigned long nr_pages = 0; | ||
| 1485 | unsigned long nr_free = 0; | ||
| 1486 | struct page *page = NULL; | ||
| 1487 | unsigned long pfn; | ||
| 1488 | |||
| 1489 | /* | ||
| 1490 | * First we check if pfn is valid on architectures where it is possible | ||
| 1491 | * to have holes within pageblock_nr_pages. On systems where it is not | ||
| 1492 | * possible, this function is optimized out. | ||
| 1493 | * | ||
| 1494 | * Then, we check if a current large page is valid by only checking the | ||
| 1495 | * validity of the head pfn. | ||
| 1496 | * | ||
| 1497 | * meminit_pfn_in_nid is checked on systems where pfns can interleave | ||
| 1498 | * within a node: a pfn is between start and end of a node, but does not | ||
| 1499 | * belong to this memory node. | ||
| 1500 | * | ||
| 1501 | * Finally, we minimize pfn page lookups and scheduler checks by | ||
| 1502 | * performing it only once every pageblock_nr_pages. | ||
| 1503 | * | ||
| 1504 | * We do it in two loops: first we initialize struct page, then free to | ||
| 1505 | * the buddy allocator, because while we are freeing pages we can access | ||
| 1506 | * pages that are ahead (computing buddy page in __free_one_page()). | ||
| 1507 | */ | ||
| 1508 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
| 1509 | if (!pfn_valid_within(pfn)) | ||
| 1510 | continue; | ||
| 1511 | if ((pfn & nr_pgmask) || pfn_valid(pfn)) { | ||
| 1512 | if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { | ||
| 1513 | if (page && (pfn & nr_pgmask)) | ||
| 1514 | page++; | ||
| 1515 | else | ||
| 1516 | page = pfn_to_page(pfn); | ||
| 1517 | __init_single_page(page, pfn, zid, nid); | ||
| 1518 | cond_resched(); | ||
| 1519 | } | ||
| 1520 | } | ||
| 1521 | } | ||
| 1522 | |||
| 1523 | page = NULL; | ||
| 1524 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
| 1525 | if (!pfn_valid_within(pfn)) { | ||
| 1526 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
| 1527 | } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { | ||
| 1528 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
| 1529 | } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { | ||
| 1530 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
| 1531 | } else if (page && (pfn & nr_pgmask)) { | ||
| 1532 | page++; | ||
| 1533 | nr_free++; | ||
| 1534 | } else { | ||
| 1535 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
| 1536 | page = pfn_to_page(pfn); | ||
| 1537 | free_base_pfn = pfn; | ||
| 1538 | nr_free = 1; | ||
| 1539 | cond_resched(); | ||
| 1540 | } | ||
| 1541 | } | ||
| 1542 | /* Free the last block of pages to allocator */ | ||
| 1543 | nr_pages += __def_free(&nr_free, &free_base_pfn, &page); | ||
| 1544 | |||
| 1545 | return nr_pages; | ||
| 1546 | } | ||
| 1547 | |||
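deferred_init_range() above replaces the old goto-driven single loop with two passes: every struct page in the range is initialised first, and only then are contiguous runs handed back in one call each, so the freeing step may safely look at neighbouring pages that were already initialised. The following is a small userspace model of that two-pass, run-batching shape; it uses a plain array instead of the memmap, and pfn_usable() and the hole it fakes are invented.

#include <stdio.h>
#include <stdbool.h>

#define NPAGES	64

static bool pfn_usable(unsigned long pfn)	/* stand-in for pfn_valid() etc. */
{
	return pfn < 20 || pfn >= 28;		/* pretend pfns 20..27 are a hole */
}

static void free_run(unsigned long base, unsigned long n)
{
	if (n)
		printf("free %lu pages starting at pfn %lu\n", n, base);
}

int main(void)
{
	unsigned long pfn, base = 0, nr = 0, done = 0;
	int initialised[NPAGES] = { 0 };

	for (pfn = 0; pfn < NPAGES; pfn++)	/* pass 1: initialise everything */
		if (pfn_usable(pfn))
			initialised[pfn] = 1;

	for (pfn = 0; pfn < NPAGES; pfn++) {	/* pass 2: free in contiguous runs */
		if (!pfn_usable(pfn) || !initialised[pfn]) {
			free_run(base, nr);
			done += nr;
			nr = 0;
		} else if (nr++ == 0) {
			base = pfn;		/* start of a new run */
		}
	}
	free_run(base, nr);			/* free the last run */
	printf("total freed: %lu\n", done + nr);
	return 0;
}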
| 1446 | /* Initialise remaining memory on a node */ | 1548 | /* Initialise remaining memory on a node */ |
| 1447 | static int __init deferred_init_memmap(void *data) | 1549 | static int __init deferred_init_memmap(void *data) |
| 1448 | { | 1550 | { |
| 1449 | pg_data_t *pgdat = data; | 1551 | pg_data_t *pgdat = data; |
| 1450 | int nid = pgdat->node_id; | 1552 | int nid = pgdat->node_id; |
| 1451 | struct mminit_pfnnid_cache nid_init_state = { }; | ||
| 1452 | unsigned long start = jiffies; | 1553 | unsigned long start = jiffies; |
| 1453 | unsigned long nr_pages = 0; | 1554 | unsigned long nr_pages = 0; |
| 1454 | unsigned long walk_start, walk_end; | 1555 | unsigned long spfn, epfn; |
| 1455 | int i, zid; | 1556 | phys_addr_t spa, epa; |
| 1557 | int zid; | ||
| 1456 | struct zone *zone; | 1558 | struct zone *zone; |
| 1457 | unsigned long first_init_pfn = pgdat->first_deferred_pfn; | 1559 | unsigned long first_init_pfn = pgdat->first_deferred_pfn; |
| 1458 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); | 1560 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
| 1561 | u64 i; | ||
| 1459 | 1562 | ||
| 1460 | if (first_init_pfn == ULONG_MAX) { | 1563 | if (first_init_pfn == ULONG_MAX) { |
| 1461 | pgdat_init_report_one_done(); | 1564 | pgdat_init_report_one_done(); |
| @@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data) | |||
| 1477 | if (first_init_pfn < zone_end_pfn(zone)) | 1580 | if (first_init_pfn < zone_end_pfn(zone)) |
| 1478 | break; | 1581 | break; |
| 1479 | } | 1582 | } |
| 1583 | first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); | ||
| 1480 | 1584 | ||
| 1481 | for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { | 1585 | for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
| 1482 | unsigned long pfn, end_pfn; | 1586 | spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
| 1483 | struct page *page = NULL; | 1587 | epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
| 1484 | struct page *free_base_page = NULL; | 1588 | nr_pages += deferred_init_range(nid, zid, spfn, epfn); |
| 1485 | unsigned long free_base_pfn = 0; | ||
| 1486 | int nr_to_free = 0; | ||
| 1487 | |||
| 1488 | end_pfn = min(walk_end, zone_end_pfn(zone)); | ||
| 1489 | pfn = first_init_pfn; | ||
| 1490 | if (pfn < walk_start) | ||
| 1491 | pfn = walk_start; | ||
| 1492 | if (pfn < zone->zone_start_pfn) | ||
| 1493 | pfn = zone->zone_start_pfn; | ||
| 1494 | |||
| 1495 | for (; pfn < end_pfn; pfn++) { | ||
| 1496 | if (!pfn_valid_within(pfn)) | ||
| 1497 | goto free_range; | ||
| 1498 | |||
| 1499 | /* | ||
| 1500 | * Ensure pfn_valid is checked every | ||
| 1501 | * pageblock_nr_pages for memory holes | ||
| 1502 | */ | ||
| 1503 | if ((pfn & (pageblock_nr_pages - 1)) == 0) { | ||
| 1504 | if (!pfn_valid(pfn)) { | ||
| 1505 | page = NULL; | ||
| 1506 | goto free_range; | ||
| 1507 | } | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { | ||
| 1511 | page = NULL; | ||
| 1512 | goto free_range; | ||
| 1513 | } | ||
| 1514 | |||
| 1515 | /* Minimise pfn page lookups and scheduler checks */ | ||
| 1516 | if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { | ||
| 1517 | page++; | ||
| 1518 | } else { | ||
| 1519 | nr_pages += nr_to_free; | ||
| 1520 | deferred_free_range(free_base_page, | ||
| 1521 | free_base_pfn, nr_to_free); | ||
| 1522 | free_base_page = NULL; | ||
| 1523 | free_base_pfn = nr_to_free = 0; | ||
| 1524 | |||
| 1525 | page = pfn_to_page(pfn); | ||
| 1526 | cond_resched(); | ||
| 1527 | } | ||
| 1528 | |||
| 1529 | if (page->flags) { | ||
| 1530 | VM_BUG_ON(page_zone(page) != zone); | ||
| 1531 | goto free_range; | ||
| 1532 | } | ||
| 1533 | |||
| 1534 | __init_single_page(page, pfn, zid, nid); | ||
| 1535 | if (!free_base_page) { | ||
| 1536 | free_base_page = page; | ||
| 1537 | free_base_pfn = pfn; | ||
| 1538 | nr_to_free = 0; | ||
| 1539 | } | ||
| 1540 | nr_to_free++; | ||
| 1541 | |||
| 1542 | /* Where possible, batch up pages for a single free */ | ||
| 1543 | continue; | ||
| 1544 | free_range: | ||
| 1545 | /* Free the current block of pages to allocator */ | ||
| 1546 | nr_pages += nr_to_free; | ||
| 1547 | deferred_free_range(free_base_page, free_base_pfn, | ||
| 1548 | nr_to_free); | ||
| 1549 | free_base_page = NULL; | ||
| 1550 | free_base_pfn = nr_to_free = 0; | ||
| 1551 | } | ||
| 1552 | /* Free the last block of pages to allocator */ | ||
| 1553 | nr_pages += nr_to_free; | ||
| 1554 | deferred_free_range(free_base_page, free_base_pfn, nr_to_free); | ||
| 1555 | |||
| 1556 | first_init_pfn = max(end_pfn, first_init_pfn); | ||
| 1557 | } | 1589 | } |
| 1558 | 1590 | ||
| 1559 | /* Sanity check that the next zone really is unpopulated */ | 1591 | /* Sanity check that the next zone really is unpopulated */ |
| @@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags | |||
| 1792 | * Go through the free lists for the given migratetype and remove | 1824 | * Go through the free lists for the given migratetype and remove |
| 1793 | * the smallest available page from the freelists | 1825 | * the smallest available page from the freelists |
| 1794 | */ | 1826 | */ |
| 1795 | static inline | 1827 | static __always_inline |
| 1796 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 1828 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
| 1797 | int migratetype) | 1829 | int migratetype) |
| 1798 | { | 1830 | { |
| @@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
| 1836 | }; | 1868 | }; |
| 1837 | 1869 | ||
| 1838 | #ifdef CONFIG_CMA | 1870 | #ifdef CONFIG_CMA |
| 1839 | static struct page *__rmqueue_cma_fallback(struct zone *zone, | 1871 | static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| 1840 | unsigned int order) | 1872 | unsigned int order) |
| 1841 | { | 1873 | { |
| 1842 | return __rmqueue_smallest(zone, order, MIGRATE_CMA); | 1874 | return __rmqueue_smallest(zone, order, MIGRATE_CMA); |
| @@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, | |||
| 2217 | * deviation from the rest of this file, to make the for loop | 2249 | * deviation from the rest of this file, to make the for loop |
| 2218 | * condition simpler. | 2250 | * condition simpler. |
| 2219 | */ | 2251 | */ |
| 2220 | static inline bool | 2252 | static __always_inline bool |
| 2221 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 2253 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
| 2222 | { | 2254 | { |
| 2223 | struct free_area *area; | 2255 | struct free_area *area; |
| @@ -2289,8 +2321,8 @@ do_steal: | |||
| 2289 | * Do the hard work of removing an element from the buddy allocator. | 2321 | * Do the hard work of removing an element from the buddy allocator. |
| 2290 | * Call me with the zone->lock already held. | 2322 | * Call me with the zone->lock already held. |
| 2291 | */ | 2323 | */ |
| 2292 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | 2324 | static __always_inline struct page * |
| 2293 | int migratetype) | 2325 | __rmqueue(struct zone *zone, unsigned int order, int migratetype) |
| 2294 | { | 2326 | { |
| 2295 | struct page *page; | 2327 | struct page *page; |
| 2296 | 2328 | ||
| @@ -2315,7 +2347,7 @@ retry: | |||
| 2315 | */ | 2347 | */ |
| 2316 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 2348 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| 2317 | unsigned long count, struct list_head *list, | 2349 | unsigned long count, struct list_head *list, |
| 2318 | int migratetype, bool cold) | 2350 | int migratetype) |
| 2319 | { | 2351 | { |
| 2320 | int i, alloced = 0; | 2352 | int i, alloced = 0; |
| 2321 | 2353 | ||
| @@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
| 2329 | continue; | 2361 | continue; |
| 2330 | 2362 | ||
| 2331 | /* | 2363 | /* |
| 2332 | * Split buddy pages returned by expand() are received here | 2364 | * Split buddy pages returned by expand() are received here in |
| 2333 | * in physical page order. The page is added to the callers and | 2365 | * physical page order. The page is added to the tail of |
| 2334 | * list and the list head then moves forward. From the caller's | 2366 | * caller's list. From the caller's perspective, the linked list |
| 2335 | * perspective, the linked list is ordered by page number in | 2367 | * is ordered by page number under some conditions. This is |
| 2336 | * some conditions. This is useful for IO devices that can | 2368 | * useful for IO devices that can forward direction from the |
| 2337 | * merge IO requests if the physical pages are ordered | 2369 | * head, thus also in the physical page order. This is useful |
| 2338 | * properly. | 2370 | * for IO devices that can merge IO requests if the physical |
| 2371 | * pages are ordered properly. | ||
| 2339 | */ | 2372 | */ |
| 2340 | if (likely(!cold)) | 2373 | list_add_tail(&page->lru, list); |
| 2341 | list_add(&page->lru, list); | ||
| 2342 | else | ||
| 2343 | list_add_tail(&page->lru, list); | ||
| 2344 | list = &page->lru; | ||
| 2345 | alloced++; | 2374 | alloced++; |
| 2346 | if (is_migrate_cma(get_pcppage_migratetype(page))) | 2375 | if (is_migrate_cma(get_pcppage_migratetype(page))) |
| 2347 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | 2376 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
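With __GFP_COLD gone, rmqueue_bulk() above always queues refilled pages at the tail, and __rmqueue_pcplist() (further down) always takes from the head, so a batch drains in the same ascending pfn order in which expand() produced it. A trivial model of that tail-in/head-out ordering, using a plain array as the "list":

#include <stdio.h>

int main(void)
{
	unsigned long list[8];
	int head = 0, tail = 0, i;

	for (i = 0; i < 8; i++)		/* rmqueue_bulk(): list_add_tail() */
		list[tail++] = 1000 + i;	/* pfns arrive in ascending order */

	for (i = 0; i < 3; i++)		/* __rmqueue_pcplist(): list_first_entry() */
		printf("allocated pfn %lu\n", list[head++]);
	return 0;
}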
| @@ -2478,10 +2507,6 @@ void drain_all_pages(struct zone *zone) | |||
| 2478 | if (WARN_ON_ONCE(!mm_percpu_wq)) | 2507 | if (WARN_ON_ONCE(!mm_percpu_wq)) |
| 2479 | return; | 2508 | return; |
| 2480 | 2509 | ||
| 2481 | /* Workqueues cannot recurse */ | ||
| 2482 | if (current->flags & PF_WQ_WORKER) | ||
| 2483 | return; | ||
| 2484 | |||
| 2485 | /* | 2510 | /* |
| 2486 | * Do not drain if one is already in progress unless it's specific to | 2511 | * Do not drain if one is already in progress unless it's specific to |
| 2487 | * a zone. Such callers are primarily CMA and memory hotplug and need | 2512 | * a zone. Such callers are primarily CMA and memory hotplug and need |
| @@ -2590,24 +2615,25 @@ void mark_free_pages(struct zone *zone) | |||
| 2590 | } | 2615 | } |
| 2591 | #endif /* CONFIG_PM */ | 2616 | #endif /* CONFIG_PM */ |
| 2592 | 2617 | ||
| 2593 | /* | 2618 | static bool free_unref_page_prepare(struct page *page, unsigned long pfn) |
| 2594 | * Free a 0-order page | ||
| 2595 | * cold == true ? free a cold page : free a hot page | ||
| 2596 | */ | ||
| 2597 | void free_hot_cold_page(struct page *page, bool cold) | ||
| 2598 | { | 2619 | { |
| 2599 | struct zone *zone = page_zone(page); | ||
| 2600 | struct per_cpu_pages *pcp; | ||
| 2601 | unsigned long flags; | ||
| 2602 | unsigned long pfn = page_to_pfn(page); | ||
| 2603 | int migratetype; | 2620 | int migratetype; |
| 2604 | 2621 | ||
| 2605 | if (!free_pcp_prepare(page)) | 2622 | if (!free_pcp_prepare(page)) |
| 2606 | return; | 2623 | return false; |
| 2607 | 2624 | ||
| 2608 | migratetype = get_pfnblock_migratetype(page, pfn); | 2625 | migratetype = get_pfnblock_migratetype(page, pfn); |
| 2609 | set_pcppage_migratetype(page, migratetype); | 2626 | set_pcppage_migratetype(page, migratetype); |
| 2610 | local_irq_save(flags); | 2627 | return true; |
| 2628 | } | ||
| 2629 | |||
| 2630 | static void free_unref_page_commit(struct page *page, unsigned long pfn) | ||
| 2631 | { | ||
| 2632 | struct zone *zone = page_zone(page); | ||
| 2633 | struct per_cpu_pages *pcp; | ||
| 2634 | int migratetype; | ||
| 2635 | |||
| 2636 | migratetype = get_pcppage_migratetype(page); | ||
| 2611 | __count_vm_event(PGFREE); | 2637 | __count_vm_event(PGFREE); |
| 2612 | 2638 | ||
| 2613 | /* | 2639 | /* |
| @@ -2620,38 +2646,62 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
| 2620 | if (migratetype >= MIGRATE_PCPTYPES) { | 2646 | if (migratetype >= MIGRATE_PCPTYPES) { |
| 2621 | if (unlikely(is_migrate_isolate(migratetype))) { | 2647 | if (unlikely(is_migrate_isolate(migratetype))) { |
| 2622 | free_one_page(zone, page, pfn, 0, migratetype); | 2648 | free_one_page(zone, page, pfn, 0, migratetype); |
| 2623 | goto out; | 2649 | return; |
| 2624 | } | 2650 | } |
| 2625 | migratetype = MIGRATE_MOVABLE; | 2651 | migratetype = MIGRATE_MOVABLE; |
| 2626 | } | 2652 | } |
| 2627 | 2653 | ||
| 2628 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 2654 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
| 2629 | if (!cold) | 2655 | list_add(&page->lru, &pcp->lists[migratetype]); |
| 2630 | list_add(&page->lru, &pcp->lists[migratetype]); | ||
| 2631 | else | ||
| 2632 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
| 2633 | pcp->count++; | 2656 | pcp->count++; |
| 2634 | if (pcp->count >= pcp->high) { | 2657 | if (pcp->count >= pcp->high) { |
| 2635 | unsigned long batch = READ_ONCE(pcp->batch); | 2658 | unsigned long batch = READ_ONCE(pcp->batch); |
| 2636 | free_pcppages_bulk(zone, batch, pcp); | 2659 | free_pcppages_bulk(zone, batch, pcp); |
| 2637 | pcp->count -= batch; | 2660 | pcp->count -= batch; |
| 2638 | } | 2661 | } |
| 2662 | } | ||
| 2639 | 2663 | ||
| 2640 | out: | 2664 | /* |
| 2665 | * Free a 0-order page | ||
| 2666 | */ | ||
| 2667 | void free_unref_page(struct page *page) | ||
| 2668 | { | ||
| 2669 | unsigned long flags; | ||
| 2670 | unsigned long pfn = page_to_pfn(page); | ||
| 2671 | |||
| 2672 | if (!free_unref_page_prepare(page, pfn)) | ||
| 2673 | return; | ||
| 2674 | |||
| 2675 | local_irq_save(flags); | ||
| 2676 | free_unref_page_commit(page, pfn); | ||
| 2641 | local_irq_restore(flags); | 2677 | local_irq_restore(flags); |
| 2642 | } | 2678 | } |
| 2643 | 2679 | ||
| 2644 | /* | 2680 | /* |
| 2645 | * Free a list of 0-order pages | 2681 | * Free a list of 0-order pages |
| 2646 | */ | 2682 | */ |
| 2647 | void free_hot_cold_page_list(struct list_head *list, bool cold) | 2683 | void free_unref_page_list(struct list_head *list) |
| 2648 | { | 2684 | { |
| 2649 | struct page *page, *next; | 2685 | struct page *page, *next; |
| 2686 | unsigned long flags, pfn; | ||
| 2687 | |||
| 2688 | /* Prepare pages for freeing */ | ||
| 2689 | list_for_each_entry_safe(page, next, list, lru) { | ||
| 2690 | pfn = page_to_pfn(page); | ||
| 2691 | if (!free_unref_page_prepare(page, pfn)) | ||
| 2692 | list_del(&page->lru); | ||
| 2693 | set_page_private(page, pfn); | ||
| 2694 | } | ||
| 2650 | 2695 | ||
| 2696 | local_irq_save(flags); | ||
| 2651 | list_for_each_entry_safe(page, next, list, lru) { | 2697 | list_for_each_entry_safe(page, next, list, lru) { |
| 2652 | trace_mm_page_free_batched(page, cold); | 2698 | unsigned long pfn = page_private(page); |
| 2653 | free_hot_cold_page(page, cold); | 2699 | |
| 2700 | set_page_private(page, 0); | ||
| 2701 | trace_mm_page_free_batched(page); | ||
| 2702 | free_unref_page_commit(page, pfn); | ||
| 2654 | } | 2703 | } |
| 2704 | local_irq_restore(flags); | ||
| 2655 | } | 2705 | } |
| 2656 | 2706 | ||
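free_hot_cold_page() is split above into free_unref_page_prepare(), which runs before interrupts are disabled, and free_unref_page_commit(), which runs inside the irq-off section; free_unref_page_list() then commits an entire list under a single local_irq_save()/restore() pair instead of toggling interrupts once per page. A userspace sketch of that prepare/commit shape follows; all names in it are invented.

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 4

struct fake_page { unsigned long pfn; bool ok; };

static bool prepare_page(struct fake_page *p)	/* free_unref_page_prepare() */
{
	p->ok = (p->pfn != 2);			/* pretend pfn 2 fails its checks */
	return p->ok;
}

static void commit_page(struct fake_page *p)	/* free_unref_page_commit() */
{
	printf("  queued pfn %lu on the per-cpu list\n", p->pfn);
}

int main(void)
{
	struct fake_page pages[NPAGES] = { { 0 }, { 1 }, { 2 }, { 3 } };
	int i;

	for (i = 0; i < NPAGES; i++)		/* outside the critical section */
		prepare_page(&pages[i]);

	printf("irq off\n");			/* local_irq_save() */
	for (i = 0; i < NPAGES; i++)
		if (pages[i].ok)
			commit_page(&pages[i]);
	printf("irq on\n");			/* local_irq_restore() */
	return 0;
}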
| 2657 | /* | 2707 | /* |
| @@ -2669,15 +2719,6 @@ void split_page(struct page *page, unsigned int order) | |||
| 2669 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2719 | VM_BUG_ON_PAGE(PageCompound(page), page); |
| 2670 | VM_BUG_ON_PAGE(!page_count(page), page); | 2720 | VM_BUG_ON_PAGE(!page_count(page), page); |
| 2671 | 2721 | ||
| 2672 | #ifdef CONFIG_KMEMCHECK | ||
| 2673 | /* | ||
| 2674 | * Split shadow pages too, because free(page[0]) would | ||
| 2675 | * otherwise free the whole shadow. | ||
| 2676 | */ | ||
| 2677 | if (kmemcheck_page_is_tracked(page)) | ||
| 2678 | split_page(virt_to_page(page[0].shadow), order); | ||
| 2679 | #endif | ||
| 2680 | |||
| 2681 | for (i = 1; i < (1 << order); i++) | 2722 | for (i = 1; i < (1 << order); i++) |
| 2682 | set_page_refcounted(page + i); | 2723 | set_page_refcounted(page + i); |
| 2683 | split_page_owner(page, order); | 2724 | split_page_owner(page, order); |
| @@ -2743,6 +2784,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
| 2743 | #ifdef CONFIG_NUMA | 2784 | #ifdef CONFIG_NUMA |
| 2744 | enum numa_stat_item local_stat = NUMA_LOCAL; | 2785 | enum numa_stat_item local_stat = NUMA_LOCAL; |
| 2745 | 2786 | ||
| 2787 | /* skip NUMA counter updates if NUMA stats are disabled */ | ||
| 2788 | if (!static_branch_likely(&vm_numa_stat_key)) | ||
| 2789 | return; | ||
| 2790 | |||
| 2746 | if (z->node != numa_node_id()) | 2791 | if (z->node != numa_node_id()) |
| 2747 | local_stat = NUMA_OTHER; | 2792 | local_stat = NUMA_OTHER; |
| 2748 | 2793 | ||
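Together with the DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key) added near the top of the file, this early return lets the NUMA counters be switched off at runtime at essentially no cost on the allocation fast path. The sketch below models only the control flow with a plain boolean, not the kernel's static-branch code patching; numa_stats_on and the two counters are invented.

#include <stdbool.h>
#include <stdio.h>

static bool numa_stats_on = true;	/* kernel: DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key) */
static unsigned long numa_hit, numa_other;

static void zone_statistics_model(int zone_node, int current_node)
{
	if (!numa_stats_on)		/* kernel: !static_branch_likely(&vm_numa_stat_key) */
		return;

	if (zone_node == current_node)
		numa_hit++;
	else
		numa_other++;
}

int main(void)
{
	zone_statistics_model(0, 0);
	numa_stats_on = false;		/* e.g. toggled via a sysctl at runtime */
	zone_statistics_model(0, 1);	/* no counter is touched */
	printf("hit=%lu other=%lu\n", numa_hit, numa_other);
	return 0;
}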
| @@ -2758,7 +2803,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
| 2758 | 2803 | ||
| 2759 | /* Remove page from the per-cpu list, caller must protect the list */ | 2804 | /* Remove page from the per-cpu list, caller must protect the list */ |
| 2760 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | 2805 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, |
| 2761 | bool cold, struct per_cpu_pages *pcp, | 2806 | struct per_cpu_pages *pcp, |
| 2762 | struct list_head *list) | 2807 | struct list_head *list) |
| 2763 | { | 2808 | { |
| 2764 | struct page *page; | 2809 | struct page *page; |
| @@ -2767,16 +2812,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | |||
| 2767 | if (list_empty(list)) { | 2812 | if (list_empty(list)) { |
| 2768 | pcp->count += rmqueue_bulk(zone, 0, | 2813 | pcp->count += rmqueue_bulk(zone, 0, |
| 2769 | pcp->batch, list, | 2814 | pcp->batch, list, |
| 2770 | migratetype, cold); | 2815 | migratetype); |
| 2771 | if (unlikely(list_empty(list))) | 2816 | if (unlikely(list_empty(list))) |
| 2772 | return NULL; | 2817 | return NULL; |
| 2773 | } | 2818 | } |
| 2774 | 2819 | ||
| 2775 | if (cold) | 2820 | page = list_first_entry(list, struct page, lru); |
| 2776 | page = list_last_entry(list, struct page, lru); | ||
| 2777 | else | ||
| 2778 | page = list_first_entry(list, struct page, lru); | ||
| 2779 | |||
| 2780 | list_del(&page->lru); | 2821 | list_del(&page->lru); |
| 2781 | pcp->count--; | 2822 | pcp->count--; |
| 2782 | } while (check_new_pcp(page)); | 2823 | } while (check_new_pcp(page)); |
| @@ -2791,14 +2832,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, | |||
| 2791 | { | 2832 | { |
| 2792 | struct per_cpu_pages *pcp; | 2833 | struct per_cpu_pages *pcp; |
| 2793 | struct list_head *list; | 2834 | struct list_head *list; |
| 2794 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | ||
| 2795 | struct page *page; | 2835 | struct page *page; |
| 2796 | unsigned long flags; | 2836 | unsigned long flags; |
| 2797 | 2837 | ||
| 2798 | local_irq_save(flags); | 2838 | local_irq_save(flags); |
| 2799 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 2839 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
| 2800 | list = &pcp->lists[migratetype]; | 2840 | list = &pcp->lists[migratetype]; |
| 2801 | page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); | 2841 | page = __rmqueue_pcplist(zone, migratetype, pcp, list); |
| 2802 | if (page) { | 2842 | if (page) { |
| 2803 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); | 2843 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
| 2804 | zone_statistics(preferred_zone, zone); | 2844 | zone_statistics(preferred_zone, zone); |
| @@ -3006,9 +3046,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, | |||
| 3006 | if (!area->nr_free) | 3046 | if (!area->nr_free) |
| 3007 | continue; | 3047 | continue; |
| 3008 | 3048 | ||
| 3009 | if (alloc_harder) | ||
| 3010 | return true; | ||
| 3011 | |||
| 3012 | for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { | 3049 | for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { |
| 3013 | if (!list_empty(&area->free_list[mt])) | 3050 | if (!list_empty(&area->free_list[mt])) |
| 3014 | return true; | 3051 | return true; |
| @@ -3020,6 +3057,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, | |||
| 3020 | return true; | 3057 | return true; |
| 3021 | } | 3058 | } |
| 3022 | #endif | 3059 | #endif |
| 3060 | if (alloc_harder && | ||
| 3061 | !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) | ||
| 3062 | return true; | ||
| 3023 | } | 3063 | } |
| 3024 | return false; | 3064 | return false; |
| 3025 | } | 3065 | } |
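The watermark hunk above drops the early "alloc_harder implies any free page will do" shortcut and instead consults the MIGRATE_HIGHATOMIC free list only after the regular (and, where allowed, CMA) lists, so a free_area whose only pages sit in a list this request cannot use no longer produces a false positive. A compact model of the reordered per-order check; the enum values and order_ok() are made up.

#include <stdbool.h>
#include <stdio.h>

enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NR_PCPTYPES,
       HIGHATOMIC = NR_PCPTYPES, ISOLATE, NR_MT };

static bool order_ok(const int nr_free[NR_MT], bool alloc_harder)
{
	int mt;

	for (mt = 0; mt < NR_PCPTYPES; mt++)	/* usable lists first */
		if (nr_free[mt])
			return true;

	/* (a CMA list check would sit here when the request may use CMA) */

	if (alloc_harder && nr_free[HIGHATOMIC])	/* only now, and only this list */
		return true;

	return false;
}

int main(void)
{
	int only_isolated[NR_MT] = { 0 };

	only_isolated[ISOLATE] = 4;	/* free pages exist, but none this request can use */
	printf("%d\n", order_ok(only_isolated, true));	/* prints 0 */
	return 0;
}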
| @@ -3235,20 +3275,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) | |||
| 3235 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | 3275 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) |
| 3236 | return; | 3276 | return; |
| 3237 | 3277 | ||
| 3238 | pr_warn("%s: ", current->comm); | ||
| 3239 | |||
| 3240 | va_start(args, fmt); | 3278 | va_start(args, fmt); |
| 3241 | vaf.fmt = fmt; | 3279 | vaf.fmt = fmt; |
| 3242 | vaf.va = &args; | 3280 | vaf.va = &args; |
| 3243 | pr_cont("%pV", &vaf); | 3281 | pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", |
| 3282 | current->comm, &vaf, gfp_mask, &gfp_mask, | ||
| 3283 | nodemask_pr_args(nodemask)); | ||
| 3244 | va_end(args); | 3284 | va_end(args); |
| 3245 | 3285 | ||
| 3246 | pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); | ||
| 3247 | if (nodemask) | ||
| 3248 | pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); | ||
| 3249 | else | ||
| 3250 | pr_cont("(null)\n"); | ||
| 3251 | |||
| 3252 | cpuset_print_current_mems_allowed(); | 3286 | cpuset_print_current_mems_allowed(); |
| 3253 | 3287 | ||
| 3254 | dump_stack(); | 3288 | dump_stack(); |
| @@ -3868,8 +3902,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 3868 | enum compact_result compact_result; | 3902 | enum compact_result compact_result; |
| 3869 | int compaction_retries; | 3903 | int compaction_retries; |
| 3870 | int no_progress_loops; | 3904 | int no_progress_loops; |
| 3871 | unsigned long alloc_start = jiffies; | ||
| 3872 | unsigned int stall_timeout = 10 * HZ; | ||
| 3873 | unsigned int cpuset_mems_cookie; | 3905 | unsigned int cpuset_mems_cookie; |
| 3874 | int reserve_flags; | 3906 | int reserve_flags; |
| 3875 | 3907 | ||
| @@ -4001,14 +4033,6 @@ retry: | |||
| 4001 | if (!can_direct_reclaim) | 4033 | if (!can_direct_reclaim) |
| 4002 | goto nopage; | 4034 | goto nopage; |
| 4003 | 4035 | ||
| 4004 | /* Make sure we know about allocations which stall for too long */ | ||
| 4005 | if (time_after(jiffies, alloc_start + stall_timeout)) { | ||
| 4006 | warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, | ||
| 4007 | "page allocation stalls for %ums, order:%u", | ||
| 4008 | jiffies_to_msecs(jiffies-alloc_start), order); | ||
| 4009 | stall_timeout += 10 * HZ; | ||
| 4010 | } | ||
| 4011 | |||
| 4012 | /* Avoid recursion of direct reclaim */ | 4036 | /* Avoid recursion of direct reclaim */ |
| 4013 | if (current->flags & PF_MEMALLOC) | 4037 | if (current->flags & PF_MEMALLOC) |
| 4014 | goto nopage; | 4038 | goto nopage; |
| @@ -4223,9 +4247,6 @@ out: | |||
| 4223 | page = NULL; | 4247 | page = NULL; |
| 4224 | } | 4248 | } |
| 4225 | 4249 | ||
| 4226 | if (kmemcheck_enabled && page) | ||
| 4227 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
| 4228 | |||
| 4229 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | 4250 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); |
| 4230 | 4251 | ||
| 4231 | return page; | 4252 | return page; |
| @@ -4262,7 +4283,7 @@ void __free_pages(struct page *page, unsigned int order) | |||
| 4262 | { | 4283 | { |
| 4263 | if (put_page_testzero(page)) { | 4284 | if (put_page_testzero(page)) { |
| 4264 | if (order == 0) | 4285 | if (order == 0) |
| 4265 | free_hot_cold_page(page, false); | 4286 | free_unref_page(page); |
| 4266 | else | 4287 | else |
| 4267 | __free_pages_ok(page, order); | 4288 | __free_pages_ok(page, order); |
| 4268 | } | 4289 | } |
| @@ -4320,7 +4341,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) | |||
| 4320 | unsigned int order = compound_order(page); | 4341 | unsigned int order = compound_order(page); |
| 4321 | 4342 | ||
| 4322 | if (order == 0) | 4343 | if (order == 0) |
| 4323 | free_hot_cold_page(page, false); | 4344 | free_unref_page(page); |
| 4324 | else | 4345 | else |
| 4325 | __free_pages_ok(page, order); | 4346 | __free_pages_ok(page, order); |
| 4326 | } | 4347 | } |
| @@ -6126,6 +6147,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
| 6126 | } | 6147 | } |
| 6127 | } | 6148 | } |
| 6128 | 6149 | ||
| 6150 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
| 6129 | static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | 6151 | static void __ref alloc_node_mem_map(struct pglist_data *pgdat) |
| 6130 | { | 6152 | { |
| 6131 | unsigned long __maybe_unused start = 0; | 6153 | unsigned long __maybe_unused start = 0; |
| @@ -6135,7 +6157,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 6135 | if (!pgdat->node_spanned_pages) | 6157 | if (!pgdat->node_spanned_pages) |
| 6136 | return; | 6158 | return; |
| 6137 | 6159 | ||
| 6138 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
| 6139 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 6160 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
| 6140 | offset = pgdat->node_start_pfn - start; | 6161 | offset = pgdat->node_start_pfn - start; |
| 6141 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | 6162 | /* ia64 gets its own node_mem_map, before this, without bootmem */ |
| @@ -6157,6 +6178,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 6157 | pgdat->node_id); | 6178 | pgdat->node_id); |
| 6158 | pgdat->node_mem_map = map + offset; | 6179 | pgdat->node_mem_map = map + offset; |
| 6159 | } | 6180 | } |
| 6181 | pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", | ||
| 6182 | __func__, pgdat->node_id, (unsigned long)pgdat, | ||
| 6183 | (unsigned long)pgdat->node_mem_map); | ||
| 6160 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 6184 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 6161 | /* | 6185 | /* |
| 6162 | * With no DISCONTIG, the global mem_map is just set as node 0's | 6186 | * With no DISCONTIG, the global mem_map is just set as node 0's |
| @@ -6169,8 +6193,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 6169 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 6193 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
| 6170 | } | 6194 | } |
| 6171 | #endif | 6195 | #endif |
| 6172 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
| 6173 | } | 6196 | } |
| 6197 | #else | ||
| 6198 | static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } | ||
| 6199 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
| 6174 | 6200 | ||
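The mem_map hunk above moves the CONFIG_FLAT_NODE_MEM_MAP guard out of the function body and supplies an empty stub when the option is off, so free_area_init_node() can call alloc_node_mem_map() unconditionally. Here is the pattern in isolation; the config and function names are placeholders.

#include <stdio.h>

#define CONFIG_FEATURE_X 1		/* flip to 0 to get the stub */

#if CONFIG_FEATURE_X
static void setup_feature_x(void)
{
	printf("feature X: doing real setup\n");
}
#else
static void setup_feature_x(void) { }	/* same shape as alloc_node_mem_map()'s stub */
#endif

int main(void)
{
	setup_feature_x();		/* the caller stays #ifdef-free */
	return 0;
}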
| 6175 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | 6201 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, |
| 6176 | unsigned long node_start_pfn, unsigned long *zholes_size) | 6202 | unsigned long node_start_pfn, unsigned long *zholes_size) |
| @@ -6197,16 +6223,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
| 6197 | zones_size, zholes_size); | 6223 | zones_size, zholes_size); |
| 6198 | 6224 | ||
| 6199 | alloc_node_mem_map(pgdat); | 6225 | alloc_node_mem_map(pgdat); |
| 6200 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
| 6201 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", | ||
| 6202 | nid, (unsigned long)pgdat, | ||
| 6203 | (unsigned long)pgdat->node_mem_map); | ||
| 6204 | #endif | ||
| 6205 | 6226 | ||
| 6206 | reset_deferred_meminit(pgdat); | 6227 | reset_deferred_meminit(pgdat); |
| 6207 | free_area_init_core(pgdat); | 6228 | free_area_init_core(pgdat); |
| 6208 | } | 6229 | } |
| 6209 | 6230 | ||
| 6231 | #ifdef CONFIG_HAVE_MEMBLOCK | ||
| 6232 | /* | ||
| 6233 | * Only struct pages that are backed by physical memory are zeroed and | ||
| 6234 | * initialized by going through __init_single_page(). But, there are some | ||
| 6235 | * struct pages which are reserved in memblock allocator and their fields | ||
| 6236 | * may be accessed (for example, page_to_pfn() on some configurations | ||
| 6237 | * accesses page->flags). We must explicitly zero those struct pages. | ||
| 6238 | */ | ||
| 6239 | void __paginginit zero_resv_unavail(void) | ||
| 6240 | { | ||
| 6241 | phys_addr_t start, end; | ||
| 6242 | unsigned long pfn; | ||
| 6243 | u64 i, pgcnt; | ||
| 6244 | |||
| 6245 | /* | ||
| 6246 | * Loop through ranges that are reserved, but do not have reported | ||
| 6247 | * physical memory backing. | ||
| 6248 | */ | ||
| 6249 | pgcnt = 0; | ||
| 6250 | for_each_resv_unavail_range(i, &start, &end) { | ||
| 6251 | for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { | ||
| 6252 | mm_zero_struct_page(pfn_to_page(pfn)); | ||
| 6253 | pgcnt++; | ||
| 6254 | } | ||
| 6255 | } | ||
| 6256 | |||
| 6257 | /* | ||
| 6258 | * Struct pages that do not have backing memory. This could be because | ||
| 6259 | * firmware is using some of this memory, or for some other reasons. | ||
| 6260 | * Once memblock is changed so that such behaviour is no longer allowed | ||
| 6261 | * (i.e. the list of "reserved" memory is a subset of the list of "memory"), | ||
| 6262 | * this code can be removed. | ||
| 6263 | */ | ||
| 6264 | if (pgcnt) | ||
| 6265 | pr_info("Reserved but unavailable: %lld pages", pgcnt); | ||
| 6266 | } | ||
| 6267 | #endif /* CONFIG_HAVE_MEMBLOCK */ | ||
| 6268 | |||
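zero_resv_unavail() walks memblock ranges that are reserved but have no usable memory behind them and zeroes their struct pages explicitly, since __init_single_page() never sees them. A userspace model of the PFN_DOWN/PFN_UP walk follows; the ranges, page_model and the 4 KiB page size are assumptions of the sketch.

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT	12
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define PFN_UP(x)	(((x) + (1ULL << PAGE_SHIFT) - 1) >> PAGE_SHIFT)

struct page_model { unsigned long flags; };	/* stand-in for struct page */

int main(void)
{
	static struct page_model pages[64];
	/* pretend these are reserved ranges with no usable memory behind them */
	unsigned long long resv[][2] = { { 0x0000, 0x3000 }, { 0x8800, 0xa000 } };
	unsigned long long pgcnt = 0;
	unsigned long pfn;
	size_t i;

	for (i = 0; i < sizeof(resv) / sizeof(resv[0]); i++)
		for (pfn = PFN_DOWN(resv[i][0]); pfn < PFN_UP(resv[i][1]); pfn++) {
			memset(&pages[pfn], 0, sizeof(pages[pfn]));	/* mm_zero_struct_page() */
			pgcnt++;
		}

	if (pgcnt)
		printf("Reserved but unavailable: %llu pages\n", pgcnt);
	return 0;
}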
| 6210 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 6269 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 6211 | 6270 | ||
| 6212 | #if MAX_NUMNODES > 1 | 6271 | #if MAX_NUMNODES > 1 |
| @@ -6630,6 +6689,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 6630 | node_set_state(nid, N_MEMORY); | 6689 | node_set_state(nid, N_MEMORY); |
| 6631 | check_for_memory(pgdat, nid); | 6690 | check_for_memory(pgdat, nid); |
| 6632 | } | 6691 | } |
| 6692 | zero_resv_unavail(); | ||
| 6633 | } | 6693 | } |
| 6634 | 6694 | ||
| 6635 | static int __init cmdline_parse_core(char *p, unsigned long *core) | 6695 | static int __init cmdline_parse_core(char *p, unsigned long *core) |
| @@ -6793,6 +6853,7 @@ void __init free_area_init(unsigned long *zones_size) | |||
| 6793 | { | 6853 | { |
| 6794 | free_area_init_node(0, zones_size, | 6854 | free_area_init_node(0, zones_size, |
| 6795 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 6855 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
| 6856 | zero_resv_unavail(); | ||
| 6796 | } | 6857 | } |
| 6797 | 6858 | ||
| 6798 | static int page_alloc_cpu_dead(unsigned int cpu) | 6859 | static int page_alloc_cpu_dead(unsigned int cpu) |
| @@ -7305,18 +7366,17 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 7305 | 7366 | ||
| 7306 | log2qty = ilog2(numentries); | 7367 | log2qty = ilog2(numentries); |
| 7307 | 7368 | ||
| 7308 | /* | ||
| 7309 | * memblock allocator returns zeroed memory already, so HASH_ZERO is | ||
| 7310 | * currently not used when HASH_EARLY is specified. | ||
| 7311 | */ | ||
| 7312 | gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; | 7369 | gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; |
| 7313 | do { | 7370 | do { |
| 7314 | size = bucketsize << log2qty; | 7371 | size = bucketsize << log2qty; |
| 7315 | if (flags & HASH_EARLY) | 7372 | if (flags & HASH_EARLY) { |
| 7316 | table = memblock_virt_alloc_nopanic(size, 0); | 7373 | if (flags & HASH_ZERO) |
| 7317 | else if (hashdist) | 7374 | table = memblock_virt_alloc_nopanic(size, 0); |
| 7375 | else | ||
| 7376 | table = memblock_virt_alloc_raw(size, 0); | ||
| 7377 | } else if (hashdist) { | ||
| 7318 | table = __vmalloc(size, gfp_flags, PAGE_KERNEL); | 7378 | table = __vmalloc(size, gfp_flags, PAGE_KERNEL); |
| 7319 | else { | 7379 | } else { |
| 7320 | /* | 7380 | /* |
| 7321 | * If bucketsize is not a power-of-two, we may free | 7381 | * If bucketsize is not a power-of-two, we may free |
| 7322 | * some pages at the end of hash table which | 7382 | * some pages at the end of hash table which |
| @@ -7353,10 +7413,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 7353 | * race condition. So you can't expect this function should be exact. | 7413 | * race condition. So you can't expect this function should be exact. |
| 7354 | */ | 7414 | */ |
| 7355 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | 7415 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
| 7416 | int migratetype, | ||
| 7356 | bool skip_hwpoisoned_pages) | 7417 | bool skip_hwpoisoned_pages) |
| 7357 | { | 7418 | { |
| 7358 | unsigned long pfn, iter, found; | 7419 | unsigned long pfn, iter, found; |
| 7359 | int mt; | ||
| 7360 | 7420 | ||
| 7361 | /* | 7421 | /* |
| 7362 | * For avoiding noise data, lru_add_drain_all() should be called | 7422 | * For avoiding noise data, lru_add_drain_all() should be called |
| @@ -7364,8 +7424,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
| 7364 | */ | 7424 | */ |
| 7365 | if (zone_idx(zone) == ZONE_MOVABLE) | 7425 | if (zone_idx(zone) == ZONE_MOVABLE) |
| 7366 | return false; | 7426 | return false; |
| 7367 | mt = get_pageblock_migratetype(page); | 7427 | |
| 7368 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 7428 | /* |
| 7429 | * CMA allocations (alloc_contig_range) really need to mark isolate | ||
| 7430 | * CMA pageblocks even when they are not movable in fact so consider | ||
| 7431 | * them movable here. | ||
| 7432 | */ | ||
| 7433 | if (is_migrate_cma(migratetype) && | ||
| 7434 | is_migrate_cma(get_pageblock_migratetype(page))) | ||
| 7369 | return false; | 7435 | return false; |
| 7370 | 7436 | ||
| 7371 | pfn = page_to_pfn(page); | 7437 | pfn = page_to_pfn(page); |
| @@ -7377,6 +7443,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
| 7377 | 7443 | ||
| 7378 | page = pfn_to_page(check); | 7444 | page = pfn_to_page(check); |
| 7379 | 7445 | ||
| 7446 | if (PageReserved(page)) | ||
| 7447 | return true; | ||
| 7448 | |||
| 7380 | /* | 7449 | /* |
| 7381 | * Hugepages are not in LRU lists, but they're movable. | 7450 | * Hugepages are not in LRU lists, but they're movable. |
| 7382 | * We need not scan over tail pages because we don't | 7451 | * We need not scan over tail pages because we don't |
| @@ -7450,7 +7519,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
| 7450 | if (!zone_spans_pfn(zone, pfn)) | 7519 | if (!zone_spans_pfn(zone, pfn)) |
| 7451 | return false; | 7520 | return false; |
| 7452 | 7521 | ||
| 7453 | return !has_unmovable_pages(zone, page, 0, true); | 7522 | return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); |
| 7454 | } | 7523 | } |
| 7455 | 7524 | ||
| 7456 | #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) | 7525 | #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) |
| @@ -7546,6 +7615,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
| 7546 | .zone = page_zone(pfn_to_page(start)), | 7615 | .zone = page_zone(pfn_to_page(start)), |
| 7547 | .mode = MIGRATE_SYNC, | 7616 | .mode = MIGRATE_SYNC, |
| 7548 | .ignore_skip_hint = true, | 7617 | .ignore_skip_hint = true, |
| 7618 | .no_set_skip_hint = true, | ||
| 7549 | .gfp_mask = current_gfp_context(gfp_mask), | 7619 | .gfp_mask = current_gfp_context(gfp_mask), |
| 7550 | }; | 7620 | }; |
| 7551 | INIT_LIST_HEAD(&cc.migratepages); | 7621 | INIT_LIST_HEAD(&cc.migratepages); |
| @@ -7582,11 +7652,18 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
| 7582 | 7652 | ||
| 7583 | /* | 7653 | /* |
| 7584 | * In case of -EBUSY, we'd like to know which page causes problem. | 7654 | * In case of -EBUSY, we'd like to know which page causes problem. |
| 7585 | * So, just fall through. We will check it in test_pages_isolated(). | 7655 | * So, just fall through. test_pages_isolated() has a tracepoint |
| 7656 | * which will report the busy page. | ||
| 7657 | * | ||
| 7658 | * It is possible that busy pages could become available before | ||
| 7659 | * the call to test_pages_isolated, and the range will actually be | ||
| 7660 | * allocated. So, if we fall through be sure to clear ret so that | ||
| 7661 | * -EBUSY is not accidentally used or returned to caller. | ||
| 7586 | */ | 7662 | */ |
| 7587 | ret = __alloc_contig_migrate_range(&cc, start, end); | 7663 | ret = __alloc_contig_migrate_range(&cc, start, end); |
| 7588 | if (ret && ret != -EBUSY) | 7664 | if (ret && ret != -EBUSY) |
| 7589 | goto done; | 7665 | goto done; |
| 7666 | ret = 0; | ||
| 7590 | 7667 | ||
| 7591 | /* | 7668 | /* |
| 7592 | * Pages from [start, end) are within a MAX_ORDER_NR_PAGES | 7669 | * Pages from [start, end) are within a MAX_ORDER_NR_PAGES |
