path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 479
 1 file changed, 278 insertions(+), 201 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c5c57b..73f5d4556b3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
24#include <linux/memblock.h> 24#include <linux/memblock.h>
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h>
28#include <linux/kasan.h> 27#include <linux/kasan.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/suspend.h> 29#include <linux/suspend.h>
@@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
83EXPORT_PER_CPU_SYMBOL(numa_node); 82EXPORT_PER_CPU_SYMBOL(numa_node);
84#endif 83#endif
85 84
85DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
86
86#ifdef CONFIG_HAVE_MEMORYLESS_NODES 87#ifdef CONFIG_HAVE_MEMORYLESS_NODES
87/* 88/*
88 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 89 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
290int page_group_by_mobility_disabled __read_mostly; 291int page_group_by_mobility_disabled __read_mostly;
291 292
292#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 293#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
294
295/*
296 * Determine how many pages need to be initialized during early boot
297 * (non-deferred initialization).
298 * The value of first_deferred_pfn will be set later, once non-deferred pages
299 * are initialized, but for now set it to ULONG_MAX.
300 */
293static inline void reset_deferred_meminit(pg_data_t *pgdat) 301static inline void reset_deferred_meminit(pg_data_t *pgdat)
294{ 302{
295 unsigned long max_initialise; 303 phys_addr_t start_addr, end_addr;
296 unsigned long reserved_lowmem; 304 unsigned long max_pgcnt;
305 unsigned long reserved;
297 306
298 /* 307 /*
299 * Initialise at least 2G of a node but also take into account that 308 * Initialise at least 2G of a node but also take into account that
300 * two large system hashes that can take up 1GB for 0.25TB/node. 309 * two large system hashes that can take up 1GB for 0.25TB/node.
301 */ 310 */
302 max_initialise = max(2UL << (30 - PAGE_SHIFT), 311 max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
303 (pgdat->node_spanned_pages >> 8)); 312 (pgdat->node_spanned_pages >> 8));
304 313
305 /* 314 /*
306 * Compensate for all the memblock reservations (e.g. crash kernel) 315 * Compensate for all the memblock reservations (e.g. crash kernel)
307 * from the initial estimation to make sure we will initialize enough 316 * from the initial estimation to make sure we will initialize enough
308 * memory to boot. 317 * memory to boot.
309 */ 318 */
310 reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, 319 start_addr = PFN_PHYS(pgdat->node_start_pfn);
311 pgdat->node_start_pfn + max_initialise); 320 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
312 max_initialise += reserved_lowmem; 321 reserved = memblock_reserved_memory_within(start_addr, end_addr);
322 max_pgcnt += PHYS_PFN(reserved);
313 323
314 pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); 324 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
315 pgdat->first_deferred_pfn = ULONG_MAX; 325 pgdat->first_deferred_pfn = ULONG_MAX;
316} 326}
317 327
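
The rewrite keeps the calculation in pages, but memblock_reserved_memory_within() works on physical addresses, hence the PFN_PHYS()/PHYS_PFN() round trip; the old code passed pfns straight through, which is what this fixes. For illustration only, a small userspace sketch of the same arithmetic, assuming 4K pages (PAGE_SHIFT = 12); the node and reservation numbers are invented and the names merely mirror the kernel fields:

    /*
     * Userspace sketch of the max_pgcnt computation above. Assumes 4K pages;
     * all values are invented for illustration.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PFN_PHYS(pfn)  ((uint64_t)(pfn) << PAGE_SHIFT)
    #define PHYS_PFN(addr) ((uint64_t)(addr) >> PAGE_SHIFT)

    int main(void)
    {
            uint64_t node_start_pfn = 0x100000;        /* assumed: node starts at 4G */
            uint64_t node_spanned_pages = 64ULL << 18; /* assumed: 64G of 4K pages */
            uint64_t reserved = 256ULL << 20;          /* assumed: 256M reserved (crashkernel etc.) */

            /* at least 2G worth of pages, or 1/256th of the node, whichever is larger */
            uint64_t max_pgcnt = 2ULL << (30 - PAGE_SHIFT);
            if ((node_spanned_pages >> 8) > max_pgcnt)
                    max_pgcnt = node_spanned_pages >> 8;

            /* the reservation query works on bytes, so convert pfn -> phys and back */
            uint64_t start_addr = PFN_PHYS(node_start_pfn);
            uint64_t end_addr   = PFN_PHYS(node_start_pfn + max_pgcnt);
            max_pgcnt += PHYS_PFN(reserved);

            printf("query reservations in [%#llx, %#llx), init %llu pages up front\n",
                   (unsigned long long)start_addr, (unsigned long long)end_addr,
                   (unsigned long long)max_pgcnt);
            return 0;
    }
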
@@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
338 if (zone_end < pgdat_end_pfn(pgdat)) 348 if (zone_end < pgdat_end_pfn(pgdat))
339 return true; 349 return true;
340 (*nr_initialised)++; 350 (*nr_initialised)++;
341 if ((*nr_initialised > pgdat->static_init_size) && 351 if ((*nr_initialised > pgdat->static_init_pgcnt) &&
342 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 352 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
343 pgdat->first_deferred_pfn = pfn; 353 pgdat->first_deferred_pfn = pfn;
344 return false; 354 return false;
@@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
1013 VM_BUG_ON_PAGE(PageTail(page), page); 1023 VM_BUG_ON_PAGE(PageTail(page), page);
1014 1024
1015 trace_mm_page_free(page, order); 1025 trace_mm_page_free(page, order);
1016 kmemcheck_free_shadow(page, order);
1017 1026
1018 /* 1027 /*
1019 * Check tail pages before head page information is cleared to 1028 * Check tail pages before head page information is cleared to
@@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone,
1170static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1179static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1171 unsigned long zone, int nid) 1180 unsigned long zone, int nid)
1172{ 1181{
1182 mm_zero_struct_page(page);
1173 set_page_links(page, zone, nid, pfn); 1183 set_page_links(page, zone, nid, pfn);
1174 init_page_count(page); 1184 init_page_count(page);
1175 page_mapcount_reset(page); 1185 page_mapcount_reset(page);
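
mm_zero_struct_page() clears the struct page before any field is set, so __init_single_page() no longer depends on the memmap coming from pre-zeroed memory (elsewhere in this series the memmap can come from non-zeroed "raw" memblock allocations). A plausible generic fallback is just a memset over the struct; this is an illustrative sketch, not necessarily the exact definition in include/linux/mm.h, and architectures may provide an optimized variant:

    /* Illustrative generic fallback; the real definition may differ and an
     * architecture can override it with an optimized store sequence. */
    #include <linux/mm_types.h>
    #include <linux/string.h>

    #ifndef mm_zero_struct_page
    #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
    #endif
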
@@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)
1410} 1420}
1411 1421
1412#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1422#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1413static void __init deferred_free_range(struct page *page, 1423static void __init deferred_free_range(unsigned long pfn,
1414 unsigned long pfn, int nr_pages) 1424 unsigned long nr_pages)
1415{ 1425{
1416 int i; 1426 struct page *page;
1427 unsigned long i;
1417 1428
1418 if (!page) 1429 if (!nr_pages)
1419 return; 1430 return;
1420 1431
1432 page = pfn_to_page(pfn);
1433
1421 /* Free a large naturally-aligned chunk if possible */ 1434 /* Free a large naturally-aligned chunk if possible */
1422 if (nr_pages == pageblock_nr_pages && 1435 if (nr_pages == pageblock_nr_pages &&
1423 (pfn & (pageblock_nr_pages - 1)) == 0) { 1436 (pfn & (pageblock_nr_pages - 1)) == 0) {
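
deferred_free_range() now takes only (pfn, nr_pages) and looks the page up itself; the fast path still requires the chunk to start on a pageblock boundary. A runnable toy check of that alignment test, with pageblock_nr_pages assumed to be 512 (2M pageblocks on 4K base pages) and invented pfns:

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL  /* assumed: 2M pageblocks, 4K base pages */

    int main(void)
    {
            unsigned long pfns[] = { 0, 511, 512, 1024, 1539 };

            for (unsigned int i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++) {
                    unsigned long pfn = pfns[i];

                    printf("pfn %4lu: %s\n", pfn,
                           (pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0 ?
                           "pageblock head, freed as one chunk" :
                           "freed page by page");
            }
            return 0;
    }
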
@@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)
1443 complete(&pgdat_init_all_done_comp); 1456 complete(&pgdat_init_all_done_comp);
1444} 1457}
1445 1458
1459/*
1460 * Helper for deferred_init_range(): free the given range, reset the counters,
1461 * and return the number of pages freed.
1462 */
1463static inline unsigned long __init __def_free(unsigned long *nr_free,
1464 unsigned long *free_base_pfn,
1465 struct page **page)
1466{
1467 unsigned long nr = *nr_free;
1468
1469 deferred_free_range(*free_base_pfn, nr);
1470 *free_base_pfn = 0;
1471 *nr_free = 0;
1472 *page = NULL;
1473
1474 return nr;
1475}
1476
1477static unsigned long __init deferred_init_range(int nid, int zid,
1478 unsigned long start_pfn,
1479 unsigned long end_pfn)
1480{
1481 struct mminit_pfnnid_cache nid_init_state = { };
1482 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1483 unsigned long free_base_pfn = 0;
1484 unsigned long nr_pages = 0;
1485 unsigned long nr_free = 0;
1486 struct page *page = NULL;
1487 unsigned long pfn;
1488
1489 /*
1490 * First we check if pfn is valid on architectures where it is possible
1491 * to have holes within pageblock_nr_pages. On systems where it is not
1492 * possible, this function is optimized out.
1493 *
1494 * Then, we check if a current large page is valid by only checking the
1495 * validity of the head pfn.
1496 *
1497 * meminit_pfn_in_nid is checked on systems where pfns can interleave
1498 * within a node: a pfn is between start and end of a node, but does not
1499 * belong to this memory node.
1500 *
1501 * Finally, we minimize pfn page lookups and scheduler checks by
1502 * performing it only once every pageblock_nr_pages.
1503 *
1504 * We do it in two loops: first we initialize struct page, then free to
1505 * the buddy allocator, because while we are freeing pages we can access
1506 * pages that are ahead (computing buddy page in __free_one_page()).
1507 */
1508 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1509 if (!pfn_valid_within(pfn))
1510 continue;
1511 if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
1512 if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1513 if (page && (pfn & nr_pgmask))
1514 page++;
1515 else
1516 page = pfn_to_page(pfn);
1517 __init_single_page(page, pfn, zid, nid);
1518 cond_resched();
1519 }
1520 }
1521 }
1522
1523 page = NULL;
1524 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1525 if (!pfn_valid_within(pfn)) {
1526 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1527 } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
1528 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1529 } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1530 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1531 } else if (page && (pfn & nr_pgmask)) {
1532 page++;
1533 nr_free++;
1534 } else {
1535 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1536 page = pfn_to_page(pfn);
1537 free_base_pfn = pfn;
1538 nr_free = 1;
1539 cond_resched();
1540 }
1541 }
1542 /* Free the last block of pages to allocator */
1543 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1544
1545 return nr_pages;
1546}
1547
1446/* Initialise remaining memory on a node */ 1548/* Initialise remaining memory on a node */
1447static int __init deferred_init_memmap(void *data) 1549static int __init deferred_init_memmap(void *data)
1448{ 1550{
1449 pg_data_t *pgdat = data; 1551 pg_data_t *pgdat = data;
1450 int nid = pgdat->node_id; 1552 int nid = pgdat->node_id;
1451 struct mminit_pfnnid_cache nid_init_state = { };
1452 unsigned long start = jiffies; 1553 unsigned long start = jiffies;
1453 unsigned long nr_pages = 0; 1554 unsigned long nr_pages = 0;
1454 unsigned long walk_start, walk_end; 1555 unsigned long spfn, epfn;
1455 int i, zid; 1556 phys_addr_t spa, epa;
1557 int zid;
1456 struct zone *zone; 1558 struct zone *zone;
1457 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1559 unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1458 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1560 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1561 u64 i;
1459 1562
1460 if (first_init_pfn == ULONG_MAX) { 1563 if (first_init_pfn == ULONG_MAX) {
1461 pgdat_init_report_one_done(); 1564 pgdat_init_report_one_done();
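
The rework splits the old single walk into deferred_init_range() (two passes: initialize all struct pages first, then free them, per the comment above) and a caller that iterates memblock's free ranges in physical addresses and clamps each one to the zone. A standalone sketch of just the clamping step, assuming 4K pages; PFN_UP()/PFN_DOWN() round inward so only fully backed pages reach the init/free passes, and every address below is invented:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    int main(void)
    {
            uint64_t spa = 0x100001200;          /* assumed: range start, not page aligned */
            uint64_t epa = 0x140000800;          /* assumed: range end, not page aligned */
            uint64_t first_init_pfn = 0x100400;  /* assumed: where early init stopped */
            uint64_t zone_end = 0x180000;        /* assumed: zone_end_pfn(zone) */

            /* round the start up and the end down: only fully backed pages count */
            uint64_t spfn = PFN_UP(spa);
            uint64_t epfn = PFN_DOWN(epa);

            if (spfn < first_init_pfn)
                    spfn = first_init_pfn;
            if (epfn > zone_end)
                    epfn = zone_end;

            printf("deferred_init_range() gets pfns [%#llx, %#llx)\n",
                   (unsigned long long)spfn, (unsigned long long)epfn);
            return 0;
    }
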
@@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)
1477 if (first_init_pfn < zone_end_pfn(zone)) 1580 if (first_init_pfn < zone_end_pfn(zone))
1478 break; 1581 break;
1479 } 1582 }
1583 first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
1480 1584
1481 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1585 for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1482 unsigned long pfn, end_pfn; 1586 spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1483 struct page *page = NULL; 1587 epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1484 struct page *free_base_page = NULL; 1588 nr_pages += deferred_init_range(nid, zid, spfn, epfn);
1485 unsigned long free_base_pfn = 0;
1486 int nr_to_free = 0;
1487
1488 end_pfn = min(walk_end, zone_end_pfn(zone));
1489 pfn = first_init_pfn;
1490 if (pfn < walk_start)
1491 pfn = walk_start;
1492 if (pfn < zone->zone_start_pfn)
1493 pfn = zone->zone_start_pfn;
1494
1495 for (; pfn < end_pfn; pfn++) {
1496 if (!pfn_valid_within(pfn))
1497 goto free_range;
1498
1499 /*
1500 * Ensure pfn_valid is checked every
1501 * pageblock_nr_pages for memory holes
1502 */
1503 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1504 if (!pfn_valid(pfn)) {
1505 page = NULL;
1506 goto free_range;
1507 }
1508 }
1509
1510 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1511 page = NULL;
1512 goto free_range;
1513 }
1514
1515 /* Minimise pfn page lookups and scheduler checks */
1516 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1517 page++;
1518 } else {
1519 nr_pages += nr_to_free;
1520 deferred_free_range(free_base_page,
1521 free_base_pfn, nr_to_free);
1522 free_base_page = NULL;
1523 free_base_pfn = nr_to_free = 0;
1524
1525 page = pfn_to_page(pfn);
1526 cond_resched();
1527 }
1528
1529 if (page->flags) {
1530 VM_BUG_ON(page_zone(page) != zone);
1531 goto free_range;
1532 }
1533
1534 __init_single_page(page, pfn, zid, nid);
1535 if (!free_base_page) {
1536 free_base_page = page;
1537 free_base_pfn = pfn;
1538 nr_to_free = 0;
1539 }
1540 nr_to_free++;
1541
1542 /* Where possible, batch up pages for a single free */
1543 continue;
1544free_range:
1545 /* Free the current block of pages to allocator */
1546 nr_pages += nr_to_free;
1547 deferred_free_range(free_base_page, free_base_pfn,
1548 nr_to_free);
1549 free_base_page = NULL;
1550 free_base_pfn = nr_to_free = 0;
1551 }
1552 /* Free the last block of pages to allocator */
1553 nr_pages += nr_to_free;
1554 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1555
1556 first_init_pfn = max(end_pfn, first_init_pfn);
1557 } 1589 }
1558 1590
1559 /* Sanity check that the next zone really is unpopulated */ 1591 /* Sanity check that the next zone really is unpopulated */
@@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
1792 * Go through the free lists for the given migratetype and remove 1824 * Go through the free lists for the given migratetype and remove
1793 * the smallest available page from the freelists 1825 * the smallest available page from the freelists
1794 */ 1826 */
1795static inline 1827static __always_inline
1796struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1828struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1797 int migratetype) 1829 int migratetype)
1798{ 1830{
@@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
1836}; 1868};
1837 1869
1838#ifdef CONFIG_CMA 1870#ifdef CONFIG_CMA
1839static struct page *__rmqueue_cma_fallback(struct zone *zone, 1871static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1840 unsigned int order) 1872 unsigned int order)
1841{ 1873{
1842 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1874 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
@@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2217 * deviation from the rest of this file, to make the for loop 2249 * deviation from the rest of this file, to make the for loop
2218 * condition simpler. 2250 * condition simpler.
2219 */ 2251 */
2220static inline bool 2252static __always_inline bool
2221__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2253__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2222{ 2254{
2223 struct free_area *area; 2255 struct free_area *area;
@@ -2289,8 +2321,8 @@ do_steal:
2289 * Do the hard work of removing an element from the buddy allocator. 2321 * Do the hard work of removing an element from the buddy allocator.
2290 * Call me with the zone->lock already held. 2322 * Call me with the zone->lock already held.
2291 */ 2323 */
2292static struct page *__rmqueue(struct zone *zone, unsigned int order, 2324static __always_inline struct page *
2293 int migratetype) 2325__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2294{ 2326{
2295 struct page *page; 2327 struct page *page;
2296 2328
@@ -2315,7 +2347,7 @@ retry:
2315 */ 2347 */
2316static int rmqueue_bulk(struct zone *zone, unsigned int order, 2348static int rmqueue_bulk(struct zone *zone, unsigned int order,
2317 unsigned long count, struct list_head *list, 2349 unsigned long count, struct list_head *list,
2318 int migratetype, bool cold) 2350 int migratetype)
2319{ 2351{
2320 int i, alloced = 0; 2352 int i, alloced = 0;
2321 2353
@@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2329 continue; 2361 continue;
2330 2362
2331 /* 2363 /*
2332 * Split buddy pages returned by expand() are received here 2364 * Split buddy pages returned by expand() are received here in
2333 * in physical page order. The page is added to the callers and 2365 * physical page order. The page is added to the tail of
2334 * list and the list head then moves forward. From the callers 2366 * caller's list. From the callers perspective, the linked list
2335 * perspective, the linked list is ordered by page number in 2367 * is ordered by page number under some conditions. This is
2336 * some conditions. This is useful for IO devices that can 2368 * useful for IO devices that can forward direction from the
2337 * merge IO requests if the physical pages are ordered 2369 * head, thus also in the physical page order. This is useful
2338 * properly. 2370 * for IO devices that can merge IO requests if the physical
2371 * pages are ordered properly.
2339 */ 2372 */
2340 if (likely(!cold)) 2373 list_add_tail(&page->lru, list);
2341 list_add(&page->lru, list);
2342 else
2343 list_add_tail(&page->lru, list);
2344 list = &page->lru;
2345 alloced++; 2374 alloced++;
2346 if (is_migrate_cma(get_pcppage_migratetype(page))) 2375 if (is_migrate_cma(get_pcppage_migratetype(page)))
2347 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2376 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
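
With the hot/cold hint gone, every page split off by expand() is appended at the tail, so the per-cpu list stays in ascending pfn order and the consumer always pops from the head. A toy, plain-C model of that ordering argument; fake_page and the hand-rolled tail append are stand-ins, not kernel APIs:

    #include <stdio.h>
    #include <stddef.h>

    struct fake_page { unsigned long pfn; struct fake_page *next; };

    int main(void)
    {
            struct fake_page pages[4] = {
                    { .pfn = 4096 }, { .pfn = 4097 }, { .pfn = 4098 }, { .pfn = 4099 }
            };
            struct fake_page *head = NULL, **tail = &head;

            /* "rmqueue_bulk": append in allocation order, i.e. ascending pfn */
            for (int i = 0; i < 4; i++) {
                    pages[i].next = NULL;
                    *tail = &pages[i];
                    tail = &pages[i].next;
            }

            /* the consumer (__rmqueue_pcplist) now always takes from the head */
            for (struct fake_page *p = head; p; p = p->next)
                    printf("pfn %lu\n", p->pfn);
            return 0;
    }
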
@@ -2478,10 +2507,6 @@ void drain_all_pages(struct zone *zone)
2478 if (WARN_ON_ONCE(!mm_percpu_wq)) 2507 if (WARN_ON_ONCE(!mm_percpu_wq))
2479 return; 2508 return;
2480 2509
2481 /* Workqueues cannot recurse */
2482 if (current->flags & PF_WQ_WORKER)
2483 return;
2484
2485 /* 2510 /*
2486 * Do not drain if one is already in progress unless it's specific to 2511 * Do not drain if one is already in progress unless it's specific to
2487 * a zone. Such callers are primarily CMA and memory hotplug and need 2512 * a zone. Such callers are primarily CMA and memory hotplug and need
@@ -2590,24 +2615,25 @@ void mark_free_pages(struct zone *zone)
2590} 2615}
2591#endif /* CONFIG_PM */ 2616#endif /* CONFIG_PM */
2592 2617
2593/* 2618static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
2594 * Free a 0-order page
2595 * cold == true ? free a cold page : free a hot page
2596 */
2597void free_hot_cold_page(struct page *page, bool cold)
2598{ 2619{
2599 struct zone *zone = page_zone(page);
2600 struct per_cpu_pages *pcp;
2601 unsigned long flags;
2602 unsigned long pfn = page_to_pfn(page);
2603 int migratetype; 2620 int migratetype;
2604 2621
2605 if (!free_pcp_prepare(page)) 2622 if (!free_pcp_prepare(page))
2606 return; 2623 return false;
2607 2624
2608 migratetype = get_pfnblock_migratetype(page, pfn); 2625 migratetype = get_pfnblock_migratetype(page, pfn);
2609 set_pcppage_migratetype(page, migratetype); 2626 set_pcppage_migratetype(page, migratetype);
2610 local_irq_save(flags); 2627 return true;
2628}
2629
2630static void free_unref_page_commit(struct page *page, unsigned long pfn)
2631{
2632 struct zone *zone = page_zone(page);
2633 struct per_cpu_pages *pcp;
2634 int migratetype;
2635
2636 migratetype = get_pcppage_migratetype(page);
2611 __count_vm_event(PGFREE); 2637 __count_vm_event(PGFREE);
2612 2638
2613 /* 2639 /*
@@ -2620,38 +2646,62 @@ void free_hot_cold_page(struct page *page, bool cold)
2620 if (migratetype >= MIGRATE_PCPTYPES) { 2646 if (migratetype >= MIGRATE_PCPTYPES) {
2621 if (unlikely(is_migrate_isolate(migratetype))) { 2647 if (unlikely(is_migrate_isolate(migratetype))) {
2622 free_one_page(zone, page, pfn, 0, migratetype); 2648 free_one_page(zone, page, pfn, 0, migratetype);
2623 goto out; 2649 return;
2624 } 2650 }
2625 migratetype = MIGRATE_MOVABLE; 2651 migratetype = MIGRATE_MOVABLE;
2626 } 2652 }
2627 2653
2628 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2654 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2629 if (!cold) 2655 list_add(&page->lru, &pcp->lists[migratetype]);
2630 list_add(&page->lru, &pcp->lists[migratetype]);
2631 else
2632 list_add_tail(&page->lru, &pcp->lists[migratetype]);
2633 pcp->count++; 2656 pcp->count++;
2634 if (pcp->count >= pcp->high) { 2657 if (pcp->count >= pcp->high) {
2635 unsigned long batch = READ_ONCE(pcp->batch); 2658 unsigned long batch = READ_ONCE(pcp->batch);
2636 free_pcppages_bulk(zone, batch, pcp); 2659 free_pcppages_bulk(zone, batch, pcp);
2637 pcp->count -= batch; 2660 pcp->count -= batch;
2638 } 2661 }
2662}
2639 2663
2640out: 2664/*
2665 * Free a 0-order page
2666 */
2667void free_unref_page(struct page *page)
2668{
2669 unsigned long flags;
2670 unsigned long pfn = page_to_pfn(page);
2671
2672 if (!free_unref_page_prepare(page, pfn))
2673 return;
2674
2675 local_irq_save(flags);
2676 free_unref_page_commit(page, pfn);
2641 local_irq_restore(flags); 2677 local_irq_restore(flags);
2642} 2678}
2643 2679
2644/* 2680/*
2645 * Free a list of 0-order pages 2681 * Free a list of 0-order pages
2646 */ 2682 */
2647void free_hot_cold_page_list(struct list_head *list, bool cold) 2683void free_unref_page_list(struct list_head *list)
2648{ 2684{
2649 struct page *page, *next; 2685 struct page *page, *next;
2686 unsigned long flags, pfn;
2687
2688 /* Prepare pages for freeing */
2689 list_for_each_entry_safe(page, next, list, lru) {
2690 pfn = page_to_pfn(page);
2691 if (!free_unref_page_prepare(page, pfn))
2692 list_del(&page->lru);
2693 set_page_private(page, pfn);
2694 }
2650 2695
2696 local_irq_save(flags);
2651 list_for_each_entry_safe(page, next, list, lru) { 2697 list_for_each_entry_safe(page, next, list, lru) {
2652 trace_mm_page_free_batched(page, cold); 2698 unsigned long pfn = page_private(page);
2653 free_hot_cold_page(page, cold); 2699
2700 set_page_private(page, 0);
2701 trace_mm_page_free_batched(page);
2702 free_unref_page_commit(page, pfn);
2654 } 2703 }
2704 local_irq_restore(flags);
2655} 2705}
2656 2706
2657/* 2707/*
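
Splitting the old free_hot_cold_page() into a prepare step and a commit step is what lets free_unref_page_list() do the per-page checks with interrupts enabled and then batch the whole list under a single local_irq_save(). A hedged, kernel-style sketch of a caller using the new API; toy_free_batch() is hypothetical (real users such as release_pages() do more bookkeeping), but the calls it makes are the ones introduced above:

    #include <linux/mm.h>
    #include <linux/list.h>

    /* Assumes order-0 pages whose last reference is dropped here. */
    static void toy_free_batch(struct page **pages, int nr)
    {
            LIST_HEAD(to_free);
            int i;

            /* per-page work happens with interrupts still enabled */
            for (i = 0; i < nr; i++) {
                    if (put_page_testzero(pages[i]))
                            list_add(&pages[i]->lru, &to_free);
            }

            /* one local_irq_save() section for the whole batch, not one per page */
            free_unref_page_list(&to_free);
    }
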
@@ -2669,15 +2719,6 @@ void split_page(struct page *page, unsigned int order)
2669 VM_BUG_ON_PAGE(PageCompound(page), page); 2719 VM_BUG_ON_PAGE(PageCompound(page), page);
2670 VM_BUG_ON_PAGE(!page_count(page), page); 2720 VM_BUG_ON_PAGE(!page_count(page), page);
2671 2721
2672#ifdef CONFIG_KMEMCHECK
2673 /*
2674 * Split shadow pages too, because free(page[0]) would
2675 * otherwise free the whole shadow.
2676 */
2677 if (kmemcheck_page_is_tracked(page))
2678 split_page(virt_to_page(page[0].shadow), order);
2679#endif
2680
2681 for (i = 1; i < (1 << order); i++) 2722 for (i = 1; i < (1 << order); i++)
2682 set_page_refcounted(page + i); 2723 set_page_refcounted(page + i);
2683 split_page_owner(page, order); 2724 split_page_owner(page, order);
@@ -2743,6 +2784,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2743#ifdef CONFIG_NUMA 2784#ifdef CONFIG_NUMA
2744 enum numa_stat_item local_stat = NUMA_LOCAL; 2785 enum numa_stat_item local_stat = NUMA_LOCAL;
2745 2786
2787 /* skip numa counters update if numa stats is disabled */
2788 if (!static_branch_likely(&vm_numa_stat_key))
2789 return;
2790
2746 if (z->node != numa_node_id()) 2791 if (z->node != numa_node_id())
2747 local_stat = NUMA_OTHER; 2792 local_stat = NUMA_OTHER;
2748 2793
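
vm_numa_stat_key, defined near the top of the file, is a static key: when NUMA stats are switched off (the sysctl wiring lives elsewhere in this series) the check above becomes a patched-out jump rather than a load and test. A hedged sketch of the pattern; demo_numa_stat_key and the demo_* functions are stand-ins, only the jump-label API is real:

    #include <linux/jump_label.h>

    /* Defaults to true (stats enabled); disabling patches the branch sites. */
    DEFINE_STATIC_KEY_TRUE(demo_numa_stat_key);

    static void demo_numa_stats_off(void)
    {
            static_branch_disable(&demo_numa_stat_key);
    }

    static void demo_count_numa_event(void)
    {
            if (!static_branch_likely(&demo_numa_stat_key))
                    return;         /* nearly free once disabled */
            /* ... bump NUMA_HIT / NUMA_LOCAL style counters here ... */
    }
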
@@ -2758,7 +2803,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2758 2803
2759/* Remove page from the per-cpu list, caller must protect the list */ 2804/* Remove page from the per-cpu list, caller must protect the list */
2760static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2805static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2761 bool cold, struct per_cpu_pages *pcp, 2806 struct per_cpu_pages *pcp,
2762 struct list_head *list) 2807 struct list_head *list)
2763{ 2808{
2764 struct page *page; 2809 struct page *page;
@@ -2767,16 +2812,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2767 if (list_empty(list)) { 2812 if (list_empty(list)) {
2768 pcp->count += rmqueue_bulk(zone, 0, 2813 pcp->count += rmqueue_bulk(zone, 0,
2769 pcp->batch, list, 2814 pcp->batch, list,
2770 migratetype, cold); 2815 migratetype);
2771 if (unlikely(list_empty(list))) 2816 if (unlikely(list_empty(list)))
2772 return NULL; 2817 return NULL;
2773 } 2818 }
2774 2819
2775 if (cold) 2820 page = list_first_entry(list, struct page, lru);
2776 page = list_last_entry(list, struct page, lru);
2777 else
2778 page = list_first_entry(list, struct page, lru);
2779
2780 list_del(&page->lru); 2821 list_del(&page->lru);
2781 pcp->count--; 2822 pcp->count--;
2782 } while (check_new_pcp(page)); 2823 } while (check_new_pcp(page));
@@ -2791,14 +2832,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2791{ 2832{
2792 struct per_cpu_pages *pcp; 2833 struct per_cpu_pages *pcp;
2793 struct list_head *list; 2834 struct list_head *list;
2794 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2795 struct page *page; 2835 struct page *page;
2796 unsigned long flags; 2836 unsigned long flags;
2797 2837
2798 local_irq_save(flags); 2838 local_irq_save(flags);
2799 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2839 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2800 list = &pcp->lists[migratetype]; 2840 list = &pcp->lists[migratetype];
2801 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2841 page = __rmqueue_pcplist(zone, migratetype, pcp, list);
2802 if (page) { 2842 if (page) {
2803 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2843 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2804 zone_statistics(preferred_zone, zone); 2844 zone_statistics(preferred_zone, zone);
@@ -3006,9 +3046,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3006 if (!area->nr_free) 3046 if (!area->nr_free)
3007 continue; 3047 continue;
3008 3048
3009 if (alloc_harder)
3010 return true;
3011
3012 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3049 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3013 if (!list_empty(&area->free_list[mt])) 3050 if (!list_empty(&area->free_list[mt]))
3014 return true; 3051 return true;
@@ -3020,6 +3057,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3020 return true; 3057 return true;
3021 } 3058 }
3022#endif 3059#endif
3060 if (alloc_harder &&
3061 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3062 return true;
3023 } 3063 }
3024 return false; 3064 return false;
3025} 3065}
@@ -3235,20 +3275,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3235 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3275 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3236 return; 3276 return;
3237 3277
3238 pr_warn("%s: ", current->comm);
3239
3240 va_start(args, fmt); 3278 va_start(args, fmt);
3241 vaf.fmt = fmt; 3279 vaf.fmt = fmt;
3242 vaf.va = &args; 3280 vaf.va = &args;
3243 pr_cont("%pV", &vaf); 3281 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
3282 current->comm, &vaf, gfp_mask, &gfp_mask,
3283 nodemask_pr_args(nodemask));
3244 va_end(args); 3284 va_end(args);
3245 3285
3246 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3247 if (nodemask)
3248 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3249 else
3250 pr_cont("(null)\n");
3251
3252 cpuset_print_current_mems_allowed(); 3286 cpuset_print_current_mems_allowed();
3253 3287
3254 dump_stack(); 3288 dump_stack();
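
Collapsing the warning into a single pr_warn() leans on printk's %pV (embed another format string plus its va_list via struct va_format) and %pGg (decode gfp flags); the removal of the explicit NULL check suggests the nodemask_pr_args()/%*pbl path now copes with a missing nodemask on its own. A minimal kernel-style sketch of the %pV idiom; demo_warn() is hypothetical:

    #include <linux/kernel.h>
    #include <linux/printk.h>
    #include <linux/sched.h>

    static void demo_warn(const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            /* the caller's message becomes part of one larger log line */
            pr_warn("%s: %pV\n", current->comm, &vaf);
            va_end(args);
    }
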
@@ -3868,8 +3902,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3868 enum compact_result compact_result; 3902 enum compact_result compact_result;
3869 int compaction_retries; 3903 int compaction_retries;
3870 int no_progress_loops; 3904 int no_progress_loops;
3871 unsigned long alloc_start = jiffies;
3872 unsigned int stall_timeout = 10 * HZ;
3873 unsigned int cpuset_mems_cookie; 3905 unsigned int cpuset_mems_cookie;
3874 int reserve_flags; 3906 int reserve_flags;
3875 3907
@@ -4001,14 +4033,6 @@ retry:
4001 if (!can_direct_reclaim) 4033 if (!can_direct_reclaim)
4002 goto nopage; 4034 goto nopage;
4003 4035
4004 /* Make sure we know about allocations which stall for too long */
4005 if (time_after(jiffies, alloc_start + stall_timeout)) {
4006 warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
4007 "page allocation stalls for %ums, order:%u",
4008 jiffies_to_msecs(jiffies-alloc_start), order);
4009 stall_timeout += 10 * HZ;
4010 }
4011
4012 /* Avoid recursion of direct reclaim */ 4036 /* Avoid recursion of direct reclaim */
4013 if (current->flags & PF_MEMALLOC) 4037 if (current->flags & PF_MEMALLOC)
4014 goto nopage; 4038 goto nopage;
@@ -4223,9 +4247,6 @@ out:
4223 page = NULL; 4247 page = NULL;
4224 } 4248 }
4225 4249
4226 if (kmemcheck_enabled && page)
4227 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
4228
4229 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4250 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4230 4251
4231 return page; 4252 return page;
@@ -4262,7 +4283,7 @@ void __free_pages(struct page *page, unsigned int order)
4262{ 4283{
4263 if (put_page_testzero(page)) { 4284 if (put_page_testzero(page)) {
4264 if (order == 0) 4285 if (order == 0)
4265 free_hot_cold_page(page, false); 4286 free_unref_page(page);
4266 else 4287 else
4267 __free_pages_ok(page, order); 4288 __free_pages_ok(page, order);
4268 } 4289 }
@@ -4320,7 +4341,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
4320 unsigned int order = compound_order(page); 4341 unsigned int order = compound_order(page);
4321 4342
4322 if (order == 0) 4343 if (order == 0)
4323 free_hot_cold_page(page, false); 4344 free_unref_page(page);
4324 else 4345 else
4325 __free_pages_ok(page, order); 4346 __free_pages_ok(page, order);
4326 } 4347 }
@@ -6126,6 +6147,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
6126 } 6147 }
6127} 6148}
6128 6149
6150#ifdef CONFIG_FLAT_NODE_MEM_MAP
6129static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 6151static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6130{ 6152{
6131 unsigned long __maybe_unused start = 0; 6153 unsigned long __maybe_unused start = 0;
@@ -6135,7 +6157,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6135 if (!pgdat->node_spanned_pages) 6157 if (!pgdat->node_spanned_pages)
6136 return; 6158 return;
6137 6159
6138#ifdef CONFIG_FLAT_NODE_MEM_MAP
6139 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 6160 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6140 offset = pgdat->node_start_pfn - start; 6161 offset = pgdat->node_start_pfn - start;
6141 /* ia64 gets its own node_mem_map, before this, without bootmem */ 6162 /* ia64 gets its own node_mem_map, before this, without bootmem */
@@ -6157,6 +6178,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6157 pgdat->node_id); 6178 pgdat->node_id);
6158 pgdat->node_mem_map = map + offset; 6179 pgdat->node_mem_map = map + offset;
6159 } 6180 }
6181 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
6182 __func__, pgdat->node_id, (unsigned long)pgdat,
6183 (unsigned long)pgdat->node_mem_map);
6160#ifndef CONFIG_NEED_MULTIPLE_NODES 6184#ifndef CONFIG_NEED_MULTIPLE_NODES
6161 /* 6185 /*
6162 * With no DISCONTIG, the global mem_map is just set as node 0's 6186 * With no DISCONTIG, the global mem_map is just set as node 0's
@@ -6169,8 +6193,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6169#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6193#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6170 } 6194 }
6171#endif 6195#endif
6172#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6173} 6196}
6197#else
6198static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6199#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6174 6200
6175void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 6201void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6176 unsigned long node_start_pfn, unsigned long *zholes_size) 6202 unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -6197,16 +6223,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6197 zones_size, zholes_size); 6223 zones_size, zholes_size);
6198 6224
6199 alloc_node_mem_map(pgdat); 6225 alloc_node_mem_map(pgdat);
6200#ifdef CONFIG_FLAT_NODE_MEM_MAP
6201 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
6202 nid, (unsigned long)pgdat,
6203 (unsigned long)pgdat->node_mem_map);
6204#endif
6205 6226
6206 reset_deferred_meminit(pgdat); 6227 reset_deferred_meminit(pgdat);
6207 free_area_init_core(pgdat); 6228 free_area_init_core(pgdat);
6208} 6229}
6209 6230
6231#ifdef CONFIG_HAVE_MEMBLOCK
6232/*
6233 * Only struct pages that are backed by physical memory are zeroed and
6234 * initialized by going through __init_single_page(). But, there are some
6235 * struct pages which are reserved in the memblock allocator and their fields
6236 * may be accessed (for example, page_to_pfn() on some configurations accesses
6237 * flags). We must explicitly zero those struct pages.
6238 */
6239void __paginginit zero_resv_unavail(void)
6240{
6241 phys_addr_t start, end;
6242 unsigned long pfn;
6243 u64 i, pgcnt;
6244
6245 /*
6246 * Loop through ranges that are reserved, but do not have reported
6247 * physical memory backing.
6248 */
6249 pgcnt = 0;
6250 for_each_resv_unavail_range(i, &start, &end) {
6251 for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
6252 mm_zero_struct_page(pfn_to_page(pfn));
6253 pgcnt++;
6254 }
6255 }
6256
6257 /*
6258 * Struct pages that do not have backing memory. This could be because
6259 * firmware is using some of this memory, or for some other reasons.
6260 * Once memblock is changed so such behaviour is not allowed: i.e.
6261 * Once memblock is changed so that such behaviour is not allowed, i.e. the
6262 * list of "reserved" memory is a subset of the list of "memory", then
6263 * this code can be removed.
6264 if (pgcnt)
6265 pr_info("Reserved but unavailable: %lld pages", pgcnt);
6266}
6267#endif /* CONFIG_HAVE_MEMBLOCK */
6268
6210#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6269#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6211 6270
6212#if MAX_NUMNODES > 1 6271#if MAX_NUMNODES > 1
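
Note the rounding direction in zero_resv_unavail(): PFN_DOWN() on the start and PFN_UP() on the end widen the range, so every struct page that even overlaps a reserved-but-unbacked region gets zeroed, the opposite of the inward clamp used for deferred init earlier. A tiny runnable illustration, again assuming 4K pages and an invented byte range:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    int main(void)
    {
            uint64_t start = 0x9d400;    /* assumed: reserved-but-unbacked start */
            uint64_t end   = 0x9ee00;    /* assumed: reserved-but-unbacked end */

            printf("zero struct pages for pfns [%llu, %llu)\n",
                   (unsigned long long)PFN_DOWN(start),  /* start rounded down */
                   (unsigned long long)PFN_UP(end));     /* end rounded up */
            return 0;
    }
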
@@ -6630,6 +6689,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6630 node_set_state(nid, N_MEMORY); 6689 node_set_state(nid, N_MEMORY);
6631 check_for_memory(pgdat, nid); 6690 check_for_memory(pgdat, nid);
6632 } 6691 }
6692 zero_resv_unavail();
6633} 6693}
6634 6694
6635static int __init cmdline_parse_core(char *p, unsigned long *core) 6695static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6793,6 +6853,7 @@ void __init free_area_init(unsigned long *zones_size)
6793{ 6853{
6794 free_area_init_node(0, zones_size, 6854 free_area_init_node(0, zones_size,
6795 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 6855 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6856 zero_resv_unavail();
6796} 6857}
6797 6858
6798static int page_alloc_cpu_dead(unsigned int cpu) 6859static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7305,18 +7366,17 @@ void *__init alloc_large_system_hash(const char *tablename,
7305 7366
7306 log2qty = ilog2(numentries); 7367 log2qty = ilog2(numentries);
7307 7368
7308 /*
7309 * memblock allocator returns zeroed memory already, so HASH_ZERO is
7310 * currently not used when HASH_EARLY is specified.
7311 */
7312 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 7369 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
7313 do { 7370 do {
7314 size = bucketsize << log2qty; 7371 size = bucketsize << log2qty;
7315 if (flags & HASH_EARLY) 7372 if (flags & HASH_EARLY) {
7316 table = memblock_virt_alloc_nopanic(size, 0); 7373 if (flags & HASH_ZERO)
7317 else if (hashdist) 7374 table = memblock_virt_alloc_nopanic(size, 0);
7375 else
7376 table = memblock_virt_alloc_raw(size, 0);
7377 } else if (hashdist) {
7318 table = __vmalloc(size, gfp_flags, PAGE_KERNEL); 7378 table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
7319 else { 7379 } else {
7320 /* 7380 /*
7321 * If bucketsize is not a power-of-two, we may free 7381 * If bucketsize is not a power-of-two, we may free
7322 * some pages at the end of hash table which 7382 * some pages at the end of hash table which
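
The point of the new branch is that normal memblock allocations come back pre-zeroed, so HASH_EARLY callers only pay for zeroing when they actually asked for it via HASH_ZERO; otherwise the raw variant can skip it. A hedged sketch of an early caller; "demo-cache" and the demo_* names are invented, but the alloc_large_system_hash() signature and the HASH_EARLY | HASH_ZERO usage follow existing boot-time hash users:

    #include <linux/bootmem.h>
    #include <linux/init.h>
    #include <linux/list.h>

    static unsigned int demo_hash_shift;
    static unsigned int demo_hash_mask;
    static struct hlist_head *demo_table;

    static void __init demo_hash_init(void)
    {
            /* HASH_ZERO steers the allocation above onto the zeroed path */
            demo_table = alloc_large_system_hash("demo-cache",
                                                 sizeof(struct hlist_head),
                                                 0,     /* size from amount of memory */
                                                 14,    /* scale */
                                                 HASH_EARLY | HASH_ZERO,
                                                 &demo_hash_shift,
                                                 &demo_hash_mask,
                                                 0, 0);
    }
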
@@ -7353,10 +7413,10 @@ void *__init alloc_large_system_hash(const char *tablename,
7353 * race condition. So you can't expect this function should be exact. 7413 * race condition. So you can't expect this function should be exact.
7354 */ 7414 */
7355bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7415bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7416 int migratetype,
7356 bool skip_hwpoisoned_pages) 7417 bool skip_hwpoisoned_pages)
7357{ 7418{
7358 unsigned long pfn, iter, found; 7419 unsigned long pfn, iter, found;
7359 int mt;
7360 7420
7361 /* 7421 /*
7362 * For avoiding noise data, lru_add_drain_all() should be called 7422 * For avoiding noise data, lru_add_drain_all() should be called
@@ -7364,8 +7424,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7364 */ 7424 */
7365 if (zone_idx(zone) == ZONE_MOVABLE) 7425 if (zone_idx(zone) == ZONE_MOVABLE)
7366 return false; 7426 return false;
7367 mt = get_pageblock_migratetype(page); 7427
7368 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 7428 /*
7429 * CMA allocations (alloc_contig_range) really need to mark isolate
7430 * CMA pageblocks even when they are not movable in fact so consider
7431 * them movable here.
7432 */
7433 if (is_migrate_cma(migratetype) &&
7434 is_migrate_cma(get_pageblock_migratetype(page)))
7369 return false; 7435 return false;
7370 7436
7371 pfn = page_to_pfn(page); 7437 pfn = page_to_pfn(page);
@@ -7377,6 +7443,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7377 7443
7378 page = pfn_to_page(check); 7444 page = pfn_to_page(check);
7379 7445
7446 if (PageReserved(page))
7447 return true;
7448
7380 /* 7449 /*
7381 * Hugepages are not in LRU lists, but they're movable. 7450 * Hugepages are not in LRU lists, but they're movable.
7382 * We need not scan over tail pages because we don't 7451 * We need not scan over tail pages because we don't
@@ -7450,7 +7519,7 @@ bool is_pageblock_removable_nolock(struct page *page)
7450 if (!zone_spans_pfn(zone, pfn)) 7519 if (!zone_spans_pfn(zone, pfn))
7451 return false; 7520 return false;
7452 7521
7453 return !has_unmovable_pages(zone, page, 0, true); 7522 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
7454} 7523}
7455 7524
7456#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 7525#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
@@ -7546,6 +7615,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7546 .zone = page_zone(pfn_to_page(start)), 7615 .zone = page_zone(pfn_to_page(start)),
7547 .mode = MIGRATE_SYNC, 7616 .mode = MIGRATE_SYNC,
7548 .ignore_skip_hint = true, 7617 .ignore_skip_hint = true,
7618 .no_set_skip_hint = true,
7549 .gfp_mask = current_gfp_context(gfp_mask), 7619 .gfp_mask = current_gfp_context(gfp_mask),
7550 }; 7620 };
7551 INIT_LIST_HEAD(&cc.migratepages); 7621 INIT_LIST_HEAD(&cc.migratepages);
@@ -7582,11 +7652,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7582 7652
7583 /* 7653 /*
7584 * In case of -EBUSY, we'd like to know which page causes problem. 7654 * In case of -EBUSY, we'd like to know which page causes problem.
7585 * So, just fall through. We will check it in test_pages_isolated(). 7655 * So, just fall through. test_pages_isolated() has a tracepoint
7656 * which will report the busy page.
7657 *
7658 * It is possible that busy pages could become available before
7659 * the call to test_pages_isolated, and the range will actually be
7660 * allocated. So, if we fall through be sure to clear ret so that
7661 * -EBUSY is not accidentally used or returned to caller.
7586 */ 7662 */
7587 ret = __alloc_contig_migrate_range(&cc, start, end); 7663 ret = __alloc_contig_migrate_range(&cc, start, end);
7588 if (ret && ret != -EBUSY) 7664 if (ret && ret != -EBUSY)
7589 goto done; 7665 goto done;
7666 ret = 0;
7590 7667
7591 /* 7668 /*
7592 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 7669 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
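
The cleared ret matters because a transiently busy page can make __alloc_contig_migrate_range() return -EBUSY even though the isolation check later succeeds; without the reset, a successful allocation could still hand -EBUSY back to the caller. A runnable toy model of that fall-through; migrate_range() and test_isolation() are invented stand-ins for the real functions:

    #include <stdio.h>
    #include <errno.h>

    static int migrate_range(void)  { return -EBUSY; } /* transiently busy page */
    static int test_isolation(void) { return 0; }      /* 0: the busy page resolved itself */

    static int alloc_contig_sketch(int clear_ret)
    {
            int ret = migrate_range();

            if (ret && ret != -EBUSY)
                    return ret;             /* hard failure */
            if (clear_ret)
                    ret = 0;                /* the fix: don't carry -EBUSY forward */
            if (test_isolation())
                    return -EBUSY;          /* genuinely still busy */
            return ret;                     /* success path */
    }

    int main(void)
    {
            printf("without reset: %d, with reset: %d\n",
                   alloc_contig_sketch(0), alloc_contig_sketch(1));
            return 0;
    }
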