Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	224
1 file changed, 157 insertions(+), 67 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3222193c46c6..e75865d58ba7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -155,16 +155,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
  * The following functions are used by the suspend/hibernate code to temporarily
  * change gfp_allowed_mask in order to avoid using I/O during memory allocations
  * while devices are suspended. To avoid races with the suspend/hibernate code,
- * they should always be called with pm_mutex held (gfp_allowed_mask also should
- * only be modified with pm_mutex held, unless the suspend/hibernate code is
- * guaranteed not to run in parallel with that modification).
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
  */
 
 static gfp_t saved_gfp_mask;
 
 void pm_restore_gfp_mask(void)
 {
-	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
 	if (saved_gfp_mask) {
 		gfp_allowed_mask = saved_gfp_mask;
 		saved_gfp_mask = 0;
@@ -173,7 +174,7 @@ void pm_restore_gfp_mask(void)
 
 void pm_restrict_gfp_mask(void)
 {
-	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
 	WARN_ON(saved_gfp_mask);
 	saved_gfp_mask = gfp_allowed_mask;
 	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
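For context, a minimal sketch (illustrative, not taken from the kernel sources) of the locking rule the updated comment states: both helpers are meant to run with system_transition_mutex held around the whole transition.

/* Hypothetical transition path, for illustration only. */
static void example_transition(void)
{
	mutex_lock(&system_transition_mutex);
	pm_restrict_gfp_mask();		/* mask out __GFP_IO / __GFP_FS */
	/* ... suspend devices, write the image, resume ... */
	pm_restore_gfp_mask();		/* put the saved gfp_allowed_mask back */
	mutex_unlock(&system_transition_mutex);
}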
@@ -2908,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 	if (!static_branch_likely(&vm_numa_stat_key))
 		return;
 
-	if (z->node != numa_node_id())
+	if (zone_to_nid(z) != numa_node_id())
 		local_stat = NUMA_OTHER;
 
-	if (z->node == preferred_zone->node)
+	if (zone_to_nid(z) == zone_to_nid(preferred_zone))
 		__inc_numa_state(z, NUMA_HIT);
 	else {
 		__inc_numa_state(z, NUMA_MISS);
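For reference, the zone_to_nid() helper this hunk switches to is roughly the following inline from include/linux/mmzone.h (reproduced from memory, so treat the exact form as an approximation): it returns zone->node on NUMA builds and 0 otherwise, so callers no longer need their own #ifdef CONFIG_NUMA.

/* Approximate definition, for reference only. */
#ifdef CONFIG_NUMA
static inline int zone_to_nid(struct zone *zone)
{
	return zone->node;
}
#else
static inline int zone_to_nid(struct zone *zone)
{
	return 0;
}
#endif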
@@ -4164,11 +4165,12 @@ retry:
 		alloc_flags = reserve_flags;
 
 	/*
-	 * Reset the zonelist iterators if memory policies can be ignored.
-	 * These allocations are high priority and system rather than user
-	 * orientated.
+	 * Reset the nodemask and zonelist iterators if memory policies can be
+	 * ignored. These allocations are high priority and system rather than
+	 * user oriented.
 	 */
 	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
+		ac->nodemask = NULL;
 		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
 					ac->high_zoneidx, ac->nodemask);
 	}
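A reading of the change, not text from the patch: with ac->nodemask cleared, the zoneref walk started by first_zones_zonelist() is no longer filtered by the caller's node mask, so these high-priority reserve allocations may fall back to any node's zones up to ac->high_zoneidx.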
@@ -4402,19 +4404,15 @@ out:
 EXPORT_SYMBOL(__alloc_pages_nodemask);
 
 /*
- * Common helper functions.
+ * Common helper functions. Never use with __GFP_HIGHMEM because the returned
+ * address cannot represent highmem pages. Use alloc_pages and then kmap if
+ * you need to access high mem.
  */
 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page *page;
 
-	/*
-	 * __get_free_pages() returns a virtual address, which cannot represent
-	 * a highmem page
-	 */
-	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
-
-	page = alloc_pages(gfp_mask, order);
+	page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
 	if (!page)
 		return 0;
 	return (unsigned long) page_address(page);
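A hedged sketch of the pattern the new comment points to, for callers that genuinely want highmem: allocate the page and map it explicitly (the helper name and the memset are purely illustrative).

/* Hypothetical helper, for illustration only. */
static void touch_one_highmem_page(void)
{
	struct page *page = alloc_pages(GFP_HIGHUSER, 0);	/* may be highmem */
	void *addr;

	if (!page)
		return;
	addr = kmap(page);		/* temporary kernel mapping */
	memset(addr, 0, PAGE_SIZE);
	kunmap(page);
	__free_pages(page, 0);
}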
@@ -5280,7 +5278,7 @@ int local_memory_node(int node)
 	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
 				 gfp_zone(GFP_KERNEL),
 				 NULL);
-	return z->zone->node;
+	return zone_to_nid(z->zone);
 }
 #endif
 
@@ -5566,13 +5564,12 @@ static int zone_batchsize(struct zone *zone)
 
 	/*
 	 * The per-cpu-pages pools are set to around 1000th of the
-	 * size of the zone. But no more than 1/2 of a meg.
-	 *
-	 * OK, so we don't know how big the cache is. So guess.
+	 * size of the zone.
 	 */
 	batch = zone->managed_pages / 1024;
-	if (batch * PAGE_SIZE > 512 * 1024)
-		batch = (512 * 1024) / PAGE_SIZE;
+	/* But no more than a meg. */
+	if (batch * PAGE_SIZE > 1024 * 1024)
+		batch = (1024 * 1024) / PAGE_SIZE;
 	batch /= 4;		/* We effectively *= 4 below */
 	if (batch < 1)
 		batch = 1;
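Worked example (illustrative, assuming 4 KiB pages): a zone with 4 GiB of managed memory has 1048576 pages, so batch starts at 1048576 / 1024 = 1024 pages (4 MiB); the new one-megabyte cap clamps that to 256 pages, and the /= 4 leaves 64, twice the 32 that the old 512 KiB cap produced, before the rounding applied later in the function.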
@@ -6123,7 +6120,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
 	return usemapsize / 8;
 }
 
-static void __init setup_usemap(struct pglist_data *pgdat,
+static void __ref setup_usemap(struct pglist_data *pgdat,
 				struct zone *zone,
 				unsigned long zone_start_pfn,
 				unsigned long zonesize)
@@ -6143,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 	unsigned int order;
 
@@ -6171,14 +6168,14 @@ void __paginginit set_pageblock_order(void)
  * include/linux/pageblock-flags.h for the values of pageblock_order based on
  * the kernel config
  */
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 }
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
-static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
 						unsigned long present_pages)
 {
 	unsigned long pages = spanned_pages;
 
@@ -6197,39 +6194,99 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
 	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
 }
 
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- */
-static void __paginginit free_area_init_core(struct pglist_data *pgdat)
-{
-	enum zone_type j;
-	int nid = pgdat->node_id;
-
-	pgdat_resize_init(pgdat);
 #ifdef CONFIG_NUMA_BALANCING
+static void pgdat_init_numabalancing(struct pglist_data *pgdat)
+{
 	spin_lock_init(&pgdat->numabalancing_migrate_lock);
 	pgdat->numabalancing_migrate_nr_pages = 0;
 	pgdat->numabalancing_migrate_next_window = jiffies;
+}
+#else
+static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
 #endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
 	spin_lock_init(&pgdat->split_queue_lock);
 	INIT_LIST_HEAD(&pgdat->split_queue);
 	pgdat->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
 #endif
-	init_waitqueue_head(&pgdat->kswapd_wait);
-	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
 #ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
 	init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
 #endif
+
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+	pgdat_resize_init(pgdat);
+
+	pgdat_init_numabalancing(pgdat);
+	pgdat_init_split_queue(pgdat);
+	pgdat_init_kcompactd(pgdat);
+
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
 	pgdat_page_ext_init(pgdat);
 	spin_lock_init(&pgdat->lru_lock);
 	lruvec_init(node_lruvec(pgdat));
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+							unsigned long remaining_pages)
+{
+	zone->managed_pages = remaining_pages;
+	zone_set_nid(zone, nid);
+	zone->name = zone_names[idx];
+	zone->zone_pgdat = NODE_DATA(nid);
+	spin_lock_init(&zone->lock);
+	zone_seqlock_init(zone);
+	zone_pcp_init(zone);
+}
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(int nid)
+{
+	enum zone_type z;
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	pgdat_init_internals(pgdat);
+	for (z = 0; z < MAX_NR_ZONES; z++)
+		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+	enum zone_type j;
+	int nid = pgdat->node_id;
 
+	pgdat_init_internals(pgdat);
 	pgdat->per_cpu_nodestats = &boot_nodestats;
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -6277,15 +6334,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		 * when the bootmem allocator frees pages into the buddy system.
 		 * And all highmem pages will be managed by the buddy system.
 		 */
-		zone->managed_pages = freesize;
-#ifdef CONFIG_NUMA
-		zone->node = nid;
-#endif
-		zone->name = zone_names[j];
-		zone->zone_pgdat = pgdat;
-		spin_lock_init(&zone->lock);
-		zone_seqlock_init(zone);
-		zone_pcp_init(zone);
+		zone_init_internals(zone, j, nid, freesize);
 
 		if (!size)
 			continue;
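In effect (a reading of the refactor, not text from the patch): the early-boot path keeps using free_area_init_node() -> free_area_init_core(), while memory hotplug gets the new free_area_init_core_hotplug(nid) entry point, which reuses pgdat_init_internals() and zone_init_internals() but starts every zone with zero managed pages, the final sizes being filled in later by the hotplug path.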
@@ -6345,8 +6394,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 
-void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
-		unsigned long node_start_pfn, unsigned long *zholes_size)
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+	/*
+	 * We start only with one section of pages, more pages are added as
+	 * needed until the rest of deferred pages are initialized.
+	 */
+	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+						pgdat->node_spanned_pages);
+	pgdat->first_deferred_pfn = ULONG_MAX;
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+#endif
+
+void __init free_area_init_node(int nid, unsigned long *zones_size,
+				   unsigned long node_start_pfn,
+				   unsigned long *zholes_size)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long start_pfn = 0;
@@ -6370,16 +6435,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 					  zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
+	pgdat_set_deferred_range(pgdat);
 
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-	/*
-	 * We start only with one section of pages, more pages are added as
-	 * needed until the rest of deferred pages are initialized.
-	 */
-	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
-					 pgdat->node_spanned_pages);
-	pgdat->first_deferred_pfn = ULONG_MAX;
-#endif
 	free_area_init_core(pgdat);
 }
 
@@ -6391,7 +6448,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  * may be accessed (for example page_to_pfn() on some configuration accesses
  * flags). We must explicitly zero those struct pages.
  */
-void __paginginit zero_resv_unavail(void)
+void __init zero_resv_unavail(void)
 {
 	phys_addr_t start, end;
 	unsigned long pfn;
@@ -6404,8 +6461,11 @@ void __paginginit zero_resv_unavail(void)
 	pgcnt = 0;
 	for_each_resv_unavail_range(i, &start, &end) {
 		for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
-			if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+			if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+				pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+					+ pageblock_nr_pages - 1;
 				continue;
+			}
 			mm_zero_struct_page(pfn_to_page(pfn));
 			pgcnt++;
 		}
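Worked example of the new skip (illustrative, with pageblock_nr_pages = 512): if pfn 1000 lies in a pageblock whose aligned start is not pfn_valid, ALIGN_DOWN(1000, 512) = 512, so pfn is set to 512 + 512 - 1 = 1023 and the loop's pfn++ resumes at 1024, the first pfn of the next pageblock, instead of re-testing every remaining pfn in the invalid block.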
@@ -8036,3 +8096,33 @@ bool is_free_buddy_page(struct page *page)
 
 	return order < MAX_ORDER;
 }
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
+ * test is performed under the zone lock to prevent a race against page
+ * allocation.
+ */
+bool set_hwpoison_free_buddy_page(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	unsigned int order;
+	bool hwpoisoned = false;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (order = 0; order < MAX_ORDER; order++) {
+		struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+		if (PageBuddy(page_head) && page_order(page_head) >= order) {
+			if (!TestSetPageHWPoison(page))
+				hwpoisoned = true;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return hwpoisoned;
+}
+#endif
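A hedged usage sketch (hypothetical, not part of this diff) of how a memory-failure caller might consume the new helper; num_poisoned_pages_inc() is the existing poisoned-page counter helper, and the surrounding logic is illustrative only.

/* Hypothetical caller: try to poison a page believed to be free. */
static int try_poison_free_page(struct page *page)
{
	if (set_hwpoison_free_buddy_page(page)) {
		num_poisoned_pages_inc();
		return 0;
	}
	return -EBUSY;	/* page was (re)allocated under us */
}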