Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	317
1 file changed, 183 insertions(+), 134 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3d603cef2c0..2302f250d6b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -65,6 +65,7 @@
 #include <linux/page_owner.h>
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -291,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static inline void reset_deferred_meminit(pg_data_t *pgdat)
 {
+	unsigned long max_initialise;
+	unsigned long reserved_lowmem;
+
+	/*
+	 * Initialise at least 2G of a node but also take into account that
+	 * two large system hashes that can take up 1GB for 0.25TB/node.
+	 */
+	max_initialise = max(2UL << (30 - PAGE_SHIFT),
+		(pgdat->node_spanned_pages >> 8));
+
+	/*
+	 * Compensate the all the memblock reservations (e.g. crash kernel)
+	 * from the initial estimation to make sure we will initialize enough
+	 * memory to boot.
+	 */
+	reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
+			pgdat->node_start_pfn + max_initialise);
+	max_initialise += reserved_lowmem;
+
+	pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
 	pgdat->first_deferred_pfn = ULONG_MAX;
 }
 
@@ -313,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 				unsigned long pfn, unsigned long zone_end,
 				unsigned long *nr_initialised)
 {
-	unsigned long max_initialise;
-
 	/* Always populate low zones for address-contrained allocations */
 	if (zone_end < pgdat_end_pfn(pgdat))
 		return true;
-	/*
-	 * Initialise at least 2G of a node but also take into account that
-	 * two large system hashes that can take up 1GB for 0.25TB/node.
-	 */
-	max_initialise = max(2UL << (30 - PAGE_SHIFT),
-		(pgdat->node_spanned_pages >> 8));
-
 	(*nr_initialised)++;
-	if ((*nr_initialised > max_initialise) &&
+	if ((*nr_initialised > pgdat->static_init_size) &&
 	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
 		pgdat->first_deferred_pfn = pfn;
 		return false;
@@ -1090,14 +1102,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
-	unsigned long nr_scanned, flags;
 	bool isolated_pageblocks;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (count) {
 		struct page *page;
@@ -1142,7 +1150,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			trace_mm_page_pcpu_drain(page, 0, mt);
 		} while (--count && --batch_free && !list_empty(list));
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone,
@@ -1150,19 +1158,13 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
-	unsigned long nr_scanned, flags;
-	spin_lock_irqsave(&zone->lock, flags);
-	__count_vm_events(PGFREE, 1 << order);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
+	spin_lock(&zone->lock);
 	if (unlikely(has_isolate_pageblock(zone) ||
 		     is_migrate_isolate(migratetype))) {
 		migratetype = get_pfnblock_migratetype(page, pfn);
 	}
 	__free_one_page(page, pfn, zone, order, migratetype);
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1240,6 +1242,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
+	unsigned long flags;
 	int migratetype;
 	unsigned long pfn = page_to_pfn(page);
 
@@ -1247,7 +1250,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
+	local_irq_save(flags);
+	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
+	local_irq_restore(flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -1695,10 +1701,10 @@ static inline int check_new_page(struct page *page)
 	return 1;
 }
 
-static inline bool free_pages_prezeroed(bool poisoned)
+static inline bool free_pages_prezeroed(void)
 {
 	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-		page_poisoning_enabled() && poisoned;
+		page_poisoning_enabled();
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -1752,17 +1758,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
 							unsigned int alloc_flags)
 {
 	int i;
-	bool poisoned = true;
-
-	for (i = 0; i < (1 << order); i++) {
-		struct page *p = page + i;
-		if (poisoned)
-			poisoned &= page_is_poisoned(p);
-	}
 
 	post_alloc_hook(page, order, gfp_flags);
 
-	if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
+	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
 		for (i = 0; i < (1 << order); i++)
 			clear_highpage(page + i);
 
@@ -1844,9 +1843,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
-int move_freepages(struct zone *zone,
+static int move_freepages(struct zone *zone,
 			  struct page *start_page, struct page *end_page,
-			  int migratetype)
+			  int migratetype, int *num_movable)
 {
 	struct page *page;
 	unsigned int order;
@@ -1863,6 +1862,9 @@ int move_freepages(struct zone *zone,
 	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
 #endif
 
+	if (num_movable)
+		*num_movable = 0;
+
 	for (page = start_page; page <= end_page;) {
 		if (!pfn_valid_within(page_to_pfn(page))) {
 			page++;
@@ -1873,6 +1875,15 @@ int move_freepages(struct zone *zone,
 		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 
 		if (!PageBuddy(page)) {
+			/*
+			 * We assume that pages that could be isolated for
+			 * migration are movable. But we don't actually try
+			 * isolating, as that would be expensive.
+			 */
+			if (num_movable &&
+					(PageLRU(page) || __PageMovable(page)))
+				(*num_movable)++;
+
 			page++;
 			continue;
 		}
@@ -1888,7 +1899,7 @@ int move_freepages(struct zone *zone,
 }
 
 int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype)
+				int migratetype, int *num_movable)
 {
 	unsigned long start_pfn, end_pfn;
 	struct page *start_page, *end_page;
@@ -1905,7 +1916,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
 	if (!zone_spans_pfn(zone, end_pfn))
 		return 0;
 
-	return move_freepages(zone, start_page, end_page, migratetype);
+	return move_freepages(zone, start_page, end_page, migratetype,
+								num_movable);
 }
 
 static void change_pageblock_range(struct page *pageblock_page,
@@ -1955,28 +1967,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 /*
  * This function implements actual steal behaviour. If order is large enough,
  * we can steal whole pageblock. If not, we first move freepages in this
- * pageblock and check whether half of pages are moved or not. If half of
- * pages are moved, we can change migratetype of pageblock and permanently
- * use it's pages as requested migratetype in the future.
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * are there in the pageblock with a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
  */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
-							  int start_type)
+					int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
-	int pages;
+	struct free_area *area;
+	int free_pages, movable_pages, alike_pages;
+	int old_block_type;
+
+	old_block_type = get_pageblock_migratetype(page);
+
+	/*
+	 * This can happen due to races and we want to prevent broken
+	 * highatomic accounting.
+	 */
+	if (is_migrate_highatomic(old_block_type))
+		goto single_page;
 
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
 		change_pageblock_range(page, current_order, start_type);
-		return;
+		goto single_page;
+	}
+
+	/* We are not allowed to try stealing from the whole block */
+	if (!whole_block)
+		goto single_page;
+
+	free_pages = move_freepages_block(zone, page, start_type,
+						&movable_pages);
+	/*
+	 * Determine how many pages are compatible with our allocation.
+	 * For movable allocation, it's the number of movable pages which
+	 * we just obtained. For other types it's a bit more tricky.
+	 */
+	if (start_type == MIGRATE_MOVABLE) {
+		alike_pages = movable_pages;
+	} else {
+		/*
+		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+		 * to MOVABLE pageblock, consider all non-movable pages as
+		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+		 * vice versa, be conservative since we can't distinguish the
+		 * exact migratetype of non-movable pages.
+		 */
+		if (old_block_type == MIGRATE_MOVABLE)
+			alike_pages = pageblock_nr_pages
+						- (free_pages + movable_pages);
+		else
+			alike_pages = 0;
 	}
 
-	pages = move_freepages_block(zone, page, start_type);
+	/* moving whole block can fail due to zone boundary conditions */
+	if (!free_pages)
+		goto single_page;
 
-	/* Claim the whole block if over half of it is free */
-	if (pages >= (1 << (pageblock_order-1)) ||
+	/*
+	 * If a sufficient number of pages in the block are either free or of
+	 * comparable migratability as our allocation, claim the whole block.
+	 */
+	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled)
 		set_pageblock_migratetype(page, start_type);
+
+	return;
+
+single_page:
+	area = &zone->free_area[current_order];
+	list_move(&page->lru, &area->free_list[start_type]);
 }
 
 /*
@@ -2042,11 +2105,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 
 	/* Yoink! */
 	mt = get_pageblock_migratetype(page);
-	if (mt != MIGRATE_HIGHATOMIC &&
-			!is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+	if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+	    && !is_migrate_cma(mt)) {
 		zone->nr_reserved_highatomic += pageblock_nr_pages;
 		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
-		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
 	}
 
 out_unlock:
@@ -2100,8 +2163,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * from highatomic to ac->migratetype. So we should
 			 * adjust the count once.
 			 */
-			if (get_pageblock_migratetype(page) ==
-							MIGRATE_HIGHATOMIC) {
+			if (is_migrate_highatomic_page(page)) {
 				/*
 				 * It should never happen but changes to
 				 * locking could inadvertently allow a per-cpu
@@ -2124,7 +2186,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * may increase.
 			 */
 			set_pageblock_migratetype(page, ac->migratetype);
-			ret = move_freepages_block(zone, page, ac->migratetype);
+			ret = move_freepages_block(zone, page, ac->migratetype,
+									NULL);
 			if (ret) {
 				spin_unlock_irqrestore(&zone->lock, flags);
 				return ret;
@@ -2136,8 +2199,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 	return false;
 }
 
-/* Remove an element from the buddy allocator from the fallback list */
-static inline struct page *
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ */
+static inline bool
 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 {
 	struct free_area *area;
@@ -2158,33 +2226,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 
 		page = list_first_entry(&area->free_list[fallback_mt],
 						struct page, lru);
-		if (can_steal &&
-			get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
-			steal_suitable_fallback(zone, page, start_migratetype);
 
-		/* Remove the page from the freelists */
-		area->nr_free--;
-		list_del(&page->lru);
-		rmv_page_order(page);
-
-		expand(zone, page, order, current_order, area,
-					start_migratetype);
-		/*
-		 * The pcppage_migratetype may differ from pageblock's
-		 * migratetype depending on the decisions in
-		 * find_suitable_fallback(). This is OK as long as it does not
-		 * differ for MIGRATE_CMA pageblocks. Those can be used as
-		 * fallback only via special __rmqueue_cma_fallback() function
-		 */
-		set_pcppage_migratetype(page, start_migratetype);
+		steal_suitable_fallback(zone, page, start_migratetype,
+								can_steal);
 
 		trace_mm_page_alloc_extfrag(page, order, current_order,
 			start_migratetype, fallback_mt);
 
-		return page;
+		return true;
 	}
 
-	return NULL;
+	return false;
 }
 
 /*
2190/* 2242/*
@@ -2196,13 +2248,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
2196{ 2248{
2197 struct page *page; 2249 struct page *page;
2198 2250
2251retry:
2199 page = __rmqueue_smallest(zone, order, migratetype); 2252 page = __rmqueue_smallest(zone, order, migratetype);
2200 if (unlikely(!page)) { 2253 if (unlikely(!page)) {
2201 if (migratetype == MIGRATE_MOVABLE) 2254 if (migratetype == MIGRATE_MOVABLE)
2202 page = __rmqueue_cma_fallback(zone, order); 2255 page = __rmqueue_cma_fallback(zone, order);
2203 2256
2204 if (!page) 2257 if (!page && __rmqueue_fallback(zone, order, migratetype))
2205 page = __rmqueue_fallback(zone, order, migratetype); 2258 goto retry;
2206 } 2259 }
2207 2260
2208 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2261 trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -2219,9 +2272,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2219 int migratetype, bool cold) 2272 int migratetype, bool cold)
2220{ 2273{
2221 int i, alloced = 0; 2274 int i, alloced = 0;
2222 unsigned long flags;
2223 2275
2224 spin_lock_irqsave(&zone->lock, flags); 2276 spin_lock(&zone->lock);
2225 for (i = 0; i < count; ++i) { 2277 for (i = 0; i < count; ++i) {
2226 struct page *page = __rmqueue(zone, order, migratetype); 2278 struct page *page = __rmqueue(zone, order, migratetype);
2227 if (unlikely(page == NULL)) 2279 if (unlikely(page == NULL))
@@ -2257,7 +2309,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2257 * pages added to the pcp list. 2309 * pages added to the pcp list.
2258 */ 2310 */
2259 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2311 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2260 spin_unlock_irqrestore(&zone->lock, flags); 2312 spin_unlock(&zone->lock);
2261 return alloced; 2313 return alloced;
2262} 2314}
2263 2315
@@ -2485,25 +2537,22 @@ void free_hot_cold_page(struct page *page, bool cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
+	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
-	if (in_interrupt()) {
-		__free_pages_ok(page, 0);
-		return;
-	}
-
 	if (!free_pcp_prepare(page))
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	set_pcppage_migratetype(page, migratetype);
-	preempt_disable();
+	local_irq_save(flags);
+	__count_vm_event(PGFREE);
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
 	 * Free ISOLATE pages back to the allocator because they are being
-	 * offlined but treat RESERVE as movable pages so we can get those
+	 * offlined but treat HIGHATOMIC as movable pages so we can get those
	 * areas back if necessary. Otherwise, we may have to free
 	 * excessively into the page allocator
 	 */
@@ -2515,7 +2564,6 @@ void free_hot_cold_page(struct page *page, bool cold)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	__count_vm_event(PGFREE);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (!cold)
 		list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2529,7 +2577,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 	}
 
 out:
-	preempt_enable();
+	local_irq_restore(flags);
 }
 
 /*
@@ -2614,7 +2662,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 		for (; page < endpage; page += pageblock_nr_pages) {
 			int mt = get_pageblock_migratetype(page);
 			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
-				&& mt != MIGRATE_HIGHATOMIC)
+			    && !is_migrate_highatomic(mt))
 				set_pageblock_migratetype(page,
 							  MIGRATE_MOVABLE);
 		}
@@ -2654,8 +2702,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 {
 	struct page *page;
 
-	VM_BUG_ON(in_interrupt());
-
 	do {
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -2686,8 +2732,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct list_head *list;
 	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
+	unsigned long flags;
 
-	preempt_disable();
+	local_irq_save(flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
@@ -2695,7 +2742,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
 	}
-	preempt_enable();
+	local_irq_restore(flags);
 	return page;
 }
 
@@ -2711,7 +2758,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 
-	if (likely(order == 0) && !in_interrupt()) {
+	if (likely(order == 0)) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				gfp_flags, migratetype);
 		goto out;
@@ -3113,8 +3160,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
-	    debug_guardpage_minorder() > 0)
+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
 		return;
 
 	pr_warn("%s: ", current->comm);
@@ -3248,14 +3294,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		enum compact_priority prio, enum compact_result *compact_result)
 {
 	struct page *page;
+	unsigned int noreclaim_flag;
 
 	if (!order)
 		return NULL;
 
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
 									prio);
-	current->flags &= ~PF_MEMALLOC;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	if (*compact_result <= COMPACT_INACTIVE)
 		return NULL;
@@ -3402,12 +3449,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 {
 	struct reclaim_state reclaim_state;
 	int progress;
+	unsigned int noreclaim_flag;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
@@ -3417,7 +3465,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 
 	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
-	current->flags &= ~PF_MEMALLOC;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	cond_resched();
 
@@ -3525,19 +3573,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 }
 
 /*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
- * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round) and no_progress_loops (number of reclaim rounds without
- * any progress in a row) is considered as well as the reclaimable pages on the
- * applicable zone list (with a backoff mechanism which is a function of
- * no_progress_loops).
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
  *
  * Returns true if a retry is viable or false to enter the oom path.
  */
@@ -3582,13 +3623,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		bool wmark;
 
 		available = reclaimable = zone_reclaimable_pages(zone);
-		available -= DIV_ROUND_UP((*no_progress_loops) * available,
-					  MAX_RECLAIM_RETRIES);
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 
 		/*
-		 * Would the allocation succeed if we reclaimed the whole
-		 * available?
+		 * Would the allocation succeed if we reclaimed all
+		 * reclaimable pages?
 		 */
 		wmark = __zone_watermark_ok(zone, order, min_wmark,
 				ac_classzone_idx(ac), alloc_flags, available);
@@ -3639,6 +3678,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
 {
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
 	struct page *page = NULL;
 	unsigned int alloc_flags;
 	unsigned long did_some_progress;
@@ -3706,12 +3746,17 @@ retry_cpuset:
 
 	/*
 	 * For costly allocations, try direct compaction first, as it's likely
-	 * that we have enough base pages and don't need to reclaim. Don't try
-	 * that for allocations that are allowed to ignore watermarks, as the
-	 * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+	 * that we have enough base pages and don't need to reclaim. For non-
+	 * movable high-order allocations, do that as well, as compaction will
+	 * try prevent permanent fragmentation by migrating from blocks of the
+	 * same migratetype.
+	 * Don't try this for allocations that are allowed to ignore
+	 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
 	 */
-	if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
-		!gfp_pfmemalloc_allowed(gfp_mask)) {
+	if (can_direct_reclaim &&
+			(costly_order ||
+			   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
+			&& !gfp_pfmemalloc_allowed(gfp_mask)) {
 		page = __alloc_pages_direct_compact(gfp_mask, order,
 						alloc_flags, ac,
 						INIT_COMPACT_PRIORITY,
@@ -3723,7 +3768,7 @@ retry_cpuset:
 		 * Checks for costly allocations with __GFP_NORETRY, which
 		 * includes THP page fault allocations
 		 */
-		if (gfp_mask & __GFP_NORETRY) {
+		if (costly_order && (gfp_mask & __GFP_NORETRY)) {
 			/*
 			 * If compaction is deferred for high-order allocations,
 			 * it is because sync compaction recently failed. If
@@ -3774,7 +3819,7 @@ retry:
 
 	/* Make sure we know about allocations which stall for too long */
 	if (time_after(jiffies, alloc_start + stall_timeout)) {
-		warn_alloc(gfp_mask, ac->nodemask,
+		warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
 			"page allocation stalls for %ums, order:%u",
 			jiffies_to_msecs(jiffies-alloc_start), order);
 		stall_timeout += 10 * HZ;
@@ -3804,7 +3849,7 @@ retry:
 	 * Do not retry costly high order allocations unless they are
 	 * __GFP_REPEAT
 	 */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+	if (costly_order && !(gfp_mask & __GFP_REPEAT))
 		goto nopage;
 
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3836,7 +3881,9 @@ retry:
 		goto got_pg;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE))
+	if (test_thread_flag(TIF_MEMDIE) &&
+	    (alloc_flags == ALLOC_NO_WATERMARKS ||
+	     (gfp_mask & __GFP_NOMEMALLOC)))
 		goto nopage;
 
 	/* Retry as long as the OOM killer is making progress */
@@ -3974,10 +4021,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		goto out;
 
 	/*
-	 * Runtime PM, block IO and its error handling path can deadlock
-	 * because I/O on the device might not complete.
+	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+	 * resp. GFP_NOIO which has to be inherited for all allocation requests
+	 * from a particular context which has been marked by
+	 * memalloc_no{fs,io}_{save,restore}.
 	 */
-	alloc_mask = memalloc_noio_flags(gfp_mask);
+	alloc_mask = current_gfp_context(gfp_mask);
 	ac.spread_dirty_pages = false;
 
 	/*
@@ -4250,7 +4299,8 @@ EXPORT_SYMBOL(free_pages_exact);
  * nr_free_zone_pages() counts the number of counts pages which are beyond the
  * high watermark within all zones at or below a given zone index. For each
  * zone, the number of pages is calculated as:
- *     managed_pages - high_pages
+ *
+ *     nr_free_zone_pages = managed_pages - high_pages
  */
 static unsigned long nr_free_zone_pages(int offset)
 {
@@ -4512,7 +4562,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
 			" writeback_tmp:%lukB"
 			" unstable:%lukB"
-			" pages_scanned:%lu"
 			" all_unreclaimable? %s"
 			"\n",
 			pgdat->node_id,
@@ -4535,8 +4584,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
-			node_page_state(pgdat, NR_PAGES_SCANNED),
-			!pgdat_reclaimable(pgdat) ? "yes" : "no");
+			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+				"yes" : "no");
 	}
 
 	for_each_populated_zone(zone) {
@@ -6100,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
 
-	reset_deferred_meminit(pgdat);
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	pgdat->per_cpu_nodestats = NULL;
@@ -6122,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		(unsigned long)pgdat->node_mem_map);
 #endif
 
+	reset_deferred_meminit(pgdat);
 	free_area_init_core(pgdat);
 }
 
@@ -7431,7 +7480,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.zone = page_zone(pfn_to_page(start)),
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
-		.gfp_mask = memalloc_noio_flags(gfp_mask),
+		.gfp_mask = current_gfp_context(gfp_mask),
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 