Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	253
1 file changed, 190 insertions(+), 63 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1b..794e6715c226 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
 	saved_gfp_mask = gfp_allowed_mask;
 	gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+bool pm_suspended_storage(void)
+{
+	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+		return false;
+	return true;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
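
The new pm_suspended_storage() helper only inspects gfp_allowed_mask: once pm_restrict_gfp_mask() has cleared the GFP_IOFS bits for suspend, it reports that storage is unavailable. A minimal userspace sketch of that bitmask protocol, with made-up flag values standing in for the kernel's real GFP encodings:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values; the kernel's GFP bit encodings differ. */
#define GFP_IO   0x01u
#define GFP_FS   0x02u
#define GFP_IOFS (GFP_IO | GFP_FS)

static unsigned int gfp_allowed_mask = GFP_IOFS;	/* boot default: IO/FS allowed */
static unsigned int saved_gfp_mask;

static void pm_restrict_gfp_mask(void)
{
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~GFP_IOFS;
}

static void pm_restore_gfp_mask(void)
{
	gfp_allowed_mask = saved_gfp_mask;
}

static bool pm_suspended_storage(void)
{
	/* Storage is usable only while both IO and FS bits are still set. */
	return (gfp_allowed_mask & GFP_IOFS) != GFP_IOFS;
}

int main(void)
{
	printf("before suspend: %d\n", pm_suspended_storage());	/* 0 */
	pm_restrict_gfp_mask();
	printf("during suspend: %d\n", pm_suspended_storage());	/* 1 */
	pm_restore_gfp_mask();
	printf("after resume:   %d\n", pm_suspended_storage());	/* 0 */
	return 0;
}

The point is simply that a caller such as should_alloc_retry() further down can detect the suspend window from the existing mask, without any extra state.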
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+	unsigned long res;
+
+	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+		return 0;
+	}
+	_debug_guardpage_minorder = res;
+	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+	return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+	__set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+	__clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
 static inline void set_page_order(struct page *page, int order)
 {
 	set_page_private(page, order);
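
debug_guardpage_minorder= is an early-boot parameter; the setup hook rejects values that fail to parse or exceed MAX_ORDER / 2. A rough userspace equivalent of the parsing and range check, using strtoul in place of kstrtoul and a hard-coded MAX_ORDER of 11 (the common default, assumed here for illustration):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ORDER 11	/* common kernel default; configurable in reality */

static unsigned int _debug_guardpage_minorder;

static int debug_guardpage_minorder_setup(const char *buf)
{
	char *end;
	unsigned long res;

	errno = 0;
	res = strtoul(buf, &end, 10);
	if (errno || end == buf || *end != '\0' || res > MAX_ORDER / 2) {
		fprintf(stderr, "Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	printf("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}

int main(void)
{
	debug_guardpage_minorder_setup("3");	/* accepted: 3 <= 11 / 2 */
	debug_guardpage_minorder_setup("9");	/* rejected: 9 >  11 / 2 */
	return 0;
}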
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
 	if (page_zone_id(page) != page_zone_id(buddy))
 		return 0;
 
+	if (page_is_guard(buddy) && page_order(buddy) == order) {
+		VM_BUG_ON(page_count(buddy) != 0);
+		return 1;
+	}
+
 	if (PageBuddy(buddy) && page_order(buddy) == order) {
 		VM_BUG_ON(page_count(buddy) != 0);
 		return 1;
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page,
 		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
 			break;
-
-		/* Our buddy is free, merge with it and move up one order. */
-		list_del(&buddy->lru);
-		zone->free_area[order].nr_free--;
-		rmv_page_order(buddy);
+		/*
+		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+		 * merge with it and move up one order.
+		 */
+		if (page_is_guard(buddy)) {
+			clear_page_guard_flag(buddy);
+			set_page_private(page, 0);
+			__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+		} else {
+			list_del(&buddy->lru);
+			zone->free_area[order].nr_free--;
+			rmv_page_order(buddy);
+		}
 		combined_idx = buddy_idx & page_idx;
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
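
The merge loop above leans on the buddy index arithmetic already used by __free_one_page(): the buddy of a 2^order block differs from it only in bit 'order' of the page index, and the merged block starts at the lower of the two indices (combined_idx = buddy_idx & page_idx). A standalone worked example of just that arithmetic:

#include <stdio.h>

/* Buddy of the block at page_idx, for a block of 2^order pages. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* block of 4 pages at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = buddy_index(page_idx, order);	/* 12 ^ 4 = 8 */
	unsigned long combined_idx = buddy_idx & page_idx;	/* 8: start of the merged order-3 block */

	printf("page %lu, buddy %lu, merged block starts at %lu (order %u)\n",
	       page_idx, buddy_idx, combined_idx, order + 1);
	return 0;
}

With the new branch, a guarded buddy is absorbed by clearing its guard flag and crediting NR_FREE_PAGES, since a guard block was never on a free list to unlink.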
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 	int i;
 	int bad = 0;
 
-	trace_mm_page_free_direct(page, order);
+	trace_mm_page_free(page, order);
 	kmemcheck_free_shadow(page, order);
 
 	if (PageAnon(page))
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
-	if (order == 0) {
-		__ClearPageReserved(page);
-		set_page_count(page, 0);
-		set_page_refcounted(page);
-		__free_page(page);
-	} else {
-		int loop;
-
-		prefetchw(page);
-		for (loop = 0; loop < (1 << order); loop++) {
-			struct page *p = &page[loop];
+	unsigned int nr_pages = 1 << order;
+	unsigned int loop;
 
-			if (loop + 1 < (1 << order))
-				prefetchw(p + 1);
-			__ClearPageReserved(p);
-			set_page_count(p, 0);
-		}
+	prefetchw(page);
+	for (loop = 0; loop < nr_pages; loop++) {
+		struct page *p = &page[loop];
 
-		set_page_refcounted(page);
-		__free_pages(page, order);
+		if (loop + 1 < nr_pages)
+			prefetchw(p + 1);
+		__ClearPageReserved(p);
+		set_page_count(p, 0);
 	}
+
+	set_page_refcounted(page);
+	__free_pages(page, order);
 }
 
 
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
 		high--;
 		size >>= 1;
 		VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if (high < debug_guardpage_minorder()) {
+			/*
+			 * Mark as guard pages (or page), that will allow to
+			 * merge back to allocator when buddy will be freed.
+			 * Corresponding page table entries will not be touched,
+			 * pages will stay not present in virtual address space
+			 */
+			INIT_LIST_HEAD(&page[size].lru);
+			set_page_guard_flag(&page[size]);
+			set_page_private(&page[size], high);
+			/* Guard pages are not available for any usage */
+			__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+			continue;
+		}
+#endif
 		list_add(&page[size].lru, &area->free_list[migratetype]);
 		area->nr_free++;
 		set_page_order(&page[size], high);
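
Together with the merge path in __free_one_page(), this keeps the NR_FREE_PAGES accounting balanced: expand() debits 1 << high when a split remainder becomes a guard block, and the merge path credits the same amount back when that guard buddy is later absorbed. A simplified userspace model of the split decision only, with a fixed threshold standing in for debug_guardpage_minorder():

#include <stdio.h>

static unsigned int guard_minorder = 2;	/* models debug_guardpage_minorder() */

/*
 * Split an order-'high' block down to order-'low': each remainder either
 * goes back on a free list or, below the guard threshold, becomes an
 * unusable guard block that must be subtracted from NR_FREE_PAGES.
 */
static long expand(unsigned int low, unsigned int high)
{
	long guarded = 0;

	while (high > low) {
		high--;
		if (high < guard_minorder) {
			/* Guard pages are not available for any usage. */
			guarded += 1L << high;
			printf("order %u remainder -> guard block\n", high);
			continue;
		}
		printf("order %u remainder -> free list\n", high);
	}
	return guarded;
}

int main(void)
{
	/* Allocate order 0 out of an order-4 block with minorder = 2. */
	long guarded = expand(0, 4);

	printf("%ld pages diverted to guards (debited from NR_FREE_PAGES)\n",
	       guarded);
	return 0;
}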
@@ -1189,6 +1257,19 @@ out:
 }
 
 /*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		trace_mm_page_free_batched(page, cold);
+		free_hot_cold_page(page, cold);
+	}
+}
+
+/*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
  * Each sub-page must be freed individually.
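
free_hot_cold_page_list() takes over from the pagevec-based __pagevec_free() that is deleted later in this patch: callers hand over a plain list and every entry goes through the per-CPU hot/cold free path. The _safe list walk matters because freeing an entry invalidates it; a userspace sketch of the same idiom, with a hand-rolled list standing in for struct list_head and list_for_each_entry_safe:

#include <stdio.h>
#include <stdlib.h>

struct page {
	int nr;
	struct page *next;
};

static void free_page_entry(struct page *p)
{
	printf("freeing page %d\n", p->nr);
	free(p);	/* invalidates p, hence the saved 'next' pointer */
}

static void free_page_list(struct page *head)
{
	struct page *p, *next;

	/* Grab 'next' before freeing, exactly what the _safe variant does. */
	for (p = head; p != NULL; p = next) {
		next = p->next;
		free_page_entry(p);
	}
}

int main(void)
{
	struct page *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct page *p = malloc(sizeof(*p));

		if (!p)
			return 1;
		p->nr = i;
		p->next = head;
		head = p;
	}
	free_page_list(head);
	return 0;
}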
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	long min = mark;
 	int o;
 
-	free_pages -= (1 << order) + 1;
+	free_pages -= (1 << order) - 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
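
This one-character change shifts the estimate of remaining free pages by two for every request; the reasoning behind the exact adjustment is in the commit message rather than in this hunk, so the snippet below only illustrates how much the estimate moves, not the rest of the __zone_watermark_ok() check:

#include <stdio.h>

/* Free pages assumed to remain, under the old and new adjustments. */
static long estimate_old(long free_pages, unsigned int order)
{
	return free_pages - ((1L << order) + 1);
}

static long estimate_new(long free_pages, unsigned int order)
{
	return free_pages - ((1L << order) - 1);
}

int main(void)
{
	long free_pages = 128;

	for (unsigned int order = 0; order <= 3; order++)
		printf("order %u: old estimate %ld, new estimate %ld\n", order,
		       estimate_old(free_pages, order),
		       estimate_new(free_pages, order));
	return 0;
}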
@@ -1645,6 +1726,35 @@ zonelist_scan:
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				continue;
+		/*
+		 * When allocating a page cache page for writing, we
+		 * want to get it from a zone that is within its dirty
+		 * limit, such that no single zone holds more than its
+		 * proportional share of globally allowed dirty pages.
+		 * The dirty limits take into account the zone's
+		 * lowmem reserves and high watermark so that kswapd
+		 * should be able to balance it without having to
+		 * write pages from its LRU list.
+		 *
+		 * This may look like it could increase pressure on
+		 * lower zones by failing allocations in higher zones
+		 * before they are full. But the pages that do spill
+		 * over are limited as the lower zones are protected
+		 * by this very same mechanism. It should not become
+		 * a practical burden to them.
+		 *
+		 * XXX: For now, allow allocations to potentially
+		 * exceed the per-zone dirty limit in the slowpath
+		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
+		 * which is important when on a NUMA setup the allowed
+		 * zones are together not big enough to reach the
+		 * global limit. The proper fix for these situations
+		 * will require awareness of zones in the
+		 * dirty-throttling and the flusher threads.
+		 */
+		if ((alloc_flags & ALLOC_WMARK_LOW) &&
+		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+			goto this_zone_full;
 
 		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
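
zone_dirty_ok() itself lives in mm/page-writeback.c and is not part of this hunk. As a rough sketch of the policy the comment describes, assuming a zone's dirtyable memory is its free plus reclaimable pages minus its dirty_balance_reserve, with the global dirty ratio applied to that amount (the real calculation differs in detail):

#include <stdbool.h>
#include <stdio.h>

struct zone_stats {
	long free_pages;		/* NR_FREE_PAGES */
	long reclaimable_pages;		/* file-backed, potentially dirtyable */
	long dirty_balance_reserve;	/* lowmem reserve + high watermark */
	long dirty_pages;		/* dirty + writeback, roughly */
};

static int vm_dirty_ratio = 20;	/* percent, as in /proc/sys/vm/dirty_ratio */

static bool zone_dirty_ok_sketch(const struct zone_stats *z)
{
	long dirtyable = z->free_pages + z->reclaimable_pages
			 - z->dirty_balance_reserve;
	long limit = dirtyable * vm_dirty_ratio / 100;

	return z->dirty_pages <= limit;
}

int main(void)
{
	struct zone_stats z = {
		.free_pages = 20000,
		.reclaimable_pages = 60000,
		.dirty_balance_reserve = 10000,
		.dirty_pages = 15000,
	};

	/* limit = (20000 + 60000 - 10000) * 20% = 14000 -> over the limit */
	printf("zone within dirty limit: %s\n",
	       zone_dirty_ok_sketch(&z) ? "yes" : "no");
	return 0;
}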
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
 	unsigned int filter = SHOW_MEM_FILTER_NODES;
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+	    debug_guardpage_minorder() > 0)
 		return;
 
 	/*
@@ -1773,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+				unsigned long did_some_progress,
 				unsigned long pages_reclaimed)
 {
 	/* Do not loop if specifically requested */
 	if (gfp_mask & __GFP_NORETRY)
 		return 0;
 
+	/* Always retry if specifically requested */
+	if (gfp_mask & __GFP_NOFAIL)
+		return 1;
+
+	/*
+	 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+	 * making forward progress without invoking OOM. Suspend also disables
+	 * storage devices so kswapd will not help. Bail if we are suspending.
+	 */
+	if (!did_some_progress && pm_suspended_storage())
+		return 0;
+
 	/*
 	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
 	 * means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
 	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
 		return 1;
 
-	/*
-	 * Don't let big-order allocations loop unless the caller
-	 * explicitly requests that.
-	 */
-	if (gfp_mask & __GFP_NOFAIL)
-		return 1;
-
 	return 0;
 }
 
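
Read together, the two should_alloc_retry() hunks reorder the policy: bail out for __GFP_NORETRY, retry unconditionally for __GFP_NOFAIL (now checked up front instead of at the bottom), give up when reclaim made no progress while storage is suspended, and only then fall through to the costly-order and __GFP_REPEAT heuristics kept in the surrounding context. A condensed userspace restatement of that decision order, with stand-in flag values and a stubbed pm_suspended_storage():

#include <stdbool.h>
#include <stdio.h>

/* Stand-in flag values; the real masks live in gfp.h. */
#define __GFP_NORETRY	0x1u
#define __GFP_NOFAIL	0x2u
#define __GFP_REPEAT	0x4u
#define PAGE_ALLOC_COSTLY_ORDER	3

static bool pm_suspended_storage(void) { return false; }	/* stub */

static int should_alloc_retry(unsigned int gfp_mask, unsigned int order,
			      unsigned long did_some_progress,
			      unsigned long pages_reclaimed)
{
	if (gfp_mask & __GFP_NORETRY)
		return 0;
	if (gfp_mask & __GFP_NOFAIL)
		return 1;
	if (!did_some_progress && pm_suspended_storage())
		return 0;
	/* Small orders are retried implicitly, larger ones only on request. */
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return 1;
	if ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return 1;
	return 0;
}

int main(void)
{
	printf("%d\n", should_alloc_retry(__GFP_NORETRY, 0, 1, 0));	/* 0 */
	printf("%d\n", should_alloc_retry(__GFP_NOFAIL, 9, 0, 0));	/* 1 */
	printf("%d\n", should_alloc_retry(0, 2, 0, 0));			/* 1 */
	printf("%d\n", should_alloc_retry(__GFP_REPEAT, 5, 1, 8));	/* 1: 8 < 32 */
	return 0;
}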
@@ -2196,7 +2313,8 @@ rebalance:
 
 	/* Check if we should retry the allocation */
 	pages_reclaimed += did_some_progress;
-	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+	if (should_alloc_retry(gfp_mask, order, did_some_progress,
+						pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
@@ -2306,16 +2424,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-	int i = pagevec_count(pvec);
-
-	while (--i >= 0) {
-		trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-		free_hot_cold_page(pvec->pages[i], pvec->cold);
-	}
-}
-
 void __free_pages(struct page *page, unsigned int order)
 {
 	if (put_page_testzero(page)) {
@@ -3385,25 +3493,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 		if (page_to_nid(page) != zone_to_nid(zone))
 			continue;
 
-		/* Blocks with reserved pages will never free, skip them. */
-		block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-		if (pageblock_is_reserved(pfn, block_end_pfn))
-			continue;
-
 		block_migratetype = get_pageblock_migratetype(page);
 
-		/* If this block is reserved, account for it */
-		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-			reserve--;
-			continue;
-		}
+		/* Only test what is necessary when the reserves are not met */
+		if (reserve > 0) {
+			/*
+			 * Blocks with reserved pages will never free, skip
+			 * them.
+			 */
+			block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+			if (pageblock_is_reserved(pfn, block_end_pfn))
+				continue;
 
-		/* Suitable for reserving if this block is movable */
-		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-			set_pageblock_migratetype(page, MIGRATE_RESERVE);
-			move_freepages_block(zone, page, MIGRATE_RESERVE);
-			reserve--;
-			continue;
+			/* If this block is reserved, account for it */
+			if (block_migratetype == MIGRATE_RESERVE) {
+				reserve--;
+				continue;
+			}
+
+			/* Suitable for reserving if this block is movable */
+			if (block_migratetype == MIGRATE_MOVABLE) {
+				set_pageblock_migratetype(page,
+							MIGRATE_RESERVE);
+				move_freepages_block(zone, page,
+							MIGRATE_RESERVE);
+				reserve--;
+				continue;
+			}
 		}
 
 		/*
@@ -4734,8 +4850,19 @@ static void calculate_totalreserve_pages(void)
 			if (max > zone->present_pages)
 				max = zone->present_pages;
 			reserve_pages += max;
+			/*
+			 * Lowmem reserves are not available to
+			 * GFP_HIGHUSER page cache allocations and
+			 * kswapd tries to balance zones to their high
+			 * watermark. As a result, neither should be
+			 * regarded as dirtyable memory, to prevent a
+			 * situation where reclaim has to clean pages
+			 * in order to balance the zones.
+			 */
+			zone->dirty_balance_reserve = max;
 		}
 	}
+	dirty_balance_reserve = reserve_pages;
 	totalreserve_pages = reserve_pages;
 }
 
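
The per-zone values accumulate into the dirty_balance_reserve global introduced at the top of this patch; dirty-limit code elsewhere (mm/page-writeback.c, outside this diff) can then subtract it from what it treats as dirtyable memory. A toy illustration of the accumulation, and of how a global dirty threshold shrinks once the reserve is excluded; the zone sizes and the 20% ratio are invented for the example:

#include <stdio.h>

struct zone {
	const char *name;
	long present_pages;
	long lowmem_reserve_max;	/* max over the lowmem_reserve[] array */
	long high_wmark;
	long dirty_balance_reserve;	/* filled in below */
};

int main(void)
{
	struct zone zones[] = {
		{ "DMA",    4000,   3800,  100, 0 },
		{ "Normal", 200000, 30000, 2000, 0 },
	};
	long total_pages = 0, dirty_balance_reserve = 0;
	int vm_dirty_ratio = 20;	/* percent */

	for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
		long max = zones[i].lowmem_reserve_max + zones[i].high_wmark;

		/* Clamp to the zone size, as calculate_totalreserve_pages() does. */
		if (max > zones[i].present_pages)
			max = zones[i].present_pages;
		zones[i].dirty_balance_reserve = max;
		dirty_balance_reserve += max;
		total_pages += zones[i].present_pages;
	}

	printf("dirty limit without reserve: %ld pages\n",
	       total_pages * vm_dirty_ratio / 100);
	printf("dirty limit with reserve:    %ld pages\n",
	       (total_pages - dirty_balance_reserve) * vm_dirty_ratio / 100);
	return 0;
}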