Diffstat (limited to 'mm/page_alloc.c')
 -rw-r--r--  mm/page_alloc.c  253
 1 file changed, 190 insertions(+), 63 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1b..794e6715c226 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
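
The hunk above only introduces the counter; the consumer that subtracts it when sizing dirtyable memory lives in mm/page-writeback.c and is not part of this diff. A minimal user-space sketch of the intended arithmetic, using made-up figures:

#include <stdio.h>

/* Hypothetical figures, in pages, standing in for the kernel globals. */
static unsigned long totalram_pages        = 2097152; /* 8 GB of 4 KB pages       */
static unsigned long dirty_balance_reserve = 65536;   /* sum of per-zone reserves */
static unsigned int  dirty_ratio           = 20;      /* vm.dirty_ratio, percent  */

int main(void)
{
    /* Reserves are excluded before the percentage is applied. */
    unsigned long dirtyable   = totalram_pages - dirty_balance_reserve;
    unsigned long dirty_limit = dirtyable * dirty_ratio / 100;

    printf("dirtyable: %lu pages, global dirty limit: %lu pages\n",
           dirtyable, dirty_limit);
    return 0;
}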
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
     saved_gfp_mask = gfp_allowed_mask;
     gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+bool pm_suspended_storage(void)
+{
+    if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+        return false;
+    return true;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
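
pm_suspended_storage() only inspects gfp_allowed_mask, which pm_restrict_gfp_mask() (context above) strips GFP_IOFS from during suspend. A user-space rehearsal of that bit test, with illustrative bit values:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's gfp bits; the real values differ per tree. */
#define __GFP_IO 0x40u
#define __GFP_FS 0x80u
#define GFP_IOFS (__GFP_IO | __GFP_FS)

static unsigned int gfp_allowed_mask = 0xffu;  /* IO and FS allowed, as in normal operation */

static bool pm_suspended_storage(void)
{
    /* Storage is usable only while both the IO and FS bits are still allowed. */
    return (gfp_allowed_mask & GFP_IOFS) != GFP_IOFS;
}

int main(void)
{
    printf("before suspend: %d\n", pm_suspended_storage());  /* 0 */
    gfp_allowed_mask &= ~GFP_IOFS;  /* what pm_restrict_gfp_mask() does */
    printf("during suspend: %d\n", pm_suspended_storage());  /* 1 */
    return 0;
}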
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
         clear_highpage(page + i);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+    unsigned long res;
+
+    if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+        printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+        return 0;
+    }
+    _debug_guardpage_minorder = res;
+    printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+    return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+    __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+    __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
 static inline void set_page_order(struct page *page, int order)
 {
     set_page_private(page, order);
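
The __setup() hook above wires debug_guardpage_minorder up as a boot parameter, so with CONFIG_DEBUG_PAGEALLOC enabled something like debug_guardpage_minorder=1 on the kernel command line should take effect, while anything above MAX_ORDER / 2 is rejected. A user-space rehearsal of that validation (strtoul standing in for kstrtoul, MAX_ORDER assumed to be the common default of 11):

#include <stdio.h>
#include <stdlib.h>

#define MAX_ORDER 11  /* common default; a given config may differ */

/* User-space rehearsal of debug_guardpage_minorder_setup() above. */
static unsigned int parse_guardpage_minorder(const char *buf)
{
    char *end;
    unsigned long res = strtoul(buf, &end, 10);

    if (end == buf || res > MAX_ORDER / 2) {
        fprintf(stderr, "Bad debug_guardpage_minorder value\n");
        return 0;
    }
    return (unsigned int)res;
}

int main(void)
{
    printf("%u\n", parse_guardpage_minorder("2"));  /* accepted: 2 */
    printf("%u\n", parse_guardpage_minorder("9"));  /* rejected: above MAX_ORDER / 2 */
    return 0;
}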
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
     if (page_zone_id(page) != page_zone_id(buddy))
         return 0;
 
+    if (page_is_guard(buddy) && page_order(buddy) == order) {
+        VM_BUG_ON(page_count(buddy) != 0);
+        return 1;
+    }
+
     if (PageBuddy(buddy) && page_order(buddy) == order) {
         VM_BUG_ON(page_count(buddy) != 0);
         return 1;
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page,
         buddy = page + (buddy_idx - page_idx);
         if (!page_is_buddy(page, buddy, order))
             break;
-
-        /* Our buddy is free, merge with it and move up one order. */
-        list_del(&buddy->lru);
-        zone->free_area[order].nr_free--;
-        rmv_page_order(buddy);
+        /*
+         * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+         * merge with it and move up one order.
+         */
+        if (page_is_guard(buddy)) {
+            clear_page_guard_flag(buddy);
+            set_page_private(page, 0);
+            __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+        } else {
+            list_del(&buddy->lru);
+            zone->free_area[order].nr_free--;
+            rmv_page_order(buddy);
+        }
         combined_idx = buddy_idx & page_idx;
         page = page + (combined_idx - page_idx);
         page_idx = combined_idx;
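
When the buddy turns out to be a guard page it was never on a free list and its pages were already deducted from NR_FREE_PAGES in expand(), so the new branch credits them back instead of unlinking anything. The surrounding index arithmetic is unchanged; a small sketch of it (the helper that computes buddy_idx lives elsewhere in this file and is assumed here to be page_idx XOR (1 << order)):

#include <stdio.h>

/*
 * Buddy-index arithmetic used by the merge loop: XOR locates the buddy of a
 * block, AND gives the start of the combined next-order block. The kernel
 * helper that computes buddy_idx is assumed to do the XOR; it is not part of
 * this hunk.
 */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
    return page_idx ^ (1UL << order);
}

int main(void)
{
    unsigned long page_idx = 12;  /* order-2 block starting at page index 12 */
    unsigned int order = 2;

    unsigned long buddy_idx    = find_buddy_index(page_idx, order);  /* 8 */
    unsigned long combined_idx = buddy_idx & page_idx;               /* 8 */

    printf("buddy at %lu, merged order-%u block starts at %lu\n",
           buddy_idx, order + 1, combined_idx);
    return 0;
}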
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
     int i;
     int bad = 0;
 
-    trace_mm_page_free_direct(page, order);
+    trace_mm_page_free(page, order);
     kmemcheck_free_shadow(page, order);
 
     if (PageAnon(page))
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
     local_irq_restore(flags);
 }
 
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
-    if (order == 0) {
-        __ClearPageReserved(page);
-        set_page_count(page, 0);
-        set_page_refcounted(page);
-        __free_page(page);
-    } else {
-        int loop;
-
-        prefetchw(page);
-        for (loop = 0; loop < (1 << order); loop++) {
-            struct page *p = &page[loop];
+    unsigned int nr_pages = 1 << order;
+    unsigned int loop;
 
-            if (loop + 1 < (1 << order))
-                prefetchw(p + 1);
-            __ClearPageReserved(p);
-            set_page_count(p, 0);
-        }
+    prefetchw(page);
+    for (loop = 0; loop < nr_pages; loop++) {
+        struct page *p = &page[loop];
 
-        set_page_refcounted(page);
-        __free_pages(page, order);
+        if (loop + 1 < nr_pages)
+            prefetchw(p + 1);
+        __ClearPageReserved(p);
+        set_page_count(p, 0);
     }
+
+    set_page_refcounted(page);
+    __free_pages(page, order);
 }
 
 
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
         high--;
         size >>= 1;
         VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+        if (high < debug_guardpage_minorder()) {
+            /*
+             * Mark as guard pages (or page), that will allow to
+             * merge back to allocator when buddy will be freed.
+             * Corresponding page table entries will not be touched,
+             * pages will stay not present in virtual address space
+             */
+            INIT_LIST_HEAD(&page[size].lru);
+            set_page_guard_flag(&page[size]);
+            set_page_private(&page[size], high);
+            /* Guard pages are not available for any usage */
+            __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+            continue;
+        }
+#endif
         list_add(&page[size].lru, &area->free_list[migratetype]);
         area->nr_free++;
         set_page_order(&page[size], high);
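
A quick sketch of the split decision added above: when expand() carves an order-3 block down to an order-0 allocation with debug_guardpage_minorder() == 2, only the order-2 buddy reaches the free list; the lower-order buddies become guard pages and are subtracted from NR_FREE_PAGES.

#include <stdio.h>

/*
 * Sketch of the split decision in expand(): each halving produces a buddy of
 * order 'high'; with CONFIG_DEBUG_PAGEALLOC, buddies below the configured
 * minorder become guard pages instead of free-list entries.
 */
int main(void)
{
    unsigned int low = 0;       /* requested order                     */
    unsigned int high = 3;      /* order of the free block being split */
    unsigned int minorder = 2;  /* debug_guardpage_minorder()          */

    while (high > low) {
        high--;
        if (high < minorder)
            printf("order-%u buddy becomes guard pages\n", high);
        else
            printf("order-%u buddy goes to the free list\n", high);
    }
    return 0;
}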
@@ -1189,6 +1257,19 @@ out:
 }
 
 /*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+    struct page *page, *next;
+
+    list_for_each_entry_safe(page, next, list, lru) {
+        trace_mm_page_free_batched(page, cold);
+        free_hot_cold_page(page, cold);
+    }
+}
+
+/*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
  * Each sub-page must be freed individually.
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
     long min = mark;
     int o;
 
-    free_pages -= (1 << order) + 1;
+    free_pages -= (1 << order) - 1;
     if (alloc_flags & ALLOC_HIGH)
         min -= min / 2;
     if (alloc_flags & ALLOC_HARDER)
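
This is an off-by-two fix in the discount applied to free_pages before the watermark comparison: the request's own pages minus one are treated as no longer free, rather than the request plus one. The arithmetic for a few orders:

#include <stdio.h>

/* The discount applied to free_pages before the comparison, old vs. new. */
int main(void)
{
    unsigned int order;

    for (order = 0; order <= 3; order++) {
        long old_discount = (1L << order) + 1;  /* before this change */
        long new_discount = (1L << order) - 1;  /* after this change  */
        printf("order %u: old -%ld pages, new -%ld pages\n",
               order, old_discount, new_discount);
    }
    return 0;
}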
@@ -1645,6 +1726,35 @@ zonelist_scan:
         if ((alloc_flags & ALLOC_CPUSET) &&
             !cpuset_zone_allowed_softwall(zone, gfp_mask))
                 continue;
+        /*
+         * When allocating a page cache page for writing, we
+         * want to get it from a zone that is within its dirty
+         * limit, such that no single zone holds more than its
+         * proportional share of globally allowed dirty pages.
+         * The dirty limits take into account the zone's
+         * lowmem reserves and high watermark so that kswapd
+         * should be able to balance it without having to
+         * write pages from its LRU list.
+         *
+         * This may look like it could increase pressure on
+         * lower zones by failing allocations in higher zones
+         * before they are full. But the pages that do spill
+         * over are limited as the lower zones are protected
+         * by this very same mechanism. It should not become
+         * a practical burden to them.
+         *
+         * XXX: For now, allow allocations to potentially
+         * exceed the per-zone dirty limit in the slowpath
+         * (ALLOC_WMARK_LOW unset) before going into reclaim,
+         * which is important when on a NUMA setup the allowed
+         * zones are together not big enough to reach the
+         * global limit. The proper fix for these situations
+         * will require awareness of zones in the
+         * dirty-throttling and the flusher threads.
+         */
+        if ((alloc_flags & ALLOC_WMARK_LOW) &&
+            (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+            goto this_zone_full;
 
         BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
         if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
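
zone_dirty_ok() itself is not in this file; it is assumed to come from the companion change in mm/page-writeback.c. The comment's "proportional share" idea, reduced to arithmetic with made-up numbers (units are thousands of pages):

#include <stdio.h>

/*
 * Each zone is allowed a share of the global dirty limit proportional to its
 * share of dirtyable memory; the real zone_dirty_ok() is assumed, not shown.
 */
int main(void)
{
    unsigned long global_dirtyable   = 2000;           /* all zones together */
    unsigned long global_dirty_limit = global_dirtyable * 20 / 100;
    unsigned long zone_dirtyable[2]  = { 1500, 500 };  /* e.g. Normal, DMA32 */
    int i;

    for (i = 0; i < 2; i++) {
        unsigned long zone_limit =
            global_dirty_limit * zone_dirtyable[i] / global_dirtyable;
        printf("zone %d may hold up to %lu (of %lu) dirty pages\n",
               i, zone_limit, global_dirty_limit);
    }
    return 0;
}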
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
     unsigned int filter = SHOW_MEM_FILTER_NODES;
 
-    if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+    if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+        debug_guardpage_minorder() > 0)
         return;
 
     /*
@@ -1773,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                unsigned long did_some_progress,
                 unsigned long pages_reclaimed)
 {
     /* Do not loop if specifically requested */
     if (gfp_mask & __GFP_NORETRY)
         return 0;
 
+    /* Always retry if specifically requested */
+    if (gfp_mask & __GFP_NOFAIL)
+        return 1;
+
+    /*
+     * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+     * making forward progress without invoking OOM. Suspend also disables
+     * storage devices so kswapd will not help. Bail if we are suspending.
+     */
+    if (!did_some_progress && pm_suspended_storage())
+        return 0;
+
     /*
      * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
      * means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
     if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
         return 1;
 
-    /*
-     * Don't let big-order allocations loop unless the caller
-     * explicitly requests that.
-     */
-    if (gfp_mask & __GFP_NOFAIL)
-        return 1;
-
     return 0;
 }
 
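
Taken together with the previous hunk, the retry policy now reads: __GFP_NORETRY wins, then __GFP_NOFAIL, then the suspend bail-out, and only then the costly-order heuristics. A boolean reduction of that ordering (the gfp bits and PM state are plain flags here, unlike the kernel's):

#include <stdbool.h>
#include <stdio.h>

/*
 * Reduced decision order of should_alloc_retry() after this patch; the
 * costly-order and __GFP_REPEAT heuristics that follow in the real function
 * are not modelled.
 */
static bool should_retry(bool noretry, bool nofail,
                         bool made_progress, bool storage_suspended)
{
    if (noretry)
        return false;  /* __GFP_NORETRY is checked first           */
    if (nofail)
        return true;   /* __GFP_NOFAIL now short-circuits up front */
    if (!made_progress && storage_suspended)
        return false;  /* bail instead of looping during suspend   */
    return false;      /* remaining heuristics decide in the kernel */
}

int main(void)
{
    printf("%d\n", should_retry(false, false, false, true));  /* 0: give up      */
    printf("%d\n", should_retry(false, true,  false, true));  /* 1: NOFAIL retry */
    return 0;
}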
@@ -2196,7 +2313,8 @@ rebalance:
 
     /* Check if we should retry the allocation */
     pages_reclaimed += did_some_progress;
-    if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+    if (should_alloc_retry(gfp_mask, order, did_some_progress,
+                        pages_reclaimed)) {
         /* Wait for some write requests to complete then retry */
         wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
         goto rebalance;
@@ -2306,16 +2424,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-    int i = pagevec_count(pvec);
-
-    while (--i >= 0) {
-        trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-        free_hot_cold_page(pvec->pages[i], pvec->cold);
-    }
-}
-
 void __free_pages(struct page *page, unsigned int order)
 {
     if (put_page_testzero(page)) {
@@ -3385,25 +3493,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
         if (page_to_nid(page) != zone_to_nid(zone))
             continue;
 
-        /* Blocks with reserved pages will never free, skip them. */
-        block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-        if (pageblock_is_reserved(pfn, block_end_pfn))
-            continue;
-
         block_migratetype = get_pageblock_migratetype(page);
 
-        /* If this block is reserved, account for it */
-        if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-            reserve--;
-            continue;
-        }
+        /* Only test what is necessary when the reserves are not met */
+        if (reserve > 0) {
+            /*
+             * Blocks with reserved pages will never free, skip
+             * them.
+             */
+            block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+            if (pageblock_is_reserved(pfn, block_end_pfn))
+                continue;
 
-        /* Suitable for reserving if this block is movable */
-        if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-            set_pageblock_migratetype(page, MIGRATE_RESERVE);
-            move_freepages_block(zone, page, MIGRATE_RESERVE);
-            reserve--;
-            continue;
+            /* If this block is reserved, account for it */
+            if (block_migratetype == MIGRATE_RESERVE) {
+                reserve--;
+                continue;
+            }
+
+            /* Suitable for reserving if this block is movable */
+            if (block_migratetype == MIGRATE_MOVABLE) {
+                set_pageblock_migratetype(page,
+                            MIGRATE_RESERVE);
+                move_freepages_block(zone, page,
+                            MIGRATE_RESERVE);
+                reserve--;
+                continue;
+            }
         }
 
         /*
@@ -4734,8 +4850,19 @@ static void calculate_totalreserve_pages(void)
             if (max > zone->present_pages)
                 max = zone->present_pages;
             reserve_pages += max;
+            /*
+             * Lowmem reserves are not available to
+             * GFP_HIGHUSER page cache allocations and
+             * kswapd tries to balance zones to their high
+             * watermark. As a result, neither should be
+             * regarded as dirtyable memory, to prevent a
+             * situation where reclaim has to clean pages
+             * in order to balance the zones.
+             */
+            zone->dirty_balance_reserve = max;
         }
     }
+    dirty_balance_reserve = reserve_pages;
     totalreserve_pages = reserve_pages;
 }
 