Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  267
1 file changed, 219 insertions(+), 48 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6326c71b663..08b349931ebc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,6 +49,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/memory.h>
+#include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
 
@@ -475,6 +476,8 @@ static inline void __free_one_page(struct page *page,
 		int migratetype)
 {
 	unsigned long page_idx;
+	unsigned long combined_idx;
+	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +491,6 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		unsigned long combined_idx;
-		struct page *buddy;
-
 		buddy = __page_find_buddy(page, page_idx, order);
 		if (!page_is_buddy(page, buddy, order))
 			break;
@@ -505,8 +505,29 @@ static inline void __free_one_page(struct page *page,
 		order++;
 	}
 	set_page_order(page, order);
-	list_add(&page->lru,
-		&zone->free_area[order].free_list[migratetype]);
+
+	/*
+	 * If this is not the largest possible page, check if the buddy
+	 * of the next-highest order is free. If it is, it's possible
+	 * that pages are being freed that will coalesce soon. In case,
+	 * that is happening, add the free page to the tail of the list
+	 * so it's less likely to be used soon and more likely to be merged
+	 * as a higher order page
+	 */
+	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+		struct page *higher_page, *higher_buddy;
+		combined_idx = __find_combined_index(page_idx, order);
+		higher_page = page + combined_idx - page_idx;
+		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+			list_add_tail(&page->lru,
+				&zone->free_area[order].free_list[migratetype]);
+			goto out;
+		}
+	}
+
+	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
 	zone->free_area[order].nr_free++;
 }
 
@@ -599,20 +620,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 	spin_unlock(&zone->lock);
 }
 
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
 {
-	unsigned long flags;
 	int i;
 	int bad = 0;
-	int wasMlocked = __TestClearPageMlocked(page);
 
 	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0 ; i < (1 << order) ; ++i)
-		bad += free_pages_check(page + i);
+	for (i = 0; i < (1 << order); i++) {
+		struct page *pg = page + i;
+
+		if (PageAnon(pg))
+			pg->mapping = NULL;
+		bad += free_pages_check(pg);
+	}
 	if (bad)
-		return;
+		return false;
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +646,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
+	return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
+	int wasMlocked = __TestClearPageMlocked(page);
+
+	if (!free_pages_prepare(page, order))
+		return;
+
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -1107,21 +1142,9 @@ void free_hot_cold_page(struct page *page, int cold)
 	int migratetype;
 	int wasMlocked = __TestClearPageMlocked(page);
 
-	trace_mm_page_free_direct(page, 0);
-	kmemcheck_free_shadow(page, 0);
-
-	if (PageAnon(page))
-		page->mapping = NULL;
-	if (free_pages_check(page))
+	if (!free_pages_prepare(page, 0))
 		return;
 
-	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
-		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
-	}
-	arch_free_page(page, 0);
-	kernel_map_pages(page, 1, 0);
-
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
 	local_irq_save(flags);
@@ -1188,6 +1211,51 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+	unsigned int order;
+	unsigned long watermark;
+	struct zone *zone;
+
+	BUG_ON(!PageBuddy(page));
+
+	zone = page_zone(page);
+	order = page_order(page);
+
+	/* Obey watermarks as if the page was being allocated */
+	watermark = low_wmark_pages(zone) + (1 << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return 0;
+
+	/* Remove page from free list */
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+	rmv_page_order(page);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+	/* Split into individual pages */
+	set_page_refcounted(page);
+	split_page(page, order);
+
+	if (order >= pageblock_order - 1) {
+		struct page *endpage = page + (1 << order) - 1;
+		for (; page < endpage; page += pageblock_nr_pages)
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	}
+
+	return 1 << order;
+}
+
+/*
  * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
  * we cheat by calling it from here, in the order > 0 path. Saves a branch
  * or two.
@@ -1693,6 +1761,62 @@ out:
 	return page;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	struct page *page;
+
+	if (!order || compaction_deferred(preferred_zone))
+		return NULL;
+
+	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+								nodemask);
+	if (*did_some_progress != COMPACT_SKIPPED) {
+
+		/* Page migration frees to the PCP lists but we want merging */
+		drain_pages(get_cpu());
+		put_cpu();
+
+		page = get_page_from_freelist(gfp_mask, nodemask,
+				order, zonelist, high_zoneidx,
+				alloc_flags, preferred_zone,
+				migratetype);
+		if (page) {
+			preferred_zone->compact_considered = 0;
+			preferred_zone->compact_defer_shift = 0;
+			count_vm_event(COMPACTSUCCESS);
+			return page;
+		}
+
+		/*
+		 * It's bad if compaction run occurs and fails.
+		 * The most likely reason is that pages exist,
+		 * but not enough to satisfy watermarks.
+		 */
+		count_vm_event(COMPACTFAIL);
+		defer_compaction(preferred_zone);
+
+		cond_resched();
+	}
+
+	return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1879,6 +2003,15 @@ rebalance:
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
 		goto nopage;
 
+	/* Try direct compaction */
+	page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress);
+	if (page)
+		goto got_pg;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -1970,10 +2103,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
 	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
-	if (!preferred_zone)
+	if (!preferred_zone) {
+		put_mems_allowed();
 		return NULL;
+	}
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2119,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	put_mems_allowed();
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 	return page;
@@ -2434,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 			strncpy((char*)table->data, saved_string,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
-		} else if (oldval != user_zonelist_order)
-			build_all_zonelists();
+		} else if (oldval != user_zonelist_order) {
+			mutex_lock(&zonelists_mutex);
+			build_all_zonelists(NULL);
+			mutex_unlock(&zonelists_mutex);
+		}
 	}
 out:
 	mutex_unlock(&zl_order_mutex);
@@ -2582,7 +2722,7 @@ static int default_zonelist_order(void)
 	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
 	 * If they are really small and used heavily, the system can fall
 	 * into OOM very easily.
-	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
 	 */
 	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
 	low_kmem_size = 0;
@@ -2594,6 +2734,15 @@ static int default_zonelist_order(void)
 				if (zone_type < ZONE_NORMAL)
 					low_kmem_size += z->present_pages;
 				total_size += z->present_pages;
+			} else if (zone_type == ZONE_NORMAL) {
+				/*
+				 * If any node has only lowmem, then node order
+				 * is preferred to allow kernel allocations
+				 * locally; otherwise, they can easily infringe
+				 * on other nodes when there is an abundance of
+				 * lowmem available to allocate from.
+				 */
+				return ZONELIST_ORDER_NODE;
 			}
 		}
 	}
@@ -2776,9 +2925,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  */
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
 
 /* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
 {
 	int nid;
 	int cpu;
@@ -2793,6 +2949,14 @@ static int __build_all_zonelists(void *dummy)
 		build_zonelist_cache(pgdat);
 	}
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* Setup real pagesets for the new zone */
+	if (data) {
+		struct zone *zone = data;
+		setup_zone_pageset(zone);
+	}
+#endif
+
 	/*
 	 * Initialize the boot_pagesets that are going to be used
 	 * for bootstrapping processors. The real pagesets for
@@ -2812,7 +2976,11 @@ static int __build_all_zonelists(void *dummy)
 	return 0;
 }
 
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
 {
 	set_zonelist_order();
 
@@ -2823,7 +2991,7 @@ void build_all_zonelists(void)
 	} else {
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
-		stop_machine(__build_all_zonelists, NULL, NULL);
+		stop_machine(__build_all_zonelists, data, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
@@ -3146,31 +3314,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
 	pcp->batch = PAGE_SHIFT * 8;
 }
 
+static __meminit void setup_zone_pageset(struct zone *zone)
+{
+	int cpu;
+
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+		setup_pageset(pcp, zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(pcp,
+				(zone->present_pages /
+					percpu_pagelist_fraction));
+	}
+}
+
 /*
  * Allocate per cpu pagesets and initialize them.
  * Before this call only boot pagesets were available.
- * Boot pagesets will no longer be used by this processorr
- * after setup_per_cpu_pageset().
  */
 void __init setup_per_cpu_pageset(void)
 {
 	struct zone *zone;
-	int cpu;
 
-	for_each_populated_zone(zone) {
-		zone->pageset = alloc_percpu(struct per_cpu_pageset);
-
-		for_each_possible_cpu(cpu) {
-			struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
-
-			setup_pageset(pcp, zone_batchsize(zone));
-
-			if (percpu_pagelist_fraction)
-				setup_pagelist_highmark(pcp,
-					(zone->present_pages /
-						percpu_pagelist_fraction));
-		}
-	}
+	for_each_populated_zone(zone)
+		setup_zone_pageset(zone);
 }
 
 static noinline __init_refok