Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 497 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 410 insertions(+), 87 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a712fb9e04ce..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/migrate.h>
 #include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
@@ -513,10 +514,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * free pages of length of (1 << order) and marked with _mapcount -2. Page's
  * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
- * other. That is, if we allocate a small block, and both were   
- * free, the remainder of the region must be split into blocks.   
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
  * If a block is freed, and its buddy is also free, then this
- * triggers coalescing into a block of larger size.            
+ * triggers coalescing into a block of larger size.
  *
  * -- wli
  */
@@ -749,6 +750,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 	__free_pages(page, order);
 }
 
+#ifdef CONFIG_CMA
+/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+	unsigned i = pageblock_nr_pages;
+	struct page *p = page;
+
+	do {
+		__ClearPageReserved(p);
+		set_page_count(p, 0);
+	} while (++p, --i);
+
+	set_page_refcounted(page);
+	set_pageblock_migratetype(page, MIGRATE_CMA);
+	__free_pages(page, pageblock_order);
+	totalram_pages += pageblock_nr_pages;
+}
+#endif
 
 /*
  * The order of subdivision here is critical for the IO subsystem.
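The hunk above hands an entire reserved pageblock over to the allocator as CMA memory: every page loses its reserved flag and refcount, then only the head page is re-referenced and freed at pageblock_order. A rough standalone sketch of that loop shape (the struct, constant and values below are stand-ins, not kernel definitions):

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 16	/* stand-in for pageblock_nr_pages */

struct fake_page { int reserved; int refcount; };

/* Clear per-page state across the block, then reference only the head page. */
static void release_pageblock(struct fake_page *block)
{
	unsigned int i = PAGEBLOCK_NR_PAGES;
	struct fake_page *p = block;

	do {			/* same "++p, --i" walk as the hunk above */
		p->reserved = 0;
		p->refcount = 0;
	} while (++p, --i);

	block->refcount = 1;	/* head page gets the single reference */
	printf("pageblock of %d pages released as one unit\n", PAGEBLOCK_NR_PAGES);
}

int main(void)
{
	struct fake_page block[PAGEBLOCK_NR_PAGES] = { { 1, 1 } };

	release_pageblock(block);
	return 0;
}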
@@ -874,11 +893,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  * This array describes the order lists are fallen back to when
  * the free lists for the desirable migrate type are depleted
  */
-static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
-	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
-	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
-	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
-	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
+static int fallbacks[MIGRATE_TYPES][4] = {
+	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+#ifdef CONFIG_CMA
+	[MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+	[MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
+#else
+	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
+#endif
+	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
+	[MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
 };
 
 /*
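The widened fallbacks[][4] table works because every row ends in MIGRATE_RESERVE, which the rewritten loop in the next hunk treats as a terminator rather than something to skip. A self-contained sketch of that table-plus-sentinel walk, with invented enum values standing in for the kernel's migrate types:

#include <stdio.h>

enum migratetype { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA, RESERVE, NR_TYPES };

/* Mirrors the shape of the kernel table: each row ends in RESERVE. */
static const enum migratetype fallback[NR_TYPES][4] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
	[RECLAIMABLE] = { UNMOVABLE,   MOVABLE, RESERVE },
	[MOVABLE]     = { CMA, RECLAIMABLE, UNMOVABLE, RESERVE },
	[CMA]         = { RESERVE },
	[RESERVE]     = { RESERVE },
};

/* Walk a row until the RESERVE sentinel, the way the new for (;;) loop does. */
static void print_fallbacks(enum migratetype start)
{
	for (int i = 0; ; i++) {
		enum migratetype mt = fallback[start][i];
		if (mt == RESERVE)
			break;		/* was "continue" before the patch */
		printf("try type %d\n", mt);
	}
	printf("fall back to MIGRATE_RESERVE handling\n");
}

int main(void)
{
	print_fallbacks(MOVABLE);
	return 0;
}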
@@ -973,12 +998,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 	/* Find the largest possible block of pages in the other list */
 	for (current_order = MAX_ORDER-1; current_order >= order;
 						--current_order) {
-		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
+		for (i = 0;; i++) {
 			migratetype = fallbacks[start_migratetype][i];
 
 			/* MIGRATE_RESERVE handled later if necessary */
 			if (migratetype == MIGRATE_RESERVE)
-				continue;
+				break;
 
 			area = &(zone->free_area[current_order]);
 			if (list_empty(&area->free_list[migratetype]))
@@ -993,11 +1018,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 			 * pages to the preferred allocation list. If falling
 			 * back for a reclaimable kernel allocation, be more
 			 * aggressive about taking ownership of free pages
+			 *
+			 * On the other hand, never change migration
+			 * type of MIGRATE_CMA pageblocks nor move CMA
+			 * pages on different free lists. We don't
+			 * want unmovable pages to be allocated from
+			 * MIGRATE_CMA areas.
 			 */
-			if (unlikely(current_order >= (pageblock_order >> 1)) ||
-					start_migratetype == MIGRATE_RECLAIMABLE ||
-					page_group_by_mobility_disabled) {
-				unsigned long pages;
+			if (!is_migrate_cma(migratetype) &&
+			    (unlikely(current_order >= pageblock_order / 2) ||
+			     start_migratetype == MIGRATE_RECLAIMABLE ||
+			     page_group_by_mobility_disabled)) {
+				int pages;
 				pages = move_freepages_block(zone, page,
 							start_migratetype);
 
@@ -1015,11 +1047,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 			rmv_page_order(page);
 
 			/* Take ownership for orders >= pageblock_order */
-			if (current_order >= pageblock_order)
+			if (current_order >= pageblock_order &&
+			    !is_migrate_cma(migratetype))
 				change_pageblock_range(page, current_order,
 							start_migratetype);
 
-			expand(zone, page, order, current_order, area, migratetype);
+			expand(zone, page, order, current_order, area,
+			       is_migrate_cma(migratetype)
+			     ? migratetype : start_migratetype);
 
 			trace_mm_page_alloc_extfrag(page, order, current_order,
 				start_migratetype, migratetype);
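The condition a few lines up decides when __rmqueue_fallback() may claim an entire pageblock for the requested migrate type; the new is_migrate_cma() check keeps MIGRATE_CMA blocks from ever changing ownership. A simplified standalone version of that decision (it drops the page_group_by_mobility_disabled term and uses a made-up PAGEBLOCK_ORDER):

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 10	/* stand-in value */

/*
 * Steal the whole pageblock when the grabbed buddy is at least half a
 * pageblock (or the request is reclaimable), but never from a CMA list.
 */
static bool should_claim_block(int current_order, bool reclaimable,
			       bool donor_is_cma)
{
	if (donor_is_cma)
		return false;
	return current_order >= PAGEBLOCK_ORDER / 2 || reclaimable;
}

int main(void)
{
	printf("%d\n", should_claim_block(9, false, false));	/* 1: big enough */
	printf("%d\n", should_claim_block(3, false, true));	/* 0: CMA donor  */
	return 0;
}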
@@ -1061,17 +1096,17 @@ retry_reserve:
 	return page;
 }
 
 /*
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency. Add them to the supplied list.
  * Returns the number of new pages which were placed at *list.
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
 			int migratetype, int cold)
 {
-	int i;
+	int mt = migratetype, i;
 
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype);
@@ -1091,7 +1126,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			list_add(&page->lru, list);
 		else
 			list_add_tail(&page->lru, list);
-		set_page_private(page, migratetype);
+		if (IS_ENABLED(CONFIG_CMA)) {
+			mt = get_pageblock_migratetype(page);
+			if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
+				mt = migratetype;
+		}
+		set_page_private(page, mt);
 		list = &page->lru;
 	}
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
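IS_ENABLED() lets the CMA branch above collapse to nothing when CONFIG_CMA is off while still being parsed and type-checked. The real macro lives in include/linux/kconfig.h; below is a cut-down userspace re-creation of the same preprocessor trick, for illustration only:

#include <stdio.h>

/*
 * When the option is defined to 1, the placeholder expands to "0,", shifting
 * a 1 into the second argument slot; otherwise the trailing 0 is picked.
 */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) __is_defined(option)

#define CONFIG_CMA 1	/* comment this out and the branch below prints 0 */

int main(void)
{
	if (IS_ENABLED(CONFIG_CMA))
		printf("CMA path compiled in and taken\n");
	else
		printf("CMA path skipped\n");
	return 0;
}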
@@ -1371,8 +1411,12 @@ int split_free_page(struct page *page)
 
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
-		for (; page < endpage; page += pageblock_nr_pages)
-			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+		for (; page < endpage; page += pageblock_nr_pages) {
+			int mt = get_pageblock_migratetype(page);
+			if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
+				set_pageblock_migratetype(page,
+							  MIGRATE_MOVABLE);
+		}
 	}
 
 	return 1 << order;
@@ -2086,16 +2130,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 }
 #endif /* CONFIG_COMPACTION */
 
-/* The really slow allocator path where we enter direct reclaim */
-static inline struct page *
-__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
-	struct zonelist *zonelist, enum zone_type high_zoneidx,
-	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress)
+/* Perform direct synchronous page reclaim */
+static int
+__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
+		  nodemask_t *nodemask)
 {
-	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
-	bool drained = false;
+	int progress;
 
 	cond_resched();
 
@@ -2106,7 +2147,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
 
-	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+	progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 
 	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
@@ -2114,6 +2155,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 
 	cond_resched();
 
+	return progress;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	struct page *page = NULL;
+	bool drained = false;
+
+	*did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+					       nodemask);
 	if (unlikely(!(*did_some_progress)))
 		return NULL;
 
@@ -4244,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
-/* Return a sensible default order for the pageblock size. */
-static inline int pageblock_default_order(void)
-{
-	if (HPAGE_SHIFT > PAGE_SHIFT)
-		return HUGETLB_PAGE_ORDER;
-
-	return MAX_ORDER-1;
-}
-
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(unsigned int order)
+static inline void __init set_pageblock_order(void)
 {
+	unsigned int order;
+
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
 		return;
 
+	if (HPAGE_SHIFT > PAGE_SHIFT)
+		order = HUGETLB_PAGE_ORDER;
+	else
+		order = MAX_ORDER - 1;
+
 	/*
 	 * Assume the largest contiguous order of interest is a huge page.
-	 * This value may be variable depending on boot parameters on IA64
+	 * This value may be variable depending on boot parameters on IA64 and
+	 * powerpc.
 	 */
 	pageblock_order = order;
 }
@@ -4270,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order)
 
 /*
  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * and pageblock_default_order() are unused as pageblock_order is set
- * at compile-time. See include/linux/pageblock-flags.h for the values of
- * pageblock_order based on the kernel config
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
  */
-static inline int pageblock_default_order(unsigned int order)
+static inline void set_pageblock_order(void)
 {
-	return MAX_ORDER-1;
 }
-#define set_pageblock_order(x) do {} while (0)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
@@ -4301,11 +4354,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, memmap_pages;
-		enum lru_list lru;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
 		realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4355,18 +4407,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		for_each_lru(lru)
-			INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
-		zone->reclaim_stat.recent_rotated[0] = 0;
-		zone->reclaim_stat.recent_rotated[1] = 0;
-		zone->reclaim_stat.recent_scanned[0] = 0;
-		zone->reclaim_stat.recent_scanned[1] = 0;
+		lruvec_init(&zone->lruvec, zone);
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
 		if (!size)
 			continue;
 
-		set_pageblock_order(pageblock_default_order());
+		set_pageblock_order();
 		setup_usemap(pgdat, zone, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn,
 						size, MEMMAP_EARLY);
@@ -4759,31 +4806,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	find_zone_movable_pfns_for_nodes();
 
 	/* Print out the zone ranges */
-	printk("Zone PFN ranges:\n");
+	printk("Zone ranges:\n");
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		if (i == ZONE_MOVABLE)
 			continue;
-		printk(" %-8s ", zone_names[i]);
+		printk(KERN_CONT " %-8s ", zone_names[i]);
 		if (arch_zone_lowest_possible_pfn[i] ==
 				arch_zone_highest_possible_pfn[i])
-			printk("empty\n");
+			printk(KERN_CONT "empty\n");
 		else
-			printk("%0#10lx -> %0#10lx\n",
-				arch_zone_lowest_possible_pfn[i],
-				arch_zone_highest_possible_pfn[i]);
+			printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
+				arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
+				(arch_zone_highest_possible_pfn[i]
+					<< PAGE_SHIFT) - 1);
 	}
 
 	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
-	printk("Movable zone start PFN for each node\n");
+	printk("Movable zone start for each node\n");
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (zone_movable_pfn[i])
-			printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
+			printk(" Node %d: %#010lx\n", i,
+			       zone_movable_pfn[i] << PAGE_SHIFT);
 	}
 
 	/* Print out the early_node_map[] */
-	printk("Early memory PFN ranges\n");
+	printk("Early memory node ranges\n");
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
-		printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
+		printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
+		       start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
 
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
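The reworked boot messages print byte addresses instead of raw PFNs, with an inclusive end address. A tiny standalone demonstration of the conversion used above, with PAGE_SHIFT hard-coded to 12 (4 KiB pages) for the example:

#include <stdio.h>

#define PAGE_SHIFT 12	/* arch-dependent in the kernel; fixed here */

int main(void)
{
	unsigned long start_pfn = 0x00010;
	unsigned long end_pfn   = 0x9fc00;	/* one past the last frame */

	/* PFN range [start_pfn, end_pfn) printed as an inclusive byte range */
	printf("  node   0: [mem %#010lx-%#010lx]\n",
	       start_pfn << PAGE_SHIFT,
	       (end_pfn << PAGE_SHIFT) - 1);
	return 0;
}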
@@ -4976,14 +5026,7 @@ static void setup_per_zone_lowmem_reserve(void)
 	calculate_totalreserve_pages();
 }
 
-/**
- * setup_per_zone_wmarks - called when min_free_kbytes changes
- * or when memory is hot-{added|removed}
- *
- * Ensures that the watermark[min,low,high] values for each zone are set
- * correctly with respect to min_free_kbytes.
- */
-void setup_per_zone_wmarks(void)
+static void __setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
@@ -5030,6 +5073,11 @@ void setup_per_zone_wmarks(void)
 
 		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+
+		zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
+		zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
+		zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
+
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
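For reference, a rough standalone sketch of the watermark arithmetic this hunk extends: low and high are offsets from min, and the CMA reservation is added on top of all three. The cma_reserve value here stands in for cma_wmark_pages(zone); the numbers are invented.

#include <stdio.h>

int main(void)
{
	unsigned long min = 1024;		/* zone share of pages_min */
	unsigned long cma_reserve = 256;	/* stand-in for cma_wmark_pages() */

	unsigned long low  = min + (min >> 2);	/* min + 25% */
	unsigned long high = min + (min >> 1);	/* min + 50% */

	/* bump every watermark by the CMA reservation, as the hunk does */
	min  += cma_reserve;
	low  += cma_reserve;
	high += cma_reserve;

	printf("min=%lu low=%lu high=%lu\n", min, low, high);
	return 0;
}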
@@ -5038,6 +5086,20 @@ void setup_per_zone_wmarks(void)
 	calculate_totalreserve_pages();
 }
 
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+	mutex_lock(&zonelists_mutex);
+	__setup_per_zone_wmarks();
+	mutex_unlock(&zonelists_mutex);
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has to
  * do too much work, but large enough that each inactive page has a chance
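The new wrapper applies a common split: the public function takes the lock, the double-underscore helper assumes it is already held (so callers like the CMA watermark code can batch work under one acquisition). The same shape in plain pthreads, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int table_value;

static void __recompute_table(int v)	/* caller must hold table_lock */
{
	table_value = v * 2;
}

void recompute_table(int v)		/* public entry point: locks itself */
{
	pthread_mutex_lock(&table_lock);
	__recompute_table(v);
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	recompute_table(21);
	printf("%d\n", table_value);
	return 0;
}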
@@ -5203,7 +5265,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 	int ret;
 
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (!write || (ret == -EINVAL))
+	if (!write || (ret < 0))
 		return ret;
 	for_each_populated_zone(zone) {
 		for_each_possible_cpu(cpu) {
@@ -5242,9 +5304,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 				     int flags,
 				     unsigned int *_hash_shift,
 				     unsigned int *_hash_mask,
-				     unsigned long limit)
+				     unsigned long low_limit,
+				     unsigned long high_limit)
 {
-	unsigned long long max = limit;
+	unsigned long long max = high_limit;
 	unsigned long log2qty, size;
 	void *table = NULL;
 
@@ -5282,6 +5345,8 @@ void *__init alloc_large_system_hash(const char *tablename,
 	}
 	max = min(max, 0x80000000ULL);
 
+	if (numentries < low_limit)
+		numentries = low_limit;
 	if (numentries > max)
 		numentries = max;
 
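With the extra low_limit parameter, the computed hash table size is now clamped from below as well as from above. The same clamp, written out as a small standalone helper:

#include <stdio.h>

static unsigned long clamp_entries(unsigned long numentries,
				   unsigned long low_limit,
				   unsigned long high_limit)
{
	if (numentries < low_limit)
		numentries = low_limit;
	if (numentries > high_limit)
		numentries = high_limit;
	return numentries;
}

int main(void)
{
	printf("%lu\n", clamp_entries(10, 16, 8192));		/* -> 16   */
	printf("%lu\n", clamp_entries(100000, 16, 8192));	/* -> 8192 */
	return 0;
}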
@@ -5412,14 +5477,16 @@ static int
 __count_immobile_pages(struct zone *zone, struct page *page, int count)
 {
 	unsigned long pfn, iter, found;
+	int mt;
+
 	/*
 	 * For avoiding noise data, lru_add_drain_all() should be called
 	 * If ZONE_MOVABLE, the zone never contains immobile pages
 	 */
 	if (zone_idx(zone) == ZONE_MOVABLE)
 		return true;
-
-	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+	mt = get_pageblock_migratetype(page);
+	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
 		return true;
 
 	pfn = page_to_pfn(page);
@@ -5536,7 +5603,7 @@ out:
 	return ret;
 }
 
-void unset_migratetype_isolate(struct page *page)
+void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 {
 	struct zone *zone;
 	unsigned long flags;
@@ -5544,12 +5611,264 @@ void unset_migratetype_isolate(struct page *page)
 	spin_lock_irqsave(&zone->lock, flags);
 	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 		goto out;
-	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-	move_freepages_block(zone, page, MIGRATE_MOVABLE);
+	set_pageblock_migratetype(page, migratetype);
+	move_freepages_block(zone, page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
+#ifdef CONFIG_CMA
+
+static unsigned long pfn_max_align_down(unsigned long pfn)
+{
+	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
+			     pageblock_nr_pages) - 1);
+}
+
+static unsigned long pfn_max_align_up(unsigned long pfn)
+{
+	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+				pageblock_nr_pages));
+}
+
+static struct page *
+__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
+			     int **resultp)
+{
+	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	return alloc_page(gfp_mask);
+}
+
+/* [start, end) must belong to a single zone. */
+static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
+{
+	/* This function is based on compact_zone() from compaction.c. */
+
+	unsigned long pfn = start;
+	unsigned int tries = 0;
+	int ret = 0;
+
+	struct compact_control cc = {
+		.nr_migratepages = 0,
+		.order = -1,
+		.zone = page_zone(pfn_to_page(start)),
+		.sync = true,
+	};
+	INIT_LIST_HEAD(&cc.migratepages);
+
+	migrate_prep_local();
+
+	while (pfn < end || !list_empty(&cc.migratepages)) {
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		if (list_empty(&cc.migratepages)) {
+			cc.nr_migratepages = 0;
+			pfn = isolate_migratepages_range(cc.zone, &cc,
+							 pfn, end);
+			if (!pfn) {
+				ret = -EINTR;
+				break;
+			}
+			tries = 0;
+		} else if (++tries == 5) {
+			ret = ret < 0 ? ret : -EBUSY;
+			break;
+		}
+
+		ret = migrate_pages(&cc.migratepages,
+				    __alloc_contig_migrate_alloc,
+				    0, false, MIGRATE_SYNC);
+	}
+
+	putback_lru_pages(&cc.migratepages);
+	return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Update zone's cma pages counter used for watermark level calculation.
+ */
+static inline void __update_cma_watermarks(struct zone *zone, int count)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&zone->lock, flags);
+	zone->min_cma_pages += count;
+	spin_unlock_irqrestore(&zone->lock, flags);
+	setup_per_zone_wmarks();
+}
+
+/*
+ * Trigger memory pressure bump to reclaim some pages in order to be able to
+ * allocate 'count' pages in single page units. Does similar work as
+ *__alloc_pages_slowpath() function.
+ */
+static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct zonelist *zonelist = node_zonelist(0, gfp_mask);
+	int did_some_progress = 0;
+	int order = 1;
+
+	/*
+	 * Increase level of watermarks to force kswapd do his job
+	 * to stabilise at new watermark level.
+	 */
+	__update_cma_watermarks(zone, count);
+
+	/* Obey watermarks as if the page was being allocated */
+	while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
+		wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
+
+		did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+						      NULL);
+		if (!did_some_progress) {
+			/* Exhausted what can be done so it's blamo time */
+			out_of_memory(zonelist, gfp_mask, order, NULL, false);
+		}
+	}
+
+	/* Restore original watermark levels. */
+	__update_cma_watermarks(zone, -count);
+
+	return count;
+}
+
+/**
+ * alloc_contig_range() -- tries to allocate given range of pages
+ * @start:	start PFN to allocate
+ * @end:	one-past-the-last PFN to allocate
+ * @migratetype:	migratetype of the underlaying pageblocks (either
+ *			#MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
+ *			in range must have the same migratetype and it must
+ *			be either of the two.
+ *
+ * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
+ * aligned, however it's the caller's responsibility to guarantee that
+ * we are the only thread that changes migrate type of pageblocks the
+ * pages fall in.
+ *
+ * The PFN range must belong to a single zone.
+ *
+ * Returns zero on success or negative error code. On success all
+ * pages which PFN is in [start, end) are allocated for the caller and
+ * need to be freed with free_contig_range().
+ */
+int alloc_contig_range(unsigned long start, unsigned long end,
+		       unsigned migratetype)
+{
+	struct zone *zone = page_zone(pfn_to_page(start));
+	unsigned long outer_start, outer_end;
+	int ret = 0, order;
+
+	/*
+	 * What we do here is we mark all pageblocks in range as
+	 * MIGRATE_ISOLATE. Because pageblock and max order pages may
+	 * have different sizes, and due to the way page allocator
+	 * work, we align the range to biggest of the two pages so
+	 * that page allocator won't try to merge buddies from
+	 * different pageblocks and change MIGRATE_ISOLATE to some
+	 * other migration type.
+	 *
+	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+	 * migrate the pages from an unaligned range (ie. pages that
+	 * we are interested in). This will put all the pages in
+	 * range back to page allocator as MIGRATE_ISOLATE.
+	 *
+	 * When this is done, we take the pages in range from page
+	 * allocator removing them from the buddy system. This way
+	 * page allocator will never consider using them.
+	 *
+	 * This lets us mark the pageblocks back as
+	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+	 * aligned range but not in the unaligned, original range are
+	 * put back to page allocator so that buddy can use them.
+	 */
+
+	ret = start_isolate_page_range(pfn_max_align_down(start),
+				       pfn_max_align_up(end), migratetype);
+	if (ret)
+		goto done;
+
+	ret = __alloc_contig_migrate_range(start, end);
+	if (ret)
+		goto done;
+
+	/*
+	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+	 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
+	 * more, all pages in [start, end) are free in page allocator.
+	 * What we are going to do is to allocate all pages from
+	 * [start, end) (that is remove them from page allocator).
+	 *
+	 * The only problem is that pages at the beginning and at the
+	 * end of interesting range may be not aligned with pages that
+	 * page allocator holds, ie. they can be part of higher order
+	 * pages. Because of this, we reserve the bigger range and
+	 * once this is done free the pages we are not interested in.
+	 *
+	 * We don't have to hold zone->lock here because the pages are
+	 * isolated thus they won't get removed from buddy.
+	 */
+
+	lru_add_drain_all();
+	drain_all_pages();
+
+	order = 0;
+	outer_start = start;
+	while (!PageBuddy(pfn_to_page(outer_start))) {
+		if (++order >= MAX_ORDER) {
+			ret = -EBUSY;
+			goto done;
+		}
+		outer_start &= ~0UL << order;
+	}
+
+	/* Make sure the range is really isolated. */
+	if (test_pages_isolated(outer_start, end)) {
+		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
+			outer_start, end);
+		ret = -EBUSY;
+		goto done;
+	}
+
+	/*
+	 * Reclaim enough pages to make sure that contiguous allocation
+	 * will not starve the system.
+	 */
+	__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
+
+	/* Grab isolated pages from freelists. */
+	outer_end = isolate_freepages_range(outer_start, end);
+	if (!outer_end) {
+		ret = -EBUSY;
+		goto done;
+	}
+
+	/* Free head and tail (if any) */
+	if (start != outer_start)
+		free_contig_range(outer_start, start - outer_start);
+	if (end != outer_end)
+		free_contig_range(end, outer_end - end);
+
+done:
+	undo_isolate_page_range(pfn_max_align_down(start),
+				pfn_max_align_up(end), migratetype);
+	return ret;
+}
+
+void free_contig_range(unsigned long pfn, unsigned nr_pages)
+{
+	for (; nr_pages--; ++pfn)
+		__free_page(pfn_to_page(pfn));
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be isolated before calling this.
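alloc_contig_range() isolates a range widened to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages so the buddy allocator cannot merge an isolated block with a neighbouring one. Below is a userspace copy of the two alignment helpers introduced above, with made-up granularities so it can run on its own; per the kernel-doc comment, a CMA-style caller passes the unaligned [start, end) and later releases it with free_contig_range(start, end - start).

#include <stdio.h>

#define MAX_ORDER_NR_PAGES   1024UL	/* stand-in values, not the kernel's */
#define PAGEBLOCK_NR_PAGES    512UL
#define MAX_ALIGN (MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ? \
		   MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES)

/* Round a PFN down to the larger of the two granularities. */
static unsigned long pfn_max_align_down(unsigned long pfn)
{
	return pfn & ~(MAX_ALIGN - 1);
}

/* Round a PFN up to the larger of the two granularities. */
static unsigned long pfn_max_align_up(unsigned long pfn)
{
	return (pfn + MAX_ALIGN - 1) & ~(MAX_ALIGN - 1);
}

int main(void)
{
	unsigned long start = 0x12345, end = 0x12400;

	printf("isolate [%#lx, %#lx) to cover [%#lx, %#lx)\n",
	       pfn_max_align_down(start), pfn_max_align_up(end), start, end);
	return 0;
}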
@@ -5618,7 +5937,7 @@ bool is_free_buddy_page(struct page *page)
 }
 #endif
 
-static struct trace_print_flags pageflag_names[] = {
+static const struct trace_print_flags pageflag_names[] = {
 	{1UL << PG_locked,	"locked"	},
 	{1UL << PG_error,	"error"		},
 	{1UL << PG_referenced,	"referenced"	},
@@ -5653,7 +5972,9 @@ static struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_MEMORY_FAILURE
 	{1UL << PG_hwpoison,	"hwpoison"	},
 #endif
-	{-1UL,			NULL		},
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	{1UL << PG_compound_lock,	"compound_lock"	},
+#endif
 };
 
 static void dump_page_flags(unsigned long flags)
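Dropping the {-1UL, NULL} terminator works because the loop in the next hunk bounds itself with ARRAY_SIZE(), and the new BUILD_BUG_ON() keeps the table in step with the flag count. A self-contained sketch of that pattern with a three-entry table (names and macros below are simplified stand-ins):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

struct flag_name { unsigned long mask; const char *name; };

enum { F_LOCKED, F_ERROR, F_REFERENCED, NR_FLAGS };

static const struct flag_name flag_names[] = {
	{ 1UL << F_LOCKED,	"locked"	},
	{ 1UL << F_ERROR,	"error"		},
	{ 1UL << F_REFERENCED,	"referenced"	},
};

static void dump_flags(unsigned long flags)
{
	/* fails to compile if the table and the enum drift apart */
	BUILD_BUG_ON(ARRAY_SIZE(flag_names) != NR_FLAGS);

	for (unsigned i = 0; i < ARRAY_SIZE(flag_names) && flags; i++) {
		unsigned long mask = flag_names[i].mask;
		if ((flags & mask) != mask)
			continue;
		flags &= ~mask;
		printf("%s ", flag_names[i].name);
	}
	printf("\n");
}

int main(void)
{
	dump_flags((1UL << F_LOCKED) | (1UL << F_REFERENCED));
	return 0;
}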
@@ -5662,12 +5983,14 @@ static void dump_page_flags(unsigned long flags)
 	unsigned long mask;
 	int i;
 
+	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
+
 	printk(KERN_ALERT "page flags: %#lx(", flags);
 
 	/* remove zone id */
 	flags &= (1UL << NR_PAGEFLAGS) - 1;
 
-	for (i = 0; pageflag_names[i].name && flags; i++) {
+	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
 
 		mask = pageflag_names[i].mask;
 		if ((flags & mask) != mask)