Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 497 |
1 file changed, 410 insertions, 87 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a712fb9e04ce..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
58 | #include <linux/memcontrol.h> | 58 | #include <linux/memcontrol.h> |
59 | #include <linux/prefetch.h> | 59 | #include <linux/prefetch.h> |
60 | #include <linux/migrate.h> | ||
60 | #include <linux/page-debug-flags.h> | 61 | #include <linux/page-debug-flags.h> |
61 | 62 | ||
62 | #include <asm/tlbflush.h> | 63 | #include <asm/tlbflush.h> |
@@ -513,10 +514,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
513 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's | 514 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
514 | * order is recorded in page_private(page) field. | 515 | * order is recorded in page_private(page) field. |
515 | * So when we are allocating or freeing one, we can derive the state of the | 516 | * So when we are allocating or freeing one, we can derive the state of the |
516 | * other. That is, if we allocate a small block, and both were | 517 | * other. That is, if we allocate a small block, and both were |
517 | * free, the remainder of the region must be split into blocks. | 518 | * free, the remainder of the region must be split into blocks. |
518 | * If a block is freed, and its buddy is also free, then this | 519 | * If a block is freed, and its buddy is also free, then this |
519 | * triggers coalescing into a block of larger size. | 520 | * triggers coalescing into a block of larger size. |
520 | * | 521 | * |
521 | * -- wli | 522 | * -- wli |
522 | */ | 523 | */ |
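The pairing described in the comment above boils down to one bit of arithmetic: a free block of size 1 << order at page index page_idx has its buddy at the index with bit 'order' flipped, and the coalesced block starts at the lower of the two indices. A minimal sketch, with helper names that are illustrative rather than the file's exact internals:

	/* Illustrative helpers for the buddy relation described above. */
	static inline unsigned long buddy_idx_of(unsigned long page_idx,
						 unsigned int order)
	{
		return page_idx ^ (1UL << order);	/* flip the 'order' bit */
	}

	static inline unsigned long merged_idx_of(unsigned long page_idx,
						  unsigned int order)
	{
		return page_idx & ~(1UL << order);	/* start of the coalesced block */
	}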
@@ -749,6 +750,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
749 | __free_pages(page, order); | 750 | __free_pages(page, order); |
750 | } | 751 | } |
751 | 752 | ||
753 | #ifdef CONFIG_CMA | ||
754 | /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ | ||
755 | void __init init_cma_reserved_pageblock(struct page *page) | ||
756 | { | ||
757 | unsigned i = pageblock_nr_pages; | ||
758 | struct page *p = page; | ||
759 | |||
760 | do { | ||
761 | __ClearPageReserved(p); | ||
762 | set_page_count(p, 0); | ||
763 | } while (++p, --i); | ||
764 | |||
765 | set_page_refcounted(page); | ||
766 | set_pageblock_migratetype(page, MIGRATE_CMA); | ||
767 | __free_pages(page, pageblock_order); | ||
768 | totalram_pages += pageblock_nr_pages; | ||
769 | } | ||
770 | #endif | ||
752 | 771 | ||
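The new init_cma_reserved_pageblock() is meant to be called once per pageblock when a reserved CMA region is handed back to the page allocator at boot. A hedged sketch of such a caller, which lives in the CMA core rather than in this file and whose name here is purely illustrative:

	/* Illustrative only: give every pageblock of a reserved CMA area
	 * back to the page allocator as MIGRATE_CMA. */
	static void __init cma_activate_region(unsigned long base_pfn,
					       unsigned long count)
	{
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}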
753 | /* | 772 | /* |
754 | * The order of subdivision here is critical for the IO subsystem. | 773 | * The order of subdivision here is critical for the IO subsystem. |
@@ -874,11 +893,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
874 | * This array describes the order lists are fallen back to when | 893 | * This array describes the order lists are fallen back to when |
875 | * the free lists for the desirable migrate type are depleted | 894 | * the free lists for the desirable migrate type are depleted |
876 | */ | 895 | */ |
877 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | 896 | static int fallbacks[MIGRATE_TYPES][4] = { |
878 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 897 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
879 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 898 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
880 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 899 | #ifdef CONFIG_CMA |
881 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | 900 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
901 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | ||
902 | #else | ||
903 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
904 | #endif | ||
905 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | ||
906 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | ||
882 | }; | 907 | }; |
883 | 908 | ||
884 | /* | 909 | /* |
@@ -973,12 +998,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
973 | /* Find the largest possible block of pages in the other list */ | 998 | /* Find the largest possible block of pages in the other list */ |
974 | for (current_order = MAX_ORDER-1; current_order >= order; | 999 | for (current_order = MAX_ORDER-1; current_order >= order; |
975 | --current_order) { | 1000 | --current_order) { |
976 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | 1001 | for (i = 0;; i++) { |
977 | migratetype = fallbacks[start_migratetype][i]; | 1002 | migratetype = fallbacks[start_migratetype][i]; |
978 | 1003 | ||
979 | /* MIGRATE_RESERVE handled later if necessary */ | 1004 | /* MIGRATE_RESERVE handled later if necessary */ |
980 | if (migratetype == MIGRATE_RESERVE) | 1005 | if (migratetype == MIGRATE_RESERVE) |
981 | continue; | 1006 | break; |
982 | 1007 | ||
983 | area = &(zone->free_area[current_order]); | 1008 | area = &(zone->free_area[current_order]); |
984 | if (list_empty(&area->free_list[migratetype])) | 1009 | if (list_empty(&area->free_list[migratetype])) |
@@ -993,11 +1018,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
993 | * pages to the preferred allocation list. If falling | 1018 | * pages to the preferred allocation list. If falling |
994 | * back for a reclaimable kernel allocation, be more | 1019 | * back for a reclaimable kernel allocation, be more |
995 | * aggressive about taking ownership of free pages | 1020 | * aggressive about taking ownership of free pages |
1021 | * | ||
1022 | * On the other hand, never change migration | ||
1023 | * type of MIGRATE_CMA pageblocks nor move CMA | ||
1024 | * pages on different free lists. We don't | ||
1025 | * want unmovable pages to be allocated from | ||
1026 | * MIGRATE_CMA areas. | ||
996 | */ | 1027 | */ |
997 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 1028 | if (!is_migrate_cma(migratetype) && |
998 | start_migratetype == MIGRATE_RECLAIMABLE || | 1029 | (unlikely(current_order >= pageblock_order / 2) || |
999 | page_group_by_mobility_disabled) { | 1030 | start_migratetype == MIGRATE_RECLAIMABLE || |
1000 | unsigned long pages; | 1031 | page_group_by_mobility_disabled)) { |
1032 | int pages; | ||
1001 | pages = move_freepages_block(zone, page, | 1033 | pages = move_freepages_block(zone, page, |
1002 | start_migratetype); | 1034 | start_migratetype); |
1003 | 1035 | ||
@@ -1015,11 +1047,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1015 | rmv_page_order(page); | 1047 | rmv_page_order(page); |
1016 | 1048 | ||
1017 | /* Take ownership for orders >= pageblock_order */ | 1049 | /* Take ownership for orders >= pageblock_order */ |
1018 | if (current_order >= pageblock_order) | 1050 | if (current_order >= pageblock_order && |
1051 | !is_migrate_cma(migratetype)) | ||
1019 | change_pageblock_range(page, current_order, | 1052 | change_pageblock_range(page, current_order, |
1020 | start_migratetype); | 1053 | start_migratetype); |
1021 | 1054 | ||
1022 | expand(zone, page, order, current_order, area, migratetype); | 1055 | expand(zone, page, order, current_order, area, |
1056 | is_migrate_cma(migratetype) | ||
1057 | ? migratetype : start_migratetype); | ||
1023 | 1058 | ||
1024 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1059 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1025 | start_migratetype, migratetype); | 1060 | start_migratetype, migratetype); |
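The net effect of the guards added in this hunk is that a pageblock is only ever converted to the requesting migratetype when the fallback list is not MIGRATE_CMA. Condensed into a hedged helper that is not part of the patch itself:

	/* Illustrative condensation of the guard above: take ownership of
	 * the whole pageblock only for non-CMA fallbacks, so unmovable or
	 * reclaimable allocations can never convert a MIGRATE_CMA block. */
	static bool should_steal_pageblock(int fallback_mt, int start_migratetype,
					   unsigned int current_order)
	{
		if (is_migrate_cma(fallback_mt))
			return false;

		return current_order >= pageblock_order / 2 ||
		       start_migratetype == MIGRATE_RECLAIMABLE ||
		       page_group_by_mobility_disabled;
	}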
@@ -1061,17 +1096,17 @@ retry_reserve: | |||
1061 | return page; | 1096 | return page; |
1062 | } | 1097 | } |
1063 | 1098 | ||
1064 | /* | 1099 | /* |
1065 | * Obtain a specified number of elements from the buddy allocator, all under | 1100 | * Obtain a specified number of elements from the buddy allocator, all under |
1066 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 1101 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
1067 | * Returns the number of new pages which were placed at *list. | 1102 | * Returns the number of new pages which were placed at *list. |
1068 | */ | 1103 | */ |
1069 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1104 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1070 | unsigned long count, struct list_head *list, | 1105 | unsigned long count, struct list_head *list, |
1071 | int migratetype, int cold) | 1106 | int migratetype, int cold) |
1072 | { | 1107 | { |
1073 | int i; | 1108 | int mt = migratetype, i; |
1074 | 1109 | ||
1075 | spin_lock(&zone->lock); | 1110 | spin_lock(&zone->lock); |
1076 | for (i = 0; i < count; ++i) { | 1111 | for (i = 0; i < count; ++i) { |
1077 | struct page *page = __rmqueue(zone, order, migratetype); | 1112 | struct page *page = __rmqueue(zone, order, migratetype); |
@@ -1091,7 +1126,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1091 | list_add(&page->lru, list); | 1126 | list_add(&page->lru, list); |
1092 | else | 1127 | else |
1093 | list_add_tail(&page->lru, list); | 1128 | list_add_tail(&page->lru, list); |
1094 | set_page_private(page, migratetype); | 1129 | if (IS_ENABLED(CONFIG_CMA)) { |
1130 | mt = get_pageblock_migratetype(page); | ||
1131 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | ||
1132 | mt = migratetype; | ||
1133 | } | ||
1134 | set_page_private(page, mt); | ||
1095 | list = &page->lru; | 1135 | list = &page->lru; |
1096 | } | 1136 | } |
1097 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1137 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
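What the CONFIG_CMA branch above decides is simply which migratetype gets remembered in page_private(), so that the freeing path can return the page to the correct free list. As an illustrative helper, not part of the patch:

	/* Illustrative: CMA and isolated pages must remember their real
	 * pageblock type; everything else is tagged with the requested
	 * migratetype, as before. */
	static int record_migratetype(struct page *page, int requested_mt)
	{
		int mt = get_pageblock_migratetype(page);

		if (IS_ENABLED(CONFIG_CMA) &&
		    (is_migrate_cma(mt) || mt == MIGRATE_ISOLATE))
			return mt;

		return requested_mt;
	}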
@@ -1371,8 +1411,12 @@ int split_free_page(struct page *page) | |||
1371 | 1411 | ||
1372 | if (order >= pageblock_order - 1) { | 1412 | if (order >= pageblock_order - 1) { |
1373 | struct page *endpage = page + (1 << order) - 1; | 1413 | struct page *endpage = page + (1 << order) - 1; |
1374 | for (; page < endpage; page += pageblock_nr_pages) | 1414 | for (; page < endpage; page += pageblock_nr_pages) { |
1375 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 1415 | int mt = get_pageblock_migratetype(page); |
1416 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | ||
1417 | set_pageblock_migratetype(page, | ||
1418 | MIGRATE_MOVABLE); | ||
1419 | } | ||
1376 | } | 1420 | } |
1377 | 1421 | ||
1378 | return 1 << order; | 1422 | return 1 << order; |
@@ -2086,16 +2130,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2086 | } | 2130 | } |
2087 | #endif /* CONFIG_COMPACTION */ | 2131 | #endif /* CONFIG_COMPACTION */ |
2088 | 2132 | ||
2089 | /* The really slow allocator path where we enter direct reclaim */ | 2133 | /* Perform direct synchronous page reclaim */ |
2090 | static inline struct page * | 2134 | static int |
2091 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2135 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, |
2092 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2136 | nodemask_t *nodemask) |
2093 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
2094 | int migratetype, unsigned long *did_some_progress) | ||
2095 | { | 2137 | { |
2096 | struct page *page = NULL; | ||
2097 | struct reclaim_state reclaim_state; | 2138 | struct reclaim_state reclaim_state; |
2098 | bool drained = false; | 2139 | int progress; |
2099 | 2140 | ||
2100 | cond_resched(); | 2141 | cond_resched(); |
2101 | 2142 | ||
@@ -2106,7 +2147,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2106 | reclaim_state.reclaimed_slab = 0; | 2147 | reclaim_state.reclaimed_slab = 0; |
2107 | current->reclaim_state = &reclaim_state; | 2148 | current->reclaim_state = &reclaim_state; |
2108 | 2149 | ||
2109 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2150 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
2110 | 2151 | ||
2111 | current->reclaim_state = NULL; | 2152 | current->reclaim_state = NULL; |
2112 | lockdep_clear_current_reclaim_state(); | 2153 | lockdep_clear_current_reclaim_state(); |
@@ -2114,6 +2155,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2114 | 2155 | ||
2115 | cond_resched(); | 2156 | cond_resched(); |
2116 | 2157 | ||
2158 | return progress; | ||
2159 | } | ||
2160 | |||
2161 | /* The really slow allocator path where we enter direct reclaim */ | ||
2162 | static inline struct page * | ||
2163 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
2164 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
2165 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
2166 | int migratetype, unsigned long *did_some_progress) | ||
2167 | { | ||
2168 | struct page *page = NULL; | ||
2169 | bool drained = false; | ||
2170 | |||
2171 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
2172 | nodemask); | ||
2117 | if (unlikely(!(*did_some_progress))) | 2173 | if (unlikely(!(*did_some_progress))) |
2118 | return NULL; | 2174 | return NULL; |
2119 | 2175 | ||
@@ -4244,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4244 | 4300 | ||
4245 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4246 | 4302 | ||
4247 | /* Return a sensible default order for the pageblock size. */ | ||
4248 | static inline int pageblock_default_order(void) | ||
4249 | { | ||
4250 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4251 | return HUGETLB_PAGE_ORDER; | ||
4252 | |||
4253 | return MAX_ORDER-1; | ||
4254 | } | ||
4255 | |||
4256 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4303 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4257 | static inline void __init set_pageblock_order(unsigned int order) | 4304 | static inline void __init set_pageblock_order(void) |
4258 | { | 4305 | { |
4306 | unsigned int order; | ||
4307 | |||
4259 | /* Check that pageblock_nr_pages has not already been setup */ | 4308 | /* Check that pageblock_nr_pages has not already been setup */ |
4260 | if (pageblock_order) | 4309 | if (pageblock_order) |
4261 | return; | 4310 | return; |
4262 | 4311 | ||
4312 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4313 | order = HUGETLB_PAGE_ORDER; | ||
4314 | else | ||
4315 | order = MAX_ORDER - 1; | ||
4316 | |||
4263 | /* | 4317 | /* |
4264 | * Assume the largest contiguous order of interest is a huge page. | 4318 | * Assume the largest contiguous order of interest is a huge page. |
4265 | * This value may be variable depending on boot parameters on IA64 | 4319 | * This value may be variable depending on boot parameters on IA64 and |
4320 | * powerpc. | ||
4266 | */ | 4321 | */ |
4267 | pageblock_order = order; | 4322 | pageblock_order = order; |
4268 | } | 4323 | } |
@@ -4270,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order) | |||
4270 | 4325 | ||
4271 | /* | 4326 | /* |
4272 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() | 4327 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() |
4273 | * and pageblock_default_order() are unused as pageblock_order is set | 4328 | * is unused as pageblock_order is set at compile-time. See |
4274 | * at compile-time. See include/linux/pageblock-flags.h for the values of | 4329 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4275 | * pageblock_order based on the kernel config | 4330 | * the kernel config |
4276 | */ | 4331 | */ |
4277 | static inline int pageblock_default_order(unsigned int order) | 4332 | static inline void set_pageblock_order(void) |
4278 | { | 4333 | { |
4279 | return MAX_ORDER-1; | ||
4280 | } | 4334 | } |
4281 | #define set_pageblock_order(x) do {} while (0) | ||
4282 | 4335 | ||
4283 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4336 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4284 | 4337 | ||
@@ -4301,11 +4354,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4301 | init_waitqueue_head(&pgdat->kswapd_wait); | 4354 | init_waitqueue_head(&pgdat->kswapd_wait); |
4302 | pgdat->kswapd_max_order = 0; | 4355 | pgdat->kswapd_max_order = 0; |
4303 | pgdat_page_cgroup_init(pgdat); | 4356 | pgdat_page_cgroup_init(pgdat); |
4304 | 4357 | ||
4305 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4358 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4306 | struct zone *zone = pgdat->node_zones + j; | 4359 | struct zone *zone = pgdat->node_zones + j; |
4307 | unsigned long size, realsize, memmap_pages; | 4360 | unsigned long size, realsize, memmap_pages; |
4308 | enum lru_list lru; | ||
4309 | 4361 | ||
4310 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4362 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4311 | realsize = size - zone_absent_pages_in_node(nid, j, | 4363 | realsize = size - zone_absent_pages_in_node(nid, j, |
@@ -4355,18 +4407,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4355 | zone->zone_pgdat = pgdat; | 4407 | zone->zone_pgdat = pgdat; |
4356 | 4408 | ||
4357 | zone_pcp_init(zone); | 4409 | zone_pcp_init(zone); |
4358 | for_each_lru(lru) | 4410 | lruvec_init(&zone->lruvec, zone); |
4359 | INIT_LIST_HEAD(&zone->lruvec.lists[lru]); | ||
4360 | zone->reclaim_stat.recent_rotated[0] = 0; | ||
4361 | zone->reclaim_stat.recent_rotated[1] = 0; | ||
4362 | zone->reclaim_stat.recent_scanned[0] = 0; | ||
4363 | zone->reclaim_stat.recent_scanned[1] = 0; | ||
4364 | zap_zone_vm_stats(zone); | 4411 | zap_zone_vm_stats(zone); |
4365 | zone->flags = 0; | 4412 | zone->flags = 0; |
4366 | if (!size) | 4413 | if (!size) |
4367 | continue; | 4414 | continue; |
4368 | 4415 | ||
4369 | set_pageblock_order(pageblock_default_order()); | 4416 | set_pageblock_order(); |
4370 | setup_usemap(pgdat, zone, size); | 4417 | setup_usemap(pgdat, zone, size); |
4371 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 4418 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
4372 | size, MEMMAP_EARLY); | 4419 | size, MEMMAP_EARLY); |
@@ -4759,31 +4806,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4759 | find_zone_movable_pfns_for_nodes(); | 4806 | find_zone_movable_pfns_for_nodes(); |
4760 | 4807 | ||
4761 | /* Print out the zone ranges */ | 4808 | /* Print out the zone ranges */ |
4762 | printk("Zone PFN ranges:\n"); | 4809 | printk("Zone ranges:\n"); |
4763 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4810 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4764 | if (i == ZONE_MOVABLE) | 4811 | if (i == ZONE_MOVABLE) |
4765 | continue; | 4812 | continue; |
4766 | printk(" %-8s ", zone_names[i]); | 4813 | printk(KERN_CONT " %-8s ", zone_names[i]); |
4767 | if (arch_zone_lowest_possible_pfn[i] == | 4814 | if (arch_zone_lowest_possible_pfn[i] == |
4768 | arch_zone_highest_possible_pfn[i]) | 4815 | arch_zone_highest_possible_pfn[i]) |
4769 | printk("empty\n"); | 4816 | printk(KERN_CONT "empty\n"); |
4770 | else | 4817 | else |
4771 | printk("%0#10lx -> %0#10lx\n", | 4818 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", |
4772 | arch_zone_lowest_possible_pfn[i], | 4819 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
4773 | arch_zone_highest_possible_pfn[i]); | 4820 | (arch_zone_highest_possible_pfn[i] |
4821 | << PAGE_SHIFT) - 1); | ||
4774 | } | 4822 | } |
4775 | 4823 | ||
4776 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 4824 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
4777 | printk("Movable zone start PFN for each node\n"); | 4825 | printk("Movable zone start for each node\n"); |
4778 | for (i = 0; i < MAX_NUMNODES; i++) { | 4826 | for (i = 0; i < MAX_NUMNODES; i++) { |
4779 | if (zone_movable_pfn[i]) | 4827 | if (zone_movable_pfn[i]) |
4780 | printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); | 4828 | printk(" Node %d: %#010lx\n", i, |
4829 | zone_movable_pfn[i] << PAGE_SHIFT); | ||
4781 | } | 4830 | } |
4782 | 4831 | ||
4783 | /* Print out the early_node_map[] */ | 4832 | /* Print out the early_node_map[] */ |
4784 | printk("Early memory PFN ranges\n"); | 4833 | printk("Early memory node ranges\n"); |
4785 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4834 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4786 | printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); | 4835 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
4836 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | ||
4787 | 4837 | ||
4788 | /* Initialise every node */ | 4838 | /* Initialise every node */ |
4789 | mminit_verify_pageflags_layout(); | 4839 | mminit_verify_pageflags_layout(); |
@@ -4976,14 +5026,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
4976 | calculate_totalreserve_pages(); | 5026 | calculate_totalreserve_pages(); |
4977 | } | 5027 | } |
4978 | 5028 | ||
4979 | /** | 5029 | static void __setup_per_zone_wmarks(void) |
4980 | * setup_per_zone_wmarks - called when min_free_kbytes changes | ||
4981 | * or when memory is hot-{added|removed} | ||
4982 | * | ||
4983 | * Ensures that the watermark[min,low,high] values for each zone are set | ||
4984 | * correctly with respect to min_free_kbytes. | ||
4985 | */ | ||
4986 | void setup_per_zone_wmarks(void) | ||
4987 | { | 5030 | { |
4988 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 5031 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
4989 | unsigned long lowmem_pages = 0; | 5032 | unsigned long lowmem_pages = 0; |
@@ -5030,6 +5073,11 @@ void setup_per_zone_wmarks(void) | |||
5030 | 5073 | ||
5031 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5074 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5032 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5075 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5076 | |||
5077 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | ||
5078 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | ||
5079 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | ||
5080 | |||
5033 | setup_zone_migrate_reserve(zone); | 5081 | setup_zone_migrate_reserve(zone); |
5034 | spin_unlock_irqrestore(&zone->lock, flags); | 5082 | spin_unlock_irqrestore(&zone->lock, flags); |
5035 | } | 5083 | } |
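To make the added adjustment concrete with illustrative numbers, and assuming a non-highmem zone whose WMARK_MIN was set to its share 'tmp' of min_free_kbytes earlier in this function: with tmp = 4096 pages and cma_wmark_pages(zone) = 1024, the code above yields WMARK_MIN = 4096 + 1024 = 5120, WMARK_LOW = 4096 + 1024 + 1024 = 6144 and WMARK_HIGH = 4096 + 2048 + 1024 = 7168 pages. In other words, every watermark is raised by exactly the zone's reserved-CMA page count.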
@@ -5038,6 +5086,20 @@ void setup_per_zone_wmarks(void) | |||
5038 | calculate_totalreserve_pages(); | 5086 | calculate_totalreserve_pages(); |
5039 | } | 5087 | } |
5040 | 5088 | ||
5089 | /** | ||
5090 | * setup_per_zone_wmarks - called when min_free_kbytes changes | ||
5091 | * or when memory is hot-{added|removed} | ||
5092 | * | ||
5093 | * Ensures that the watermark[min,low,high] values for each zone are set | ||
5094 | * correctly with respect to min_free_kbytes. | ||
5095 | */ | ||
5096 | void setup_per_zone_wmarks(void) | ||
5097 | { | ||
5098 | mutex_lock(&zonelists_mutex); | ||
5099 | __setup_per_zone_wmarks(); | ||
5100 | mutex_unlock(&zonelists_mutex); | ||
5101 | } | ||
5102 | |||
5041 | /* | 5103 | /* |
5042 | * The inactive anon list should be small enough that the VM never has to | 5104 | * The inactive anon list should be small enough that the VM never has to |
5043 | * do too much work, but large enough that each inactive page has a chance | 5105 | * do too much work, but large enough that each inactive page has a chance |
@@ -5203,7 +5265,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
5203 | int ret; | 5265 | int ret; |
5204 | 5266 | ||
5205 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5267 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5206 | if (!write || (ret == -EINVAL)) | 5268 | if (!write || (ret < 0)) |
5207 | return ret; | 5269 | return ret; |
5208 | for_each_populated_zone(zone) { | 5270 | for_each_populated_zone(zone) { |
5209 | for_each_possible_cpu(cpu) { | 5271 | for_each_possible_cpu(cpu) { |
@@ -5242,9 +5304,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5242 | int flags, | 5304 | int flags, |
5243 | unsigned int *_hash_shift, | 5305 | unsigned int *_hash_shift, |
5244 | unsigned int *_hash_mask, | 5306 | unsigned int *_hash_mask, |
5245 | unsigned long limit) | 5307 | unsigned long low_limit, |
5308 | unsigned long high_limit) | ||
5246 | { | 5309 | { |
5247 | unsigned long long max = limit; | 5310 | unsigned long long max = high_limit; |
5248 | unsigned long log2qty, size; | 5311 | unsigned long log2qty, size; |
5249 | void *table = NULL; | 5312 | void *table = NULL; |
5250 | 5313 | ||
@@ -5282,6 +5345,8 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5282 | } | 5345 | } |
5283 | max = min(max, 0x80000000ULL); | 5346 | max = min(max, 0x80000000ULL); |
5284 | 5347 | ||
5348 | if (numentries < low_limit) | ||
5349 | numentries = low_limit; | ||
5285 | if (numentries > max) | 5350 | if (numentries > max) |
5286 | numentries = max; | 5351 | numentries = max; |
5287 | 5352 | ||
@@ -5412,14 +5477,16 @@ static int | |||
5412 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | 5477 | __count_immobile_pages(struct zone *zone, struct page *page, int count) |
5413 | { | 5478 | { |
5414 | unsigned long pfn, iter, found; | 5479 | unsigned long pfn, iter, found; |
5480 | int mt; | ||
5481 | |||
5415 | /* | 5482 | /* |
5416 | * For avoiding noise data, lru_add_drain_all() should be called | 5483 | * For avoiding noise data, lru_add_drain_all() should be called |
5417 | * If ZONE_MOVABLE, the zone never contains immobile pages | 5484 | * If ZONE_MOVABLE, the zone never contains immobile pages |
5418 | */ | 5485 | */ |
5419 | if (zone_idx(zone) == ZONE_MOVABLE) | 5486 | if (zone_idx(zone) == ZONE_MOVABLE) |
5420 | return true; | 5487 | return true; |
5421 | 5488 | mt = get_pageblock_migratetype(page); | |
5422 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) | 5489 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5423 | return true; | 5490 | return true; |
5424 | 5491 | ||
5425 | pfn = page_to_pfn(page); | 5492 | pfn = page_to_pfn(page); |
@@ -5536,7 +5603,7 @@ out: | |||
5536 | return ret; | 5603 | return ret; |
5537 | } | 5604 | } |
5538 | 5605 | ||
5539 | void unset_migratetype_isolate(struct page *page) | 5606 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
5540 | { | 5607 | { |
5541 | struct zone *zone; | 5608 | struct zone *zone; |
5542 | unsigned long flags; | 5609 | unsigned long flags; |
@@ -5544,12 +5611,264 @@ void unset_migratetype_isolate(struct page *page) | |||
5544 | spin_lock_irqsave(&zone->lock, flags); | 5611 | spin_lock_irqsave(&zone->lock, flags); |
5545 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 5612 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
5546 | goto out; | 5613 | goto out; |
5547 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 5614 | set_pageblock_migratetype(page, migratetype); |
5548 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | 5615 | move_freepages_block(zone, page, migratetype); |
5549 | out: | 5616 | out: |
5550 | spin_unlock_irqrestore(&zone->lock, flags); | 5617 | spin_unlock_irqrestore(&zone->lock, flags); |
5551 | } | 5618 | } |
5552 | 5619 | ||
5620 | #ifdef CONFIG_CMA | ||
5621 | |||
5622 | static unsigned long pfn_max_align_down(unsigned long pfn) | ||
5623 | { | ||
5624 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, | ||
5625 | pageblock_nr_pages) - 1); | ||
5626 | } | ||
5627 | |||
5628 | static unsigned long pfn_max_align_up(unsigned long pfn) | ||
5629 | { | ||
5630 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, | ||
5631 | pageblock_nr_pages)); | ||
5632 | } | ||
5633 | |||
5634 | static struct page * | ||
5635 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | ||
5636 | int **resultp) | ||
5637 | { | ||
5638 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
5639 | |||
5640 | if (PageHighMem(page)) | ||
5641 | gfp_mask |= __GFP_HIGHMEM; | ||
5642 | |||
5643 | return alloc_page(gfp_mask); | ||
5644 | } | ||
5645 | |||
5646 | /* [start, end) must belong to a single zone. */ | ||
5647 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | ||
5648 | { | ||
5649 | /* This function is based on compact_zone() from compaction.c. */ | ||
5650 | |||
5651 | unsigned long pfn = start; | ||
5652 | unsigned int tries = 0; | ||
5653 | int ret = 0; | ||
5654 | |||
5655 | struct compact_control cc = { | ||
5656 | .nr_migratepages = 0, | ||
5657 | .order = -1, | ||
5658 | .zone = page_zone(pfn_to_page(start)), | ||
5659 | .sync = true, | ||
5660 | }; | ||
5661 | INIT_LIST_HEAD(&cc.migratepages); | ||
5662 | |||
5663 | migrate_prep_local(); | ||
5664 | |||
5665 | while (pfn < end || !list_empty(&cc.migratepages)) { | ||
5666 | if (fatal_signal_pending(current)) { | ||
5667 | ret = -EINTR; | ||
5668 | break; | ||
5669 | } | ||
5670 | |||
5671 | if (list_empty(&cc.migratepages)) { | ||
5672 | cc.nr_migratepages = 0; | ||
5673 | pfn = isolate_migratepages_range(cc.zone, &cc, | ||
5674 | pfn, end); | ||
5675 | if (!pfn) { | ||
5676 | ret = -EINTR; | ||
5677 | break; | ||
5678 | } | ||
5679 | tries = 0; | ||
5680 | } else if (++tries == 5) { | ||
5681 | ret = ret < 0 ? ret : -EBUSY; | ||
5682 | break; | ||
5683 | } | ||
5684 | |||
5685 | ret = migrate_pages(&cc.migratepages, | ||
5686 | __alloc_contig_migrate_alloc, | ||
5687 | 0, false, MIGRATE_SYNC); | ||
5688 | } | ||
5689 | |||
5690 | putback_lru_pages(&cc.migratepages); | ||
5691 | return ret > 0 ? 0 : ret; | ||
5692 | } | ||
5693 | |||
5694 | /* | ||
5695 | * Update zone's cma pages counter used for watermark level calculation. | ||
5696 | */ | ||
5697 | static inline void __update_cma_watermarks(struct zone *zone, int count) | ||
5698 | { | ||
5699 | unsigned long flags; | ||
5700 | spin_lock_irqsave(&zone->lock, flags); | ||
5701 | zone->min_cma_pages += count; | ||
5702 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5703 | setup_per_zone_wmarks(); | ||
5704 | } | ||
5705 | |||
5706 | /* | ||
5707 | * Trigger memory pressure bump to reclaim some pages in order to be able to | ||
5708 | * allocate 'count' pages in single page units. Does similar work as | ||
5709 | *__alloc_pages_slowpath() function. | ||
5710 | */ | ||
5711 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | ||
5712 | { | ||
5713 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
5714 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | ||
5715 | int did_some_progress = 0; | ||
5716 | int order = 1; | ||
5717 | |||
5718 | /* | ||
5719 | * Increase level of watermarks to force kswapd do his job | ||
5720 | * to stabilise at new watermark level. | ||
5721 | */ | ||
5722 | __update_cma_watermarks(zone, count); | ||
5723 | |||
5724 | /* Obey watermarks as if the page was being allocated */ | ||
5725 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { | ||
5726 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | ||
5727 | |||
5728 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
5729 | NULL); | ||
5730 | if (!did_some_progress) { | ||
5731 | /* Exhausted what can be done so it's blamo time */ | ||
5732 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | ||
5733 | } | ||
5734 | } | ||
5735 | |||
5736 | /* Restore original watermark levels. */ | ||
5737 | __update_cma_watermarks(zone, -count); | ||
5738 | |||
5739 | return count; | ||
5740 | } | ||
5741 | |||
5742 | /** | ||
5743 | * alloc_contig_range() -- tries to allocate given range of pages | ||
5744 | * @start: start PFN to allocate | ||
5745 | * @end: one-past-the-last PFN to allocate | ||
5746 | * @migratetype: migratetype of the underlaying pageblocks (either | ||
5747 | * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks | ||
5748 | * in range must have the same migratetype and it must | ||
5749 | * be either of the two. | ||
5750 | * | ||
5751 | * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES | ||
5752 | * aligned, however it's the caller's responsibility to guarantee that | ||
5753 | * we are the only thread that changes migrate type of pageblocks the | ||
5754 | * pages fall in. | ||
5755 | * | ||
5756 | * The PFN range must belong to a single zone. | ||
5757 | * | ||
5758 | * Returns zero on success or negative error code. On success all | ||
5759 | * pages which PFN is in [start, end) are allocated for the caller and | ||
5760 | * need to be freed with free_contig_range(). | ||
5761 | */ | ||
5762 | int alloc_contig_range(unsigned long start, unsigned long end, | ||
5763 | unsigned migratetype) | ||
5764 | { | ||
5765 | struct zone *zone = page_zone(pfn_to_page(start)); | ||
5766 | unsigned long outer_start, outer_end; | ||
5767 | int ret = 0, order; | ||
5768 | |||
5769 | /* | ||
5770 | * What we do here is we mark all pageblocks in range as | ||
5771 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | ||
5772 | * have different sizes, and due to the way page allocator | ||
5773 | * work, we align the range to biggest of the two pages so | ||
5774 | * that page allocator won't try to merge buddies from | ||
5775 | * different pageblocks and change MIGRATE_ISOLATE to some | ||
5776 | * other migration type. | ||
5777 | * | ||
5778 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we | ||
5779 | * migrate the pages from an unaligned range (ie. pages that | ||
5780 | * we are interested in). This will put all the pages in | ||
5781 | * range back to page allocator as MIGRATE_ISOLATE. | ||
5782 | * | ||
5783 | * When this is done, we take the pages in range from page | ||
5784 | * allocator removing them from the buddy system. This way | ||
5785 | * page allocator will never consider using them. | ||
5786 | * | ||
5787 | * This lets us mark the pageblocks back as | ||
5788 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the | ||
5789 | * aligned range but not in the unaligned, original range are | ||
5790 | * put back to page allocator so that buddy can use them. | ||
5791 | */ | ||
5792 | |||
5793 | ret = start_isolate_page_range(pfn_max_align_down(start), | ||
5794 | pfn_max_align_up(end), migratetype); | ||
5795 | if (ret) | ||
5796 | goto done; | ||
5797 | |||
5798 | ret = __alloc_contig_migrate_range(start, end); | ||
5799 | if (ret) | ||
5800 | goto done; | ||
5801 | |||
5802 | /* | ||
5803 | * Pages from [start, end) are within a MAX_ORDER_NR_PAGES | ||
5804 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's | ||
5805 | * more, all pages in [start, end) are free in page allocator. | ||
5806 | * What we are going to do is to allocate all pages from | ||
5807 | * [start, end) (that is remove them from page allocator). | ||
5808 | * | ||
5809 | * The only problem is that pages at the beginning and at the | ||
5810 | * end of interesting range may be not aligned with pages that | ||
5811 | * page allocator holds, ie. they can be part of higher order | ||
5812 | * pages. Because of this, we reserve the bigger range and | ||
5813 | * once this is done free the pages we are not interested in. | ||
5814 | * | ||
5815 | * We don't have to hold zone->lock here because the pages are | ||
5816 | * isolated thus they won't get removed from buddy. | ||
5817 | */ | ||
5818 | |||
5819 | lru_add_drain_all(); | ||
5820 | drain_all_pages(); | ||
5821 | |||
5822 | order = 0; | ||
5823 | outer_start = start; | ||
5824 | while (!PageBuddy(pfn_to_page(outer_start))) { | ||
5825 | if (++order >= MAX_ORDER) { | ||
5826 | ret = -EBUSY; | ||
5827 | goto done; | ||
5828 | } | ||
5829 | outer_start &= ~0UL << order; | ||
5830 | } | ||
5831 | |||
5832 | /* Make sure the range is really isolated. */ | ||
5833 | if (test_pages_isolated(outer_start, end)) { | ||
5834 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", | ||
5835 | outer_start, end); | ||
5836 | ret = -EBUSY; | ||
5837 | goto done; | ||
5838 | } | ||
5839 | |||
5840 | /* | ||
5841 | * Reclaim enough pages to make sure that contiguous allocation | ||
5842 | * will not starve the system. | ||
5843 | */ | ||
5844 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | ||
5845 | |||
5846 | /* Grab isolated pages from freelists. */ | ||
5847 | outer_end = isolate_freepages_range(outer_start, end); | ||
5848 | if (!outer_end) { | ||
5849 | ret = -EBUSY; | ||
5850 | goto done; | ||
5851 | } | ||
5852 | |||
5853 | /* Free head and tail (if any) */ | ||
5854 | if (start != outer_start) | ||
5855 | free_contig_range(outer_start, start - outer_start); | ||
5856 | if (end != outer_end) | ||
5857 | free_contig_range(end, outer_end - end); | ||
5858 | |||
5859 | done: | ||
5860 | undo_isolate_page_range(pfn_max_align_down(start), | ||
5861 | pfn_max_align_up(end), migratetype); | ||
5862 | return ret; | ||
5863 | } | ||
5864 | |||
5865 | void free_contig_range(unsigned long pfn, unsigned nr_pages) | ||
5866 | { | ||
5867 | for (; nr_pages--; ++pfn) | ||
5868 | __free_page(pfn_to_page(pfn)); | ||
5869 | } | ||
5870 | #endif | ||
5871 | |||
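The alloc_contig_range()/free_contig_range() pair added above is the low-level interface a CMA-style allocator would sit on top of. A hedged usage sketch follows; the wrapper names and minimal error handling are illustrative and not part of this patch:

	/* Illustrative wrapper: carve out 'count' physically contiguous
	 * pages from a region whose pageblocks were reserved as
	 * MIGRATE_CMA at boot. */
	static struct page *grab_contig_pages(unsigned long base_pfn,
					      unsigned long count)
	{
		int ret;

		ret = alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA);
		if (ret)
			return NULL;		/* e.g. -EBUSY or -EINTR */

		return pfn_to_page(base_pfn);
	}

	static void put_contig_pages(unsigned long base_pfn, unsigned long count)
	{
		free_contig_range(base_pfn, count);
	}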
5553 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5872 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5554 | /* | 5873 | /* |
5555 | * All pages in the range must be isolated before calling this. | 5874 | * All pages in the range must be isolated before calling this. |
@@ -5618,7 +5937,7 @@ bool is_free_buddy_page(struct page *page) | |||
5618 | } | 5937 | } |
5619 | #endif | 5938 | #endif |
5620 | 5939 | ||
5621 | static struct trace_print_flags pageflag_names[] = { | 5940 | static const struct trace_print_flags pageflag_names[] = { |
5622 | {1UL << PG_locked, "locked" }, | 5941 | {1UL << PG_locked, "locked" }, |
5623 | {1UL << PG_error, "error" }, | 5942 | {1UL << PG_error, "error" }, |
5624 | {1UL << PG_referenced, "referenced" }, | 5943 | {1UL << PG_referenced, "referenced" }, |
@@ -5653,7 +5972,9 @@ static struct trace_print_flags pageflag_names[] = { | |||
5653 | #ifdef CONFIG_MEMORY_FAILURE | 5972 | #ifdef CONFIG_MEMORY_FAILURE |
5654 | {1UL << PG_hwpoison, "hwpoison" }, | 5973 | {1UL << PG_hwpoison, "hwpoison" }, |
5655 | #endif | 5974 | #endif |
5656 | {-1UL, NULL }, | 5975 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
5976 | {1UL << PG_compound_lock, "compound_lock" }, | ||
5977 | #endif | ||
5657 | }; | 5978 | }; |
5658 | 5979 | ||
5659 | static void dump_page_flags(unsigned long flags) | 5980 | static void dump_page_flags(unsigned long flags) |
@@ -5662,12 +5983,14 @@ static void dump_page_flags(unsigned long flags) | |||
5662 | unsigned long mask; | 5983 | unsigned long mask; |
5663 | int i; | 5984 | int i; |
5664 | 5985 | ||
5986 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
5987 | |||
5665 | printk(KERN_ALERT "page flags: %#lx(", flags); | 5988 | printk(KERN_ALERT "page flags: %#lx(", flags); |
5666 | 5989 | ||
5667 | /* remove zone id */ | 5990 | /* remove zone id */ |
5668 | flags &= (1UL << NR_PAGEFLAGS) - 1; | 5991 | flags &= (1UL << NR_PAGEFLAGS) - 1; |
5669 | 5992 | ||
5670 | for (i = 0; pageflag_names[i].name && flags; i++) { | 5993 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { |
5671 | 5994 | ||
5672 | mask = pageflag_names[i].mask; | 5995 | mask = pageflag_names[i].mask; |
5673 | if ((flags & mask) != mask) | 5996 | if ((flags & mask) != mask) |