Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  816
1 file changed, 518 insertions(+), 298 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..d052abbe3063 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -72,6 +73,7 @@ unsigned long totalram_pages __read_mostly;
72unsigned long totalreserve_pages __read_mostly; 73unsigned long totalreserve_pages __read_mostly;
73unsigned long highest_memmap_pfn __read_mostly; 74unsigned long highest_memmap_pfn __read_mostly;
74int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
75 77
76#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 78#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
77int pageblock_order __read_mostly; 79int pageblock_order __read_mostly;
@@ -161,17 +163,25 @@ static unsigned long __meminitdata dma_reserve;
161 163
162#if MAX_NUMNODES > 1 164#if MAX_NUMNODES > 1
163int nr_node_ids __read_mostly = MAX_NUMNODES; 165int nr_node_ids __read_mostly = MAX_NUMNODES;
166int nr_online_nodes __read_mostly = 1;
164EXPORT_SYMBOL(nr_node_ids); 167EXPORT_SYMBOL(nr_node_ids);
168EXPORT_SYMBOL(nr_online_nodes);
165#endif 169#endif
166 170
167int page_group_by_mobility_disabled __read_mostly; 171int page_group_by_mobility_disabled __read_mostly;
168 172
169static void set_pageblock_migratetype(struct page *page, int migratetype) 173static void set_pageblock_migratetype(struct page *page, int migratetype)
170{ 174{
175
176 if (unlikely(page_group_by_mobility_disabled))
177 migratetype = MIGRATE_UNMOVABLE;
178
171 set_pageblock_flags_group(page, (unsigned long)migratetype, 179 set_pageblock_flags_group(page, (unsigned long)migratetype,
172 PB_migrate, PB_migrate_end); 180 PB_migrate, PB_migrate_end);
173} 181}
174 182
183bool oom_killer_disabled __read_mostly;
184
175#ifdef CONFIG_DEBUG_VM 185#ifdef CONFIG_DEBUG_VM
176static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 186static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
177{ 187{
@@ -294,23 +304,6 @@ void prep_compound_page(struct page *page, unsigned long order)
294 } 304 }
295} 305}
296 306
297#ifdef CONFIG_HUGETLBFS
298void prep_compound_gigantic_page(struct page *page, unsigned long order)
299{
300 int i;
301 int nr_pages = 1 << order;
302 struct page *p = page + 1;
303
304 set_compound_page_dtor(page, free_compound_page);
305 set_compound_order(page, order);
306 __SetPageHead(page);
307 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
308 __SetPageTail(p);
309 p->first_page = page;
310 }
311}
312#endif
313
314static int destroy_compound_page(struct page *page, unsigned long order) 307static int destroy_compound_page(struct page *page, unsigned long order)
315{ 308{
316 int i; 309 int i;
@@ -417,7 +410,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
417 return 0; 410 return 0;
418 411
419 if (PageBuddy(buddy) && page_order(buddy) == order) { 412 if (PageBuddy(buddy) && page_order(buddy) == order) {
420 BUG_ON(page_count(buddy) != 0); 413 VM_BUG_ON(page_count(buddy) != 0);
421 return 1; 414 return 1;
422 } 415 }
423 return 0; 416 return 0;
@@ -448,22 +441,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
448 */ 441 */
449 442
450static inline void __free_one_page(struct page *page, 443static inline void __free_one_page(struct page *page,
451 struct zone *zone, unsigned int order) 444 struct zone *zone, unsigned int order,
445 int migratetype)
452{ 446{
453 unsigned long page_idx; 447 unsigned long page_idx;
454 int order_size = 1 << order;
455 int migratetype = get_pageblock_migratetype(page);
456 448
457 if (unlikely(PageCompound(page))) 449 if (unlikely(PageCompound(page)))
458 if (unlikely(destroy_compound_page(page, order))) 450 if (unlikely(destroy_compound_page(page, order)))
459 return; 451 return;
460 452
453 VM_BUG_ON(migratetype == -1);
454
461 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 455 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
462 456
463 VM_BUG_ON(page_idx & (order_size - 1)); 457 VM_BUG_ON(page_idx & ((1 << order) - 1));
464 VM_BUG_ON(bad_range(zone, page)); 458 VM_BUG_ON(bad_range(zone, page));
465 459
466 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
467 while (order < MAX_ORDER-1) { 460 while (order < MAX_ORDER-1) {
468 unsigned long combined_idx; 461 unsigned long combined_idx;
469 struct page *buddy; 462 struct page *buddy;
@@ -487,12 +480,26 @@ static inline void __free_one_page(struct page *page,
487 zone->free_area[order].nr_free++; 480 zone->free_area[order].nr_free++;
488} 481}
489 482
483#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
484/*
485 * free_page_mlock() -- clean up attempts to free an mlocked() page.
486 * Page should not be on lru, so no need to fix that up.
487 * free_pages_check() will verify...
488 */
489static inline void free_page_mlock(struct page *page)
490{
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
490static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
491{ 499{
492 free_page_mlock(page);
493 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
494 (page->mapping != NULL) | 501 (page->mapping != NULL) |
495 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
496 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
497 bad_page(page); 504 bad_page(page);
498 return 1; 505 return 1;
@@ -519,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
519 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
520 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
521 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
522 while (count--) { 531 while (count--) {
523 struct page *page; 532 struct page *page;
524 533
@@ -526,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
526 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
527 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
528 list_del(&page->lru); 537 list_del(&page->lru);
529 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
530 } 539 }
531 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
532} 541}
533 542
534static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
535{ 545{
536 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
537 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
538 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
539 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
540 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
541} 553}
542 554
@@ -545,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
545 unsigned long flags; 557 unsigned long flags;
546 int i; 558 int i;
547 int bad = 0; 559 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page);
561
562 kmemcheck_free_shadow(page, order);
548 563
549 for (i = 0 ; i < (1 << order) ; ++i) 564 for (i = 0 ; i < (1 << order) ; ++i)
550 bad += free_pages_check(page + i); 565 bad += free_pages_check(page + i);
@@ -560,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
560 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
561 576
562 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(wasMlocked))
579 free_page_mlock(page);
563 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
564 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
565 local_irq_restore(flags); 583 local_irq_restore(flags);
566} 584}
567 585
@@ -632,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
632{ 650{
633 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
634 (page->mapping != NULL) | 652 (page->mapping != NULL) |
635 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
636 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
637 bad_page(page); 655 bad_page(page);
638 return 1; 656 return 1;
@@ -657,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
657 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
658 * the smallest available page from the freelists 676 * the smallest available page from the freelists
659 */ 677 */
660static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
661 int migratetype) 680 int migratetype)
662{ 681{
663 unsigned int current_order; 682 unsigned int current_order;
@@ -675,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
675 list_del(&page->lru); 694 list_del(&page->lru);
676 rmv_page_order(page); 695 rmv_page_order(page);
677 area->nr_free--; 696 area->nr_free--;
678 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
679 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
680 return page; 698 return page;
681 } 699 }
@@ -766,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
766} 784}
767 785
768/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
769static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
770 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
771{ 789{
772 struct free_area * area; 790 struct free_area * area;
773 int current_order; 791 int current_order;
@@ -815,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
815 /* Remove the page from the freelists */ 833 /* Remove the page from the freelists */
816 list_del(&page->lru); 834 list_del(&page->lru);
817 rmv_page_order(page); 835 rmv_page_order(page);
818 __mod_zone_page_state(zone, NR_FREE_PAGES,
819 -(1UL << order));
820 836
821 if (current_order == pageblock_order) 837 if (current_order == pageblock_order)
822 set_pageblock_migratetype(page, 838 set_pageblock_migratetype(page,
@@ -827,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
827 } 843 }
828 } 844 }
829 845
830 /* Use MIGRATE_RESERVE rather than fail an allocation */ 846 return NULL;
831 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
832} 847}
833 848
834/* 849/*
@@ -840,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
840{ 855{
841 struct page *page; 856 struct page *page;
842 857
858retry_reserve:
843 page = __rmqueue_smallest(zone, order, migratetype); 859 page = __rmqueue_smallest(zone, order, migratetype);
844 860
845 if (unlikely(!page)) 861 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
846 page = __rmqueue_fallback(zone, order, migratetype); 862 page = __rmqueue_fallback(zone, order, migratetype);
847 863
864 /*
865 * Use MIGRATE_RESERVE rather than fail an allocation. goto
866 * is used because __rmqueue_smallest is an inline function
867 * and we want just one call site
868 */
869 if (!page) {
870 migratetype = MIGRATE_RESERVE;
871 goto retry_reserve;
872 }
873 }
874
848 return page; 875 return page;
849} 876}
850 877
@@ -855,7 +882,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
855 */ 882 */
856static int rmqueue_bulk(struct zone *zone, unsigned int order, 883static int rmqueue_bulk(struct zone *zone, unsigned int order,
857 unsigned long count, struct list_head *list, 884 unsigned long count, struct list_head *list,
858 int migratetype) 885 int migratetype, int cold)
859{ 886{
860 int i; 887 int i;
861 888
@@ -874,10 +901,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
874 * merge IO requests if the physical pages are ordered 901 * merge IO requests if the physical pages are ordered
875 * properly. 902 * properly.
876 */ 903 */
877 list_add(&page->lru, list); 904 if (likely(cold == 0))
905 list_add(&page->lru, list);
906 else
907 list_add_tail(&page->lru, list);
878 set_page_private(page, migratetype); 908 set_page_private(page, migratetype);
879 list = &page->lru; 909 list = &page->lru;
880 } 910 }
911 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
881 spin_unlock(&zone->lock); 912 spin_unlock(&zone->lock);
882 return i; 913 return i;
883} 914}
@@ -993,6 +1024,9 @@ static void free_hot_cold_page(struct page *page, int cold)
993 struct zone *zone = page_zone(page); 1024 struct zone *zone = page_zone(page);
994 struct per_cpu_pages *pcp; 1025 struct per_cpu_pages *pcp;
995 unsigned long flags; 1026 unsigned long flags;
1027 int wasMlocked = TestClearPageMlocked(page);
1028
1029 kmemcheck_free_shadow(page, 0);
996 1030
997 if (PageAnon(page)) 1031 if (PageAnon(page))
998 page->mapping = NULL; 1032 page->mapping = NULL;
@@ -1007,13 +1041,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1007 kernel_map_pages(page, 1, 0); 1041 kernel_map_pages(page, 1, 0);
1008 1042
1009 pcp = &zone_pcp(zone, get_cpu())->pcp; 1043 pcp = &zone_pcp(zone, get_cpu())->pcp;
1044 set_page_private(page, get_pageblock_migratetype(page));
1010 local_irq_save(flags); 1045 local_irq_save(flags);
1046 if (unlikely(wasMlocked))
1047 free_page_mlock(page);
1011 __count_vm_event(PGFREE); 1048 __count_vm_event(PGFREE);
1049
1012 if (cold) 1050 if (cold)
1013 list_add_tail(&page->lru, &pcp->list); 1051 list_add_tail(&page->lru, &pcp->list);
1014 else 1052 else
1015 list_add(&page->lru, &pcp->list); 1053 list_add(&page->lru, &pcp->list);
1016 set_page_private(page, get_pageblock_migratetype(page));
1017 pcp->count++; 1054 pcp->count++;
1018 if (pcp->count >= pcp->high) { 1055 if (pcp->count >= pcp->high) {
1019 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1056 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1047,6 +1084,16 @@ void split_page(struct page *page, unsigned int order)
1047 1084
1048 VM_BUG_ON(PageCompound(page)); 1085 VM_BUG_ON(PageCompound(page));
1049 VM_BUG_ON(!page_count(page)); 1086 VM_BUG_ON(!page_count(page));
1087
1088#ifdef CONFIG_KMEMCHECK
1089 /*
1090 * Split shadow pages too, because free(page[0]) would
1091 * otherwise free the whole shadow.
1092 */
1093 if (kmemcheck_page_is_tracked(page))
1094 split_page(virt_to_page(page[0].shadow), order);
1095#endif
1096
1050 for (i = 1; i < (1 << order); i++) 1097 for (i = 1; i < (1 << order); i++)
1051 set_page_refcounted(page + i); 1098 set_page_refcounted(page + i);
1052} 1099}
@@ -1056,14 +1103,15 @@ void split_page(struct page *page, unsigned int order)
1056 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1103 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1057 * or two. 1104 * or two.
1058 */ 1105 */
1059static struct page *buffered_rmqueue(struct zone *preferred_zone, 1106static inline
1060 struct zone *zone, int order, gfp_t gfp_flags) 1107struct page *buffered_rmqueue(struct zone *preferred_zone,
1108 struct zone *zone, int order, gfp_t gfp_flags,
1109 int migratetype)
1061{ 1110{
1062 unsigned long flags; 1111 unsigned long flags;
1063 struct page *page; 1112 struct page *page;
1064 int cold = !!(gfp_flags & __GFP_COLD); 1113 int cold = !!(gfp_flags & __GFP_COLD);
1065 int cpu; 1114 int cpu;
1066 int migratetype = allocflags_to_migratetype(gfp_flags);
1067 1115
1068again: 1116again:
1069 cpu = get_cpu(); 1117 cpu = get_cpu();
@@ -1074,7 +1122,8 @@ again:
1074 local_irq_save(flags); 1122 local_irq_save(flags);
1075 if (!pcp->count) { 1123 if (!pcp->count) {
1076 pcp->count = rmqueue_bulk(zone, 0, 1124 pcp->count = rmqueue_bulk(zone, 0,
1077 pcp->batch, &pcp->list, migratetype); 1125 pcp->batch, &pcp->list,
1126 migratetype, cold);
1078 if (unlikely(!pcp->count)) 1127 if (unlikely(!pcp->count))
1079 goto failed; 1128 goto failed;
1080 } 1129 }
@@ -1093,15 +1142,30 @@ again:
1093 /* Allocate more to the pcp list if necessary */ 1142 /* Allocate more to the pcp list if necessary */
1094 if (unlikely(&page->lru == &pcp->list)) { 1143 if (unlikely(&page->lru == &pcp->list)) {
1095 pcp->count += rmqueue_bulk(zone, 0, 1144 pcp->count += rmqueue_bulk(zone, 0,
1096 pcp->batch, &pcp->list, migratetype); 1145 pcp->batch, &pcp->list,
1146 migratetype, cold);
1097 page = list_entry(pcp->list.next, struct page, lru); 1147 page = list_entry(pcp->list.next, struct page, lru);
1098 } 1148 }
1099 1149
1100 list_del(&page->lru); 1150 list_del(&page->lru);
1101 pcp->count--; 1151 pcp->count--;
1102 } else { 1152 } else {
1153 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1154 /*
1155 * __GFP_NOFAIL is not to be used in new code.
1156 *
1157 * All __GFP_NOFAIL callers should be fixed so that they
1158 * properly detect and handle allocation failures.
1159 *
1160 * We most definitely don't want callers attempting to
1161 * allocate greater than order-1 page units with
1162 * __GFP_NOFAIL.
1163 */
1164 WARN_ON_ONCE(order > 1);
1165 }
1103 spin_lock_irqsave(&zone->lock, flags); 1166 spin_lock_irqsave(&zone->lock, flags);
1104 page = __rmqueue(zone, order, migratetype); 1167 page = __rmqueue(zone, order, migratetype);
1168 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1105 spin_unlock(&zone->lock); 1169 spin_unlock(&zone->lock);
1106 if (!page) 1170 if (!page)
1107 goto failed; 1171 goto failed;
@@ -1123,10 +1187,15 @@ failed:
1123 return NULL; 1187 return NULL;
1124} 1188}
1125 1189
1126#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1190/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1127#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1191#define ALLOC_WMARK_MIN WMARK_MIN
1128#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1192#define ALLOC_WMARK_LOW WMARK_LOW
1129#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1193#define ALLOC_WMARK_HIGH WMARK_HIGH
1194#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1195
1196/* Mask to get the watermark bits */
1197#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1198
1130#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1199#define ALLOC_HARDER 0x10 /* try to alloc harder */
1131#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1200#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1132#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1201#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -1384,23 +1453,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1384 */ 1453 */
1385static struct page * 1454static struct page *
1386get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1455get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1387 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1456 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1457 struct zone *preferred_zone, int migratetype)
1388{ 1458{
1389 struct zoneref *z; 1459 struct zoneref *z;
1390 struct page *page = NULL; 1460 struct page *page = NULL;
1391 int classzone_idx; 1461 int classzone_idx;
1392 struct zone *zone, *preferred_zone; 1462 struct zone *zone;
1393 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1463 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1394 int zlc_active = 0; /* set if using zonelist_cache */ 1464 int zlc_active = 0; /* set if using zonelist_cache */
1395 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1465 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1396 1466
1397 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1398 &preferred_zone);
1399 if (!preferred_zone)
1400 return NULL;
1401
1402 classzone_idx = zone_idx(preferred_zone); 1467 classzone_idx = zone_idx(preferred_zone);
1403
1404zonelist_scan: 1468zonelist_scan:
1405 /* 1469 /*
1406 * Scan zonelist, looking for a zone with enough free. 1470 * Scan zonelist, looking for a zone with enough free.
@@ -1415,31 +1479,49 @@ zonelist_scan:
1415 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1479 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1416 goto try_next_zone; 1480 goto try_next_zone;
1417 1481
1482 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1418 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1483 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1419 unsigned long mark; 1484 unsigned long mark;
1420 if (alloc_flags & ALLOC_WMARK_MIN) 1485 int ret;
1421 mark = zone->pages_min; 1486
1422 else if (alloc_flags & ALLOC_WMARK_LOW) 1487 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1423 mark = zone->pages_low; 1488 if (zone_watermark_ok(zone, order, mark,
1424 else 1489 classzone_idx, alloc_flags))
1425 mark = zone->pages_high; 1490 goto try_this_zone;
1426 if (!zone_watermark_ok(zone, order, mark, 1491
1427 classzone_idx, alloc_flags)) { 1492 if (zone_reclaim_mode == 0)
1428 if (!zone_reclaim_mode || 1493 goto this_zone_full;
1429 !zone_reclaim(zone, gfp_mask, order)) 1494
1495 ret = zone_reclaim(zone, gfp_mask, order);
1496 switch (ret) {
1497 case ZONE_RECLAIM_NOSCAN:
1498 /* did not scan */
1499 goto try_next_zone;
1500 case ZONE_RECLAIM_FULL:
1501 /* scanned but unreclaimable */
1502 goto this_zone_full;
1503 default:
1504 /* did we reclaim enough */
1505 if (!zone_watermark_ok(zone, order, mark,
1506 classzone_idx, alloc_flags))
1430 goto this_zone_full; 1507 goto this_zone_full;
1431 } 1508 }
1432 } 1509 }
1433 1510
1434 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1511try_this_zone:
1512 page = buffered_rmqueue(preferred_zone, zone, order,
1513 gfp_mask, migratetype);
1435 if (page) 1514 if (page)
1436 break; 1515 break;
1437this_zone_full: 1516this_zone_full:
1438 if (NUMA_BUILD) 1517 if (NUMA_BUILD)
1439 zlc_mark_zone_full(zonelist, z); 1518 zlc_mark_zone_full(zonelist, z);
1440try_next_zone: 1519try_next_zone:
1441 if (NUMA_BUILD && !did_zlc_setup) { 1520 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1442 /* we do zlc_setup after the first zone is tried */ 1521 /*
1522 * we do zlc_setup after the first zone is tried but only
1523 * if there are multiple nodes make it worthwhile
1524 */
1443 allowednodes = zlc_setup(zonelist, alloc_flags); 1525 allowednodes = zlc_setup(zonelist, alloc_flags);
1444 zlc_active = 1; 1526 zlc_active = 1;
1445 did_zlc_setup = 1; 1527 did_zlc_setup = 1;
@@ -1454,47 +1536,219 @@ try_next_zone:
1454 return page; 1536 return page;
1455} 1537}
1456 1538
1539static inline int
1540should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1541 unsigned long pages_reclaimed)
1542{
1543 /* Do not loop if specifically requested */
1544 if (gfp_mask & __GFP_NORETRY)
1545 return 0;
1546
1547 /*
1548 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1549 * means __GFP_NOFAIL, but that may not be true in other
1550 * implementations.
1551 */
1552 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1553 return 1;
1554
1555 /*
1556 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1557 * specified, then we retry until we no longer reclaim any pages
1558 * (above), or we've reclaimed an order of pages at least as
1559 * large as the allocation's order. In both cases, if the
1560 * allocation still fails, we stop retrying.
1561 */
1562 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1563 return 1;
1564
1565 /*
1566 * Don't let big-order allocations loop unless the caller
1567 * explicitly requests that.
1568 */
1569 if (gfp_mask & __GFP_NOFAIL)
1570 return 1;
1571
1572 return 0;
1573}
1574
1575static inline struct page *
1576__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1577 struct zonelist *zonelist, enum zone_type high_zoneidx,
1578 nodemask_t *nodemask, struct zone *preferred_zone,
1579 int migratetype)
1580{
1581 struct page *page;
1582
1583 /* Acquire the OOM killer lock for the zones in zonelist */
1584 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1585 schedule_timeout_uninterruptible(1);
1586 return NULL;
1587 }
1588
1589 /*
1590 * Go through the zonelist yet one more time, keep very high watermark
1591 * here, this is only to catch a parallel oom killing, we must fail if
1592 * we're still under heavy pressure.
1593 */
1594 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1595 order, zonelist, high_zoneidx,
1596 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1597 preferred_zone, migratetype);
1598 if (page)
1599 goto out;
1600
1601 /* The OOM killer will not help higher order allocs */
1602 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1603 goto out;
1604
1605 /* Exhausted what can be done so it's blamo time */
1606 out_of_memory(zonelist, gfp_mask, order);
1607
1608out:
1609 clear_zonelist_oom(zonelist, gfp_mask);
1610 return page;
1611}
1612
1613/* The really slow allocator path where we enter direct reclaim */
1614static inline struct page *
1615__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1616 struct zonelist *zonelist, enum zone_type high_zoneidx,
1617 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1618 int migratetype, unsigned long *did_some_progress)
1619{
1620 struct page *page = NULL;
1621 struct reclaim_state reclaim_state;
1622 struct task_struct *p = current;
1623
1624 cond_resched();
1625
1626 /* We now go into synchronous reclaim */
1627 cpuset_memory_pressure_bump();
1628
1629 /*
1630 * The task's cpuset might have expanded its set of allowable nodes
1631 */
1632 p->flags |= PF_MEMALLOC;
1633 lockdep_set_current_reclaim_state(gfp_mask);
1634 reclaim_state.reclaimed_slab = 0;
1635 p->reclaim_state = &reclaim_state;
1636
1637 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1638
1639 p->reclaim_state = NULL;
1640 lockdep_clear_current_reclaim_state();
1641 p->flags &= ~PF_MEMALLOC;
1642
1643 cond_resched();
1644
1645 if (order != 0)
1646 drain_all_pages();
1647
1648 if (likely(*did_some_progress))
1649 page = get_page_from_freelist(gfp_mask, nodemask, order,
1650 zonelist, high_zoneidx,
1651 alloc_flags, preferred_zone,
1652 migratetype);
1653 return page;
1654}
1655
1457/* 1656/*
1458 * This is the 'heart' of the zoned buddy allocator. 1657 * This is called in the allocator slow-path if the allocation request is of
1658 * sufficient urgency to ignore watermarks and take other desperate measures
1459 */ 1659 */
1460struct page * 1660static inline struct page *
1461__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1661__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1462 struct zonelist *zonelist, nodemask_t *nodemask) 1662 struct zonelist *zonelist, enum zone_type high_zoneidx,
1663 nodemask_t *nodemask, struct zone *preferred_zone,
1664 int migratetype)
1665{
1666 struct page *page;
1667
1668 do {
1669 page = get_page_from_freelist(gfp_mask, nodemask, order,
1670 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1671 preferred_zone, migratetype);
1672
1673 if (!page && gfp_mask & __GFP_NOFAIL)
1674 congestion_wait(BLK_RW_ASYNC, HZ/50);
1675 } while (!page && (gfp_mask & __GFP_NOFAIL));
1676
1677 return page;
1678}
1679
1680static inline
1681void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1682 enum zone_type high_zoneidx)
1463{ 1683{
1464 const gfp_t wait = gfp_mask & __GFP_WAIT;
1465 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1466 struct zoneref *z; 1684 struct zoneref *z;
1467 struct zone *zone; 1685 struct zone *zone;
1468 struct page *page;
1469 struct reclaim_state reclaim_state;
1470 struct task_struct *p = current;
1471 int do_retry;
1472 int alloc_flags;
1473 unsigned long did_some_progress;
1474 unsigned long pages_reclaimed = 0;
1475 1686
1476 lockdep_trace_alloc(gfp_mask); 1687 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1688 wakeup_kswapd(zone, order);
1689}
1477 1690
1478 might_sleep_if(wait); 1691static inline int
1692gfp_to_alloc_flags(gfp_t gfp_mask)
1693{
1694 struct task_struct *p = current;
1695 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1696 const gfp_t wait = gfp_mask & __GFP_WAIT;
1479 1697
1480 if (should_fail_alloc_page(gfp_mask, order)) 1698 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1481 return NULL; 1699 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1482 1700
1483restart: 1701 /*
1484 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1702 * The caller may dip into page reserves a bit more if the caller
1703 * cannot run direct reclaim, or if the caller has realtime scheduling
1704 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1705 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1706 */
1707 alloc_flags |= (gfp_mask & __GFP_HIGH);
1485 1708
1486 if (unlikely(!z->zone)) { 1709 if (!wait) {
1710 alloc_flags |= ALLOC_HARDER;
1487 /* 1711 /*
1488 * Happens if we have an empty zonelist as a result of 1712 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1489 * GFP_THISNODE being used on a memoryless node 1713 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1490 */ 1714 */
1491 return NULL; 1715 alloc_flags &= ~ALLOC_CPUSET;
1716 } else if (unlikely(rt_task(p)))
1717 alloc_flags |= ALLOC_HARDER;
1718
1719 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1720 if (!in_interrupt() &&
1721 ((p->flags & PF_MEMALLOC) ||
1722 unlikely(test_thread_flag(TIF_MEMDIE))))
1723 alloc_flags |= ALLOC_NO_WATERMARKS;
1492 } 1724 }
1493 1725
1494 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1726 return alloc_flags;
1495 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1727}
1496 if (page) 1728
1497 goto got_pg; 1729static inline struct page *
1730__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1731 struct zonelist *zonelist, enum zone_type high_zoneidx,
1732 nodemask_t *nodemask, struct zone *preferred_zone,
1733 int migratetype)
1734{
1735 const gfp_t wait = gfp_mask & __GFP_WAIT;
1736 struct page *page = NULL;
1737 int alloc_flags;
1738 unsigned long pages_reclaimed = 0;
1739 unsigned long did_some_progress;
1740 struct task_struct *p = current;
1741
1742 /*
1743 * In the slowpath, we sanity check order to avoid ever trying to
1744 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1745 * be using allocators in order of preference for an area that is
1746 * too large.
1747 */
1748 if (order >= MAX_ORDER) {
1749 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
1750 return NULL;
1751 }
1498 1752
1499 /* 1753 /*
1500 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1754 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1507,155 +1761,88 @@ restart:
1507 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1761 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1508 goto nopage; 1762 goto nopage;
1509 1763
1510 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1764 wake_all_kswapd(order, zonelist, high_zoneidx);
1511 wakeup_kswapd(zone, order);
1512 1765
1513 /* 1766 /*
1514 * OK, we're below the kswapd watermark and have kicked background 1767 * OK, we're below the kswapd watermark and have kicked background
1515 * reclaim. Now things get more complex, so set up alloc_flags according 1768 * reclaim. Now things get more complex, so set up alloc_flags according
1516 * to how we want to proceed. 1769 * to how we want to proceed.
1517 *
1518 * The caller may dip into page reserves a bit more if the caller
1519 * cannot run direct reclaim, or if the caller has realtime scheduling
1520 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1521 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1522 */ 1770 */
1523 alloc_flags = ALLOC_WMARK_MIN; 1771 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1524 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1525 alloc_flags |= ALLOC_HARDER;
1526 if (gfp_mask & __GFP_HIGH)
1527 alloc_flags |= ALLOC_HIGH;
1528 if (wait)
1529 alloc_flags |= ALLOC_CPUSET;
1530 1772
1531 /* 1773restart:
1532 * Go through the zonelist again. Let __GFP_HIGH and allocations 1774 /* This is the last chance, in general, before the goto nopage. */
1533 * coming from realtime tasks go deeper into reserves.
1534 *
1535 * This is the last chance, in general, before the goto nopage.
1536 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1537 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1538 */
1539 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1775 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1540 high_zoneidx, alloc_flags); 1776 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1777 preferred_zone, migratetype);
1541 if (page) 1778 if (page)
1542 goto got_pg; 1779 goto got_pg;
1543 1780
1544 /* This allocation should allow future memory freeing. */
1545
1546rebalance: 1781rebalance:
1547 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1782 /* Allocate without watermarks if the context allows */
1548 && !in_interrupt()) { 1783 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1549 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1784 page = __alloc_pages_high_priority(gfp_mask, order,
1550nofail_alloc: 1785 zonelist, high_zoneidx, nodemask,
1551 /* go through the zonelist yet again, ignoring mins */ 1786 preferred_zone, migratetype);
1552 page = get_page_from_freelist(gfp_mask, nodemask, order, 1787 if (page)
1553 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1788 goto got_pg;
1554 if (page)
1555 goto got_pg;
1556 if (gfp_mask & __GFP_NOFAIL) {
1557 congestion_wait(WRITE, HZ/50);
1558 goto nofail_alloc;
1559 }
1560 }
1561 goto nopage;
1562 } 1789 }
1563 1790
1564 /* Atomic allocations - we can't balance anything */ 1791 /* Atomic allocations - we can't balance anything */
1565 if (!wait) 1792 if (!wait)
1566 goto nopage; 1793 goto nopage;
1567 1794
1568 cond_resched(); 1795 /* Avoid recursion of direct reclaim */
1569 1796 if (p->flags & PF_MEMALLOC)
1570 /* We now go into synchronous reclaim */ 1797 goto nopage;
1571 cpuset_memory_pressure_bump();
1572 /*
1573 * The task's cpuset might have expanded its set of allowable nodes
1574 */
1575 cpuset_update_task_memory_state();
1576 p->flags |= PF_MEMALLOC;
1577
1578 lockdep_set_current_reclaim_state(gfp_mask);
1579 reclaim_state.reclaimed_slab = 0;
1580 p->reclaim_state = &reclaim_state;
1581 1798
1582 did_some_progress = try_to_free_pages(zonelist, order, 1799 /* Avoid allocations with no watermarks from looping endlessly */
1583 gfp_mask, nodemask); 1800 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1801 goto nopage;
1584 1802
1585 p->reclaim_state = NULL; 1803 /* Try direct reclaim and then allocating */
1586 lockdep_clear_current_reclaim_state(); 1804 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1587 p->flags &= ~PF_MEMALLOC; 1805 zonelist, high_zoneidx,
1806 nodemask,
1807 alloc_flags, preferred_zone,
1808 migratetype, &did_some_progress);
1809 if (page)
1810 goto got_pg;
1588 1811
1589 cond_resched(); 1812 /*
1813 * If we failed to make any progress reclaiming, then we are
1814 * running out of options and have to consider going OOM
1815 */
1816 if (!did_some_progress) {
1817 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1818 if (oom_killer_disabled)
1819 goto nopage;
1820 page = __alloc_pages_may_oom(gfp_mask, order,
1821 zonelist, high_zoneidx,
1822 nodemask, preferred_zone,
1823 migratetype);
1824 if (page)
1825 goto got_pg;
1590 1826
1591 if (order != 0) 1827 /*
1592 drain_all_pages(); 1828 * The OOM killer does not trigger for high-order
1829 * ~__GFP_NOFAIL allocations so if no progress is being
1830 * made, there are no other options and retrying is
1831 * unlikely to help.
1832 */
1833 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1834 !(gfp_mask & __GFP_NOFAIL))
1835 goto nopage;
1593 1836
1594 if (likely(did_some_progress)) {
1595 page = get_page_from_freelist(gfp_mask, nodemask, order,
1596 zonelist, high_zoneidx, alloc_flags);
1597 if (page)
1598 goto got_pg;
1599 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1600 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1601 schedule_timeout_uninterruptible(1);
1602 goto restart; 1837 goto restart;
1603 } 1838 }
1604
1605 /*
1606 * Go through the zonelist yet one more time, keep
1607 * very high watermark here, this is only to catch
1608 * a parallel oom killing, we must fail if we're still
1609 * under heavy pressure.
1610 */
1611 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1612 order, zonelist, high_zoneidx,
1613 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1614 if (page) {
1615 clear_zonelist_oom(zonelist, gfp_mask);
1616 goto got_pg;
1617 }
1618
1619 /* The OOM killer will not help higher order allocs so fail */
1620 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1621 clear_zonelist_oom(zonelist, gfp_mask);
1622 goto nopage;
1623 }
1624
1625 out_of_memory(zonelist, gfp_mask, order);
1626 clear_zonelist_oom(zonelist, gfp_mask);
1627 goto restart;
1628 } 1839 }
1629 1840
1630 /* 1841 /* Check if we should retry the allocation */
1631 * Don't let big-order allocations loop unless the caller explicitly
1632 * requests that. Wait for some write requests to complete then retry.
1633 *
1634 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1635 * means __GFP_NOFAIL, but that may not be true in other
1636 * implementations.
1637 *
1638 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1639 * specified, then we retry until we no longer reclaim any pages
1640 * (above), or we've reclaimed an order of pages at least as
1641 * large as the allocation's order. In both cases, if the
1642 * allocation still fails, we stop retrying.
1643 */
1644 pages_reclaimed += did_some_progress; 1842 pages_reclaimed += did_some_progress;
1645 do_retry = 0; 1843 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1646 if (!(gfp_mask & __GFP_NORETRY)) { 1844 /* Wait for some write requests to complete then retry */
1647 if (order <= PAGE_ALLOC_COSTLY_ORDER) { 1845 congestion_wait(BLK_RW_ASYNC, HZ/50);
1648 do_retry = 1;
1649 } else {
1650 if (gfp_mask & __GFP_REPEAT &&
1651 pages_reclaimed < (1 << order))
1652 do_retry = 1;
1653 }
1654 if (gfp_mask & __GFP_NOFAIL)
1655 do_retry = 1;
1656 }
1657 if (do_retry) {
1658 congestion_wait(WRITE, HZ/50);
1659 goto rebalance; 1846 goto rebalance;
1660 } 1847 }
1661 1848
@@ -1667,10 +1854,60 @@ nopage:
1667 dump_stack(); 1854 dump_stack();
1668 show_mem(); 1855 show_mem();
1669 } 1856 }
1857 return page;
1670got_pg: 1858got_pg:
1859 if (kmemcheck_enabled)
1860 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1671 return page; 1861 return page;
1862
1672} 1863}
1673EXPORT_SYMBOL(__alloc_pages_internal); 1864
1865/*
1866 * This is the 'heart' of the zoned buddy allocator.
1867 */
1868struct page *
1869__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1870 struct zonelist *zonelist, nodemask_t *nodemask)
1871{
1872 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1873 struct zone *preferred_zone;
1874 struct page *page;
1875 int migratetype = allocflags_to_migratetype(gfp_mask);
1876
1877 gfp_mask &= gfp_allowed_mask;
1878
1879 lockdep_trace_alloc(gfp_mask);
1880
1881 might_sleep_if(gfp_mask & __GFP_WAIT);
1882
1883 if (should_fail_alloc_page(gfp_mask, order))
1884 return NULL;
1885
1886 /*
1887 * Check the zones suitable for the gfp_mask contain at least one
1888 * valid zone. It's possible to have an empty zonelist as a result
1889 * of GFP_THISNODE and a memoryless node
1890 */
1891 if (unlikely(!zonelist->_zonerefs->zone))
1892 return NULL;
1893
1894 /* The preferred zone is used for statistics later */
1895 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1896 if (!preferred_zone)
1897 return NULL;
1898
1899 /* First allocation attempt */
1900 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1901 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1902 preferred_zone, migratetype);
1903 if (unlikely(!page))
1904 page = __alloc_pages_slowpath(gfp_mask, order,
1905 zonelist, high_zoneidx, nodemask,
1906 preferred_zone, migratetype);
1907
1908 return page;
1909}
1910EXPORT_SYMBOL(__alloc_pages_nodemask);
1674 1911
1675/* 1912/*
1676 * Common helper functions. 1913 * Common helper functions.
@@ -1757,7 +1994,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1757 unsigned long alloc_end = addr + (PAGE_SIZE << order); 1994 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1758 unsigned long used = addr + PAGE_ALIGN(size); 1995 unsigned long used = addr + PAGE_ALIGN(size);
1759 1996
1760 split_page(virt_to_page(addr), order); 1997 split_page(virt_to_page((void *)addr), order);
1761 while (used < alloc_end) { 1998 while (used < alloc_end) {
1762 free_page(used); 1999 free_page(used);
1763 used += PAGE_SIZE; 2000 used += PAGE_SIZE;
@@ -1799,7 +2036,7 @@ static unsigned int nr_free_zone_pages(int offset)
1799 2036
1800 for_each_zone_zonelist(zone, z, zonelist, offset) { 2037 for_each_zone_zonelist(zone, z, zonelist, offset) {
1801 unsigned long size = zone->present_pages; 2038 unsigned long size = zone->present_pages;
1802 unsigned long high = zone->pages_high; 2039 unsigned long high = high_wmark_pages(zone);
1803 if (size > high) 2040 if (size > high)
1804 sum += size - high; 2041 sum += size - high;
1805 } 2042 }
@@ -1891,19 +2128,14 @@ void show_free_areas(void)
1891 2128
1892 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2129 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1893 " inactive_file:%lu" 2130 " inactive_file:%lu"
1894//TODO: check/adjust line lengths
1895#ifdef CONFIG_UNEVICTABLE_LRU
1896 " unevictable:%lu" 2131 " unevictable:%lu"
1897#endif
1898 " dirty:%lu writeback:%lu unstable:%lu\n" 2132 " dirty:%lu writeback:%lu unstable:%lu\n"
1899 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2133 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1900 global_page_state(NR_ACTIVE_ANON), 2134 global_page_state(NR_ACTIVE_ANON),
1901 global_page_state(NR_ACTIVE_FILE), 2135 global_page_state(NR_ACTIVE_FILE),
1902 global_page_state(NR_INACTIVE_ANON), 2136 global_page_state(NR_INACTIVE_ANON),
1903 global_page_state(NR_INACTIVE_FILE), 2137 global_page_state(NR_INACTIVE_FILE),
1904#ifdef CONFIG_UNEVICTABLE_LRU
1905 global_page_state(NR_UNEVICTABLE), 2138 global_page_state(NR_UNEVICTABLE),
1906#endif
1907 global_page_state(NR_FILE_DIRTY), 2139 global_page_state(NR_FILE_DIRTY),
1908 global_page_state(NR_WRITEBACK), 2140 global_page_state(NR_WRITEBACK),
1909 global_page_state(NR_UNSTABLE_NFS), 2141 global_page_state(NR_UNSTABLE_NFS),
@@ -1927,25 +2159,21 @@ void show_free_areas(void)
1927 " inactive_anon:%lukB" 2159 " inactive_anon:%lukB"
1928 " active_file:%lukB" 2160 " active_file:%lukB"
1929 " inactive_file:%lukB" 2161 " inactive_file:%lukB"
1930#ifdef CONFIG_UNEVICTABLE_LRU
1931 " unevictable:%lukB" 2162 " unevictable:%lukB"
1932#endif
1933 " present:%lukB" 2163 " present:%lukB"
1934 " pages_scanned:%lu" 2164 " pages_scanned:%lu"
1935 " all_unreclaimable? %s" 2165 " all_unreclaimable? %s"
1936 "\n", 2166 "\n",
1937 zone->name, 2167 zone->name,
1938 K(zone_page_state(zone, NR_FREE_PAGES)), 2168 K(zone_page_state(zone, NR_FREE_PAGES)),
1939 K(zone->pages_min), 2169 K(min_wmark_pages(zone)),
1940 K(zone->pages_low), 2170 K(low_wmark_pages(zone)),
1941 K(zone->pages_high), 2171 K(high_wmark_pages(zone)),
1942 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2172 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1943 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2173 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1944 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2174 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1945 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2175 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1946#ifdef CONFIG_UNEVICTABLE_LRU
1947 K(zone_page_state(zone, NR_UNEVICTABLE)), 2176 K(zone_page_state(zone, NR_UNEVICTABLE)),
1948#endif
1949 K(zone->present_pages), 2177 K(zone->present_pages),
1950 zone->pages_scanned, 2178 zone->pages_scanned,
1951 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2179 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2103,7 +2331,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2103} 2331}
2104 2332
2105 2333
2106#define MAX_NODE_LOAD (num_online_nodes()) 2334#define MAX_NODE_LOAD (nr_online_nodes)
2107static int node_load[MAX_NUMNODES]; 2335static int node_load[MAX_NUMNODES];
2108 2336
2109/** 2337/**
@@ -2312,7 +2540,7 @@ static void build_zonelists(pg_data_t *pgdat)
2312 2540
2313 /* NUMA-aware ordering of nodes */ 2541 /* NUMA-aware ordering of nodes */
2314 local_node = pgdat->node_id; 2542 local_node = pgdat->node_id;
2315 load = num_online_nodes(); 2543 load = nr_online_nodes;
2316 prev_node = local_node; 2544 prev_node = local_node;
2317 nodes_clear(used_mask); 2545 nodes_clear(used_mask);
2318 2546
@@ -2463,7 +2691,7 @@ void build_all_zonelists(void)
2463 2691
2464 printk("Built %i zonelists in %s order, mobility grouping %s. " 2692 printk("Built %i zonelists in %s order, mobility grouping %s. "
2465 "Total pages: %ld\n", 2693 "Total pages: %ld\n",
2466 num_online_nodes(), 2694 nr_online_nodes,
2467 zonelist_order_name[current_zonelist_order], 2695 zonelist_order_name[current_zonelist_order],
2468 page_group_by_mobility_disabled ? "off" : "on", 2696 page_group_by_mobility_disabled ? "off" : "on",
2469 vm_total_pages); 2697 vm_total_pages);
@@ -2542,8 +2770,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2542 2770
2543/* 2771/*
2544 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2772 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2545 * of blocks reserved is based on zone->pages_min. The memory within the 2773 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2546 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2774 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2547 * higher will lead to a bigger reserve which will get freed as contiguous 2775 * higher will lead to a bigger reserve which will get freed as contiguous
2548 * blocks as reclaim kicks in 2776 * blocks as reclaim kicks in
2549 */ 2777 */
@@ -2556,7 +2784,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2556 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2784 /* Get the start pfn, end pfn and the number of blocks to reserve */
2557 start_pfn = zone->zone_start_pfn; 2785 start_pfn = zone->zone_start_pfn;
2558 end_pfn = start_pfn + zone->spanned_pages; 2786 end_pfn = start_pfn + zone->spanned_pages;
2559 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2787 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2560 pageblock_order; 2788 pageblock_order;
2561 2789
2562 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2790 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -2809,7 +3037,7 @@ bad:
2809 if (dzone == zone) 3037 if (dzone == zone)
2810 break; 3038 break;
2811 kfree(zone_pcp(dzone, cpu)); 3039 kfree(zone_pcp(dzone, cpu));
2812 zone_pcp(dzone, cpu) = NULL; 3040 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
2813 } 3041 }
2814 return -ENOMEM; 3042 return -ENOMEM;
2815} 3043}
@@ -2824,7 +3052,7 @@ static inline void free_zone_pagesets(int cpu)
2824 /* Free per_cpu_pageset if it is slab allocated */ 3052 /* Free per_cpu_pageset if it is slab allocated */
2825 if (pset != &boot_pageset[cpu]) 3053 if (pset != &boot_pageset[cpu])
2826 kfree(pset); 3054 kfree(pset);
2827 zone_pcp(zone, cpu) = NULL; 3055 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2828 } 3056 }
2829} 3057}
2830 3058
@@ -3488,7 +3716,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3488 zone_pcp_init(zone); 3716 zone_pcp_init(zone);
3489 for_each_lru(l) { 3717 for_each_lru(l) {
3490 INIT_LIST_HEAD(&zone->lru[l].list); 3718 INIT_LIST_HEAD(&zone->lru[l].list);
3491 zone->lru[l].nr_scan = 0; 3719 zone->lru[l].nr_saved_scan = 0;
3492 } 3720 }
3493 zone->reclaim_stat.recent_rotated[0] = 0; 3721 zone->reclaim_stat.recent_rotated[0] = 0;
3494 zone->reclaim_stat.recent_rotated[1] = 0; 3722 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -3815,6 +4043,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3815 int i, nid; 4043 int i, nid;
3816 unsigned long usable_startpfn; 4044 unsigned long usable_startpfn;
3817 unsigned long kernelcore_node, kernelcore_remaining; 4045 unsigned long kernelcore_node, kernelcore_remaining;
4046 /* save the state before borrow the nodemask */
4047 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
3818 unsigned long totalpages = early_calculate_totalpages(); 4048 unsigned long totalpages = early_calculate_totalpages();
3819 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4049 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3820 4050
@@ -3842,7 +4072,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3842 4072
3843 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4073 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
3844 if (!required_kernelcore) 4074 if (!required_kernelcore)
3845 return; 4075 goto out;
3846 4076
3847 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4077 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
3848 find_usable_zone_for_movable(); 4078 find_usable_zone_for_movable();
@@ -3941,6 +4171,10 @@ restart:
3941 for (nid = 0; nid < MAX_NUMNODES; nid++) 4171 for (nid = 0; nid < MAX_NUMNODES; nid++)
3942 zone_movable_pfn[nid] = 4172 zone_movable_pfn[nid] =
3943 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4173 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4174
4175out:
4176 /* restore the node_state */
4177 node_states[N_HIGH_MEMORY] = saved_node_state;
3944} 4178}
3945 4179
3946/* Any regular memory on that node ? */ 4180/* Any regular memory on that node ? */
@@ -4159,8 +4393,8 @@ static void calculate_totalreserve_pages(void)
4159 max = zone->lowmem_reserve[j]; 4393 max = zone->lowmem_reserve[j];
4160 } 4394 }
4161 4395
4162 /* we treat pages_high as reserved pages. */ 4396 /* we treat the high watermark as reserved pages. */
4163 max += zone->pages_high; 4397 max += high_wmark_pages(zone);
4164 4398
4165 if (max > zone->present_pages) 4399 if (max > zone->present_pages)
4166 max = zone->present_pages; 4400 max = zone->present_pages;
@@ -4210,12 +4444,13 @@ static void setup_per_zone_lowmem_reserve(void)
4210} 4444}
4211 4445
4212/** 4446/**
4213 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4447 * setup_per_zone_wmarks - called when min_free_kbytes changes
4448 * or when memory is hot-{added|removed}
4214 * 4449 *
4215 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4450 * Ensures that the watermark[min,low,high] values for each zone are set
4216 * with respect to min_free_kbytes. 4451 * correctly with respect to min_free_kbytes.
4217 */ 4452 */
4218void setup_per_zone_pages_min(void) 4453void setup_per_zone_wmarks(void)
4219{ 4454{
4220 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4455 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4221 unsigned long lowmem_pages = 0; 4456 unsigned long lowmem_pages = 0;
@@ -4240,7 +4475,7 @@ void setup_per_zone_pages_min(void)
4240 * need highmem pages, so cap pages_min to a small 4475 * need highmem pages, so cap pages_min to a small
4241 * value here. 4476 * value here.
4242 * 4477 *
4243 * The (pages_high-pages_low) and (pages_low-pages_min) 4478 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4244 * deltas controls asynch page reclaim, and so should 4479 * deltas controls asynch page reclaim, and so should
4245 * not be capped for highmem. 4480 * not be capped for highmem.
4246 */ 4481 */
@@ -4251,17 +4486,17 @@ void setup_per_zone_pages_min(void)
4251 min_pages = SWAP_CLUSTER_MAX; 4486 min_pages = SWAP_CLUSTER_MAX;
4252 if (min_pages > 128) 4487 if (min_pages > 128)
4253 min_pages = 128; 4488 min_pages = 128;
4254 zone->pages_min = min_pages; 4489 zone->watermark[WMARK_MIN] = min_pages;
4255 } else { 4490 } else {
4256 /* 4491 /*
4257 * If it's a lowmem zone, reserve a number of pages 4492 * If it's a lowmem zone, reserve a number of pages
4258 * proportionate to the zone's size. 4493 * proportionate to the zone's size.
4259 */ 4494 */
4260 zone->pages_min = tmp; 4495 zone->watermark[WMARK_MIN] = tmp;
4261 } 4496 }
4262 4497
4263 zone->pages_low = zone->pages_min + (tmp >> 2); 4498 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4264 zone->pages_high = zone->pages_min + (tmp >> 1); 4499 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4265 setup_zone_migrate_reserve(zone); 4500 setup_zone_migrate_reserve(zone);
4266 spin_unlock_irqrestore(&zone->lock, flags); 4501 spin_unlock_irqrestore(&zone->lock, flags);
4267 } 4502 }
@@ -4271,8 +4506,6 @@ void setup_per_zone_pages_min(void)
4271} 4506}
4272 4507
4273/** 4508/**
4274 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4275 *
4276 * The inactive anon list should be small enough that the VM never has to 4509 * The inactive anon list should be small enough that the VM never has to
4277 * do too much work, but large enough that each inactive page has a chance 4510 * do too much work, but large enough that each inactive page has a chance
4278 * to be referenced again before it is swapped out. 4511 * to be referenced again before it is swapped out.
@@ -4293,21 +4526,26 @@ void setup_per_zone_pages_min(void)
4293 * 1TB 101 10GB 4526 * 1TB 101 10GB
4294 * 10TB 320 32GB 4527 * 10TB 320 32GB
4295 */ 4528 */
4296static void setup_per_zone_inactive_ratio(void) 4529void calculate_zone_inactive_ratio(struct zone *zone)
4297{ 4530{
4298 struct zone *zone; 4531 unsigned int gb, ratio;
4299 4532
4300 for_each_zone(zone) { 4533 /* Zone size in gigabytes */
4301 unsigned int gb, ratio; 4534 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4302 4535 if (gb)
4303 /* Zone size in gigabytes */
4304 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4305 ratio = int_sqrt(10 * gb); 4536 ratio = int_sqrt(10 * gb);
4306 if (!ratio) 4537 else
4307 ratio = 1; 4538 ratio = 1;
4308 4539
4309 zone->inactive_ratio = ratio; 4540 zone->inactive_ratio = ratio;
4310 } 4541}
4542
4543static void __init setup_per_zone_inactive_ratio(void)
4544{
4545 struct zone *zone;
4546
4547 for_each_zone(zone)
4548 calculate_zone_inactive_ratio(zone);
4311} 4549}
4312 4550
4313/* 4551/*
@@ -4334,7 +4572,7 @@ static void setup_per_zone_inactive_ratio(void)
4334 * 8192MB: 11584k 4572 * 8192MB: 11584k
4335 * 16384MB: 16384k 4573 * 16384MB: 16384k
4336 */ 4574 */
4337static int __init init_per_zone_pages_min(void) 4575static int __init init_per_zone_wmark_min(void)
4338{ 4576{
4339 unsigned long lowmem_kbytes; 4577 unsigned long lowmem_kbytes;
4340 4578
@@ -4345,12 +4583,12 @@ static int __init init_per_zone_pages_min(void)
4345 min_free_kbytes = 128; 4583 min_free_kbytes = 128;
4346 if (min_free_kbytes > 65536) 4584 if (min_free_kbytes > 65536)
4347 min_free_kbytes = 65536; 4585 min_free_kbytes = 65536;
4348 setup_per_zone_pages_min(); 4586 setup_per_zone_wmarks();
4349 setup_per_zone_lowmem_reserve(); 4587 setup_per_zone_lowmem_reserve();
4350 setup_per_zone_inactive_ratio(); 4588 setup_per_zone_inactive_ratio();
4351 return 0; 4589 return 0;
4352} 4590}
4353module_init(init_per_zone_pages_min) 4591module_init(init_per_zone_wmark_min)
4354 4592
4355/* 4593/*
4356 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4594 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4362,7 +4600,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4362{ 4600{
4363 proc_dointvec(table, write, file, buffer, length, ppos); 4601 proc_dointvec(table, write, file, buffer, length, ppos);
4364 if (write) 4602 if (write)
4365 setup_per_zone_pages_min(); 4603 setup_per_zone_wmarks();
4366 return 0; 4604 return 0;
4367} 4605}
4368 4606
@@ -4406,7 +4644,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4406 * whenever sysctl_lowmem_reserve_ratio changes. 4644 * whenever sysctl_lowmem_reserve_ratio changes.
4407 * 4645 *
4408 * The reserve ratio obviously has absolutely no relation with the 4646 * The reserve ratio obviously has absolutely no relation with the
4409 * pages_min watermarks. The lowmem reserve ratio can only make sense 4647 * minimum watermarks. The lowmem reserve ratio can only make sense
4410 * if in function of the boot time zone sizes. 4648 * if in function of the boot time zone sizes.
4411 */ 4649 */
4412int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4650int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4433,7 +4671,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4433 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4671 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4434 if (!write || (ret == -EINVAL)) 4672 if (!write || (ret == -EINVAL))
4435 return ret; 4673 return ret;
4436 for_each_zone(zone) { 4674 for_each_populated_zone(zone) {
4437 for_each_online_cpu(cpu) { 4675 for_each_online_cpu(cpu) {
4438 unsigned long high; 4676 unsigned long high;
4439 high = zone->present_pages / percpu_pagelist_fraction; 4677 high = zone->present_pages / percpu_pagelist_fraction;
@@ -4513,22 +4751,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4513 else if (hashdist) 4751 else if (hashdist)
4514 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4752 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4515 else { 4753 else {
4516 unsigned long order = get_order(size);
4517 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4518 /* 4754 /*
4519 * If bucketsize is not a power-of-two, we may free 4755 * If bucketsize is not a power-of-two, we may free
4520 * some pages at the end of hash table. 4756 * some pages at the end of hash table which
4757 * alloc_pages_exact() automatically does
4521 */ 4758 */
4522 if (table) { 4759 if (get_order(size) < MAX_ORDER) {
4523 unsigned long alloc_end = (unsigned long)table + 4760 table = alloc_pages_exact(size, GFP_ATOMIC);
4524 (PAGE_SIZE << order); 4761 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4525 unsigned long used = (unsigned long)table +
4526 PAGE_ALIGN(size);
4527 split_page(virt_to_page(table), order);
4528 while (used < alloc_end) {
4529 free_page(used);
4530 used += PAGE_SIZE;
4531 }
4532 } 4762 }
4533 } 4763 }
4534 } while (!table && size > PAGE_SIZE && --log2qty); 4764 } while (!table && size > PAGE_SIZE && --log2qty);
@@ -4547,16 +4777,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4547 if (_hash_mask) 4777 if (_hash_mask)
4548 *_hash_mask = (1 << log2qty) - 1; 4778 *_hash_mask = (1 << log2qty) - 1;
4549 4779
4550 /*
4551 * If hashdist is set, the table allocation is done with __vmalloc()
4552 * which invokes the kmemleak_alloc() callback. This function may also
4553 * be called before the slab and kmemleak are initialised when
4554 * kmemleak simply buffers the request to be executed later
4555 * (GFP_ATOMIC flag ignored in this case).
4556 */
4557 if (!hashdist)
4558 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4559
4560 return table; 4780 return table;
4561} 4781}
4562 4782
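
A quick illustration of the central API change in this diff: the per-zone pages_min/pages_low/pages_high fields are folded into a single watermark[] array indexed by WMARK_MIN/WMARK_LOW/WMARK_HIGH, and the ALLOC_WMARK_* allocator flags are redefined so that they double as that index (extracted with ALLOC_WMARK_MASK). That is what lets get_page_from_freelist() do a single "mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]" lookup instead of the old if/else chain. The sketch below is a standalone, simplified model of that technique only, not kernel code; struct zone_demo, wmark_for_alloc() and the sample values are invented for the example.

/*
 * Standalone sketch of the watermark-array technique used by this diff.
 * Everything here is illustrative; none of these definitions are the
 * kernel's own.
 */
#include <stdio.h>

enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

struct zone_demo {
	unsigned long watermark[NR_WMARK];	/* replaces pages_min/low/high */
};

/* The low ALLOC_* bits now double as an index into zone->watermark[]. */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04	/* don't check watermarks at all */
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS - 1)

static unsigned long wmark_for_alloc(const struct zone_demo *z, int alloc_flags)
{
	/* One table lookup replaces the old if/else chain on ALLOC_WMARK_*. */
	return z->watermark[alloc_flags & ALLOC_WMARK_MASK];
}

int main(void)
{
	struct zone_demo z = { .watermark = { 128, 160, 192 } };

	printf("min=%lu low=%lu high=%lu\n",
	       wmark_for_alloc(&z, ALLOC_WMARK_MIN),
	       wmark_for_alloc(&z, ALLOC_WMARK_LOW),
	       wmark_for_alloc(&z, ALLOC_WMARK_HIGH));
	return 0;
}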