Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	974
1 file changed, 822 insertions(+), 152 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b5358a0561f..4f59d90b81e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,8 @@
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
 #include <linux/stop_machine.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -51,7 +53,6 @@ EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
 unsigned long totalram_pages __read_mostly;
-unsigned long totalhigh_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
 int percpu_pagelist_fraction;
@@ -69,7 +70,15 @@ static void __free_pages_ok(struct page *page, unsigned int order);
  * TBD: should special case ZONE_DMA32 machines here - in those we normally
  * don't need any ZONE_NORMAL reservation
  */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+	 256,
+#ifdef CONFIG_ZONE_DMA32
+	 256,
+#endif
+#ifdef CONFIG_HIGHMEM
+	 32
+#endif
+};
 
 EXPORT_SYMBOL(totalram_pages);
 
@@ -80,11 +89,53 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = {
+	 "DMA",
+#ifdef CONFIG_ZONE_DMA32
+	 "DMA32",
+#endif
+	 "Normal",
+#ifdef CONFIG_HIGHMEM
+	 "HighMem"
+#endif
+};
+
 int min_free_kbytes = 1024;
 
 unsigned long __meminitdata nr_kernel_pages;
 unsigned long __meminitdata nr_all_pages;
+static unsigned long __initdata dma_reserve;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  /*
+   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
+   * ranges of memory (RAM) that may be registered with add_active_range().
+   * Ranges passed to add_active_range() will be merged if possible
+   * so the number of times add_active_range() can be called is
+   * related to the number of nodes and the number of holes
+   */
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #if MAX_NUMNODES >= 32
+      /* If there can be many nodes, allow up to 50 holes per node */
+      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
+    #else
+      /* By default, allow up to 256 distinct regions */
+      #define MAX_ACTIVE_REGIONS 256
+    #endif
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  int __initdata nr_nodemap_entries;
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+  unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
+  unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
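[Editor's note, for context rather than part of the patch: a minimal sketch of how an architecture is expected to drive the registration interface added above, using only functions introduced by this diff. The node id, PFN values and the 16MB DMA limit are hypothetical.]

/* Hypothetical arch boot code: register each usable RAM range, then let
 * the core calculate per-node, per-zone sizes and account for the holes. */
void __init example_arch_mem_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	/* One call per usable range from the firmware map (values made up);
	 * overlapping or adjacent registrations are merged automatically. */
	add_active_range(0, 0, 0x9f);		/* node 0: 0 - ~640KB */
	add_active_range(0, 0x100, 0x20000);	/* node 0: 1MB - 512MB */

	/* Upper PFN limit of each zone; a zone whose limit equals the
	 * previous zone's limit is treated as empty. */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = 0x1000;	/* assumed 16MB ISA DMA limit */
	max_zone_pfns[ZONE_NORMAL] = find_max_pfn_with_active_regions();
	free_area_init_nodes(max_zone_pfns);
}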
@@ -127,7 +178,6 @@ static int bad_range(struct zone *zone, struct page *page)
 
 	return 0;
 }
-
 #else
 static inline int bad_range(struct zone *zone, struct page *page)
 {
@@ -218,12 +268,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	int i;
 
-	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+	VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
 	/*
 	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
 	 * and __GFP_HIGHMEM from hard or soft interrupt context.
 	 */
-	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
+	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
 	for (i = 0; i < (1 << order); i++)
 		clear_highpage(page + i);
 }
@@ -347,8 +397,8 @@ static inline void __free_one_page(struct page *page,
 
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
-	BUG_ON(page_idx & (order_size - 1));
-	BUG_ON(bad_range(zone, page));
+	VM_BUG_ON(page_idx & (order_size - 1));
+	VM_BUG_ON(bad_range(zone, page));
 
 	zone->free_pages += order_size;
 	while (order < MAX_ORDER-1) {
@@ -421,7 +471,7 @@ static void free_pages_bulk(struct zone *zone, int count,
 	while (count--) {
 		struct page *page;
 
-		BUG_ON(list_empty(list));
+		VM_BUG_ON(list_empty(list));
 		page = list_entry(list->prev, struct page, lru);
 		/* have to delete it as __free_one_page list manipulates */
 		list_del(&page->lru);
@@ -432,9 +482,11 @@ static void free_pages_bulk(struct zone *zone, int count,
 
 static void free_one_page(struct zone *zone, struct page *page, int order)
 {
-	LIST_HEAD(list);
-	list_add(&page->lru, &list);
-	free_pages_bulk(zone, 1, &list, order);
+	spin_lock(&zone->lock);
+	zone->all_unreclaimable = 0;
+	zone->pages_scanned = 0;
+	__free_one_page(page, zone, order);
+	spin_unlock(&zone->lock);
 }
 
 static void __free_pages_ok(struct page *page, unsigned int order)
@@ -512,7 +564,7 @@ static inline void expand(struct zone *zone, struct page *page,
 		area--;
 		high--;
 		size >>= 1;
-		BUG_ON(bad_range(zone, &page[size]));
+		VM_BUG_ON(bad_range(zone, &page[size]));
 		list_add(&page[size].lru, &area->free_list);
 		area->nr_free++;
 		set_page_order(&page[size], high);
@@ -615,19 +667,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 #ifdef CONFIG_NUMA
 /*
  * Called from the slab reaper to drain pagesets on a particular node that
- * belong to the currently executing processor.
+ * belongs to the currently executing processor.
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
 void drain_node_pages(int nodeid)
 {
-	int i, z;
+	int i;
+	enum zone_type z;
 	unsigned long flags;
 
 	for (z = 0; z < MAX_NR_ZONES; z++) {
 		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
 
+		if (!populated_zone(zone))
+			continue;
+
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
@@ -672,7 +728,8 @@ static void __drain_pages(unsigned int cpu)
 
 void mark_free_pages(struct zone *zone)
 {
-	unsigned long zone_pfn, flags;
+	unsigned long pfn, max_zone_pfn;
+	unsigned long flags;
 	int order;
 	struct list_head *curr;
 
@@ -680,18 +737,25 @@ void mark_free_pages(struct zone *zone)
 		return;
 
 	spin_lock_irqsave(&zone->lock, flags);
-	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
+
+	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+
+			if (!PageNosave(page))
+				ClearPageNosaveFree(page);
+		}
 
 	for (order = MAX_ORDER - 1; order >= 0; --order)
 		list_for_each(curr, &zone->free_area[order].free_list) {
-			unsigned long start_pfn, i;
+			unsigned long i;
 
-			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			for (i = 0; i < (1UL << order); i++)
+				SetPageNosaveFree(pfn_to_page(pfn + i));
+		}
 
-			for (i=0; i < (1<<order); i++)
-				SetPageNosaveFree(pfn_to_page(start_pfn+i));
-		}
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
@@ -761,8 +825,8 @@ void split_page(struct page *page, unsigned int order)
 {
 	int i;
 
-	BUG_ON(PageCompound(page));
-	BUG_ON(!page_count(page));
+	VM_BUG_ON(PageCompound(page));
+	VM_BUG_ON(!page_count(page));
 	for (i = 1; i < (1 << order); i++)
 		set_page_refcounted(page + i);
 }
@@ -809,7 +873,7 @@ again:
 	local_irq_restore(flags);
 	put_cpu();
 
-	BUG_ON(bad_range(zone, page));
+	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
 		goto again;
 	return page;
@@ -870,32 +934,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	struct zone **z = zonelist->zones;
 	struct page *page = NULL;
 	int classzone_idx = zone_idx(*z);
+	struct zone *zone;
 
 	/*
 	 * Go through the zonelist once, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	do {
+		zone = *z;
+		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
+			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
+				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(*z, gfp_mask))
+			!cpuset_zone_allowed(zone, gfp_mask))
 			continue;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
 			if (alloc_flags & ALLOC_WMARK_MIN)
-				mark = (*z)->pages_min;
+				mark = zone->pages_min;
 			else if (alloc_flags & ALLOC_WMARK_LOW)
-				mark = (*z)->pages_low;
+				mark = zone->pages_low;
 			else
-				mark = (*z)->pages_high;
-			if (!zone_watermark_ok(*z, order, mark,
+				mark = zone->pages_high;
+			if (!zone_watermark_ok(zone, order, mark,
 				    classzone_idx, alloc_flags))
 				if (!zone_reclaim_mode ||
-					!zone_reclaim(*z, gfp_mask, order))
+					!zone_reclaim(zone, gfp_mask, order))
 					continue;
 		}
 
-		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
 		if (page) {
 			break;
 		}
@@ -1083,7 +1152,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
 	 * get_zeroed_page() returns a 32-bit address, which cannot represent
 	 * a highmem page
 	 */
-	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
 
 	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
 	if (page)
@@ -1116,7 +1185,7 @@ EXPORT_SYMBOL(__free_pages);
 fastcall void free_pages(unsigned long addr, unsigned int order)
 {
 	if (addr != 0) {
-		BUG_ON(!virt_addr_valid((void *)addr));
+		VM_BUG_ON(!virt_addr_valid((void *)addr));
 		__free_pages(virt_to_page((void *)addr), order);
 	}
 }
@@ -1142,7 +1211,8 @@ EXPORT_SYMBOL(nr_free_pages);
 #ifdef CONFIG_NUMA
 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 {
-	unsigned int i, sum = 0;
+	unsigned int sum = 0;
+	enum zone_type i;
 
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		sum += pgdat->node_zones[i].free_pages;
@@ -1187,27 +1257,11 @@ unsigned int nr_free_pagecache_pages(void)
 	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
 }
 
-#ifdef CONFIG_HIGHMEM
-unsigned int nr_free_highpages (void)
+static inline void show_node(struct zone *zone)
 {
-	pg_data_t *pgdat;
-	unsigned int pages = 0;
-
-	for_each_online_pgdat(pgdat)
-		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
-
-	return pages;
+	if (NUMA_BUILD)
+		printk("Node %ld ", zone_to_nid(zone));
 }
-#endif
-
-#ifdef CONFIG_NUMA
-static void show_node(struct zone *zone)
-{
-	printk("Node %d ", zone->zone_pgdat->node_id);
-}
-#else
-#define show_node(zone) do { } while (0)
-#endif
 
 void si_meminfo(struct sysinfo *val)
 {
@@ -1215,13 +1269,8 @@ void si_meminfo(struct sysinfo *val)
 	val->sharedram = 0;
 	val->freeram = nr_free_pages();
 	val->bufferram = nr_blockdev_pages();
-#ifdef CONFIG_HIGHMEM
 	val->totalhigh = totalhigh_pages;
 	val->freehigh = nr_free_highpages();
-#else
-	val->totalhigh = 0;
-	val->freehigh = 0;
-#endif
 	val->mem_unit = PAGE_SIZE;
 }
 
@@ -1234,8 +1283,13 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 
 	val->totalram = pgdat->node_present_pages;
 	val->freeram = nr_free_pages_pgdat(pgdat);
+#ifdef CONFIG_HIGHMEM
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
 	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+#else
+	val->totalhigh = 0;
+	val->freehigh = 0;
+#endif
 	val->mem_unit = PAGE_SIZE;
 }
 #endif
@@ -1249,43 +1303,35 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  */
 void show_free_areas(void)
 {
-	int cpu, temperature;
+	int cpu;
 	unsigned long active;
 	unsigned long inactive;
 	unsigned long free;
 	struct zone *zone;
 
 	for_each_zone(zone) {
-		show_node(zone);
-		printk("%s per-cpu:", zone->name);
-
-		if (!populated_zone(zone)) {
-			printk(" empty\n");
+		if (!populated_zone(zone))
 			continue;
-		} else
-			printk("\n");
+
+		show_node(zone);
+		printk("%s per-cpu:\n", zone->name);
 
 		for_each_online_cpu(cpu) {
 			struct per_cpu_pageset *pageset;
 
 			pageset = zone_pcp(zone, cpu);
 
-			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: high %d, batch %d used:%d\n",
-					cpu,
-					temperature ? "cold" : "hot",
-					pageset->pcp[temperature].high,
-					pageset->pcp[temperature].batch,
-					pageset->pcp[temperature].count);
+			printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d   "
+				"Cold: hi:%5d, btch:%4d usd:%4d\n",
+				cpu, pageset->pcp[0].high,
+				pageset->pcp[0].batch, pageset->pcp[0].count,
+				pageset->pcp[1].high, pageset->pcp[1].batch,
+				pageset->pcp[1].count);
 		}
 	}
 
 	get_zone_counts(&active, &inactive, &free);
 
-	printk("Free pages: %11ukB (%ukB HighMem)\n",
-		K(nr_free_pages()),
-		K(nr_free_highpages()));
-
 	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
 		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
 		active,
@@ -1294,13 +1340,17 @@ void show_free_areas(void)
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
 		nr_free_pages(),
-		global_page_state(NR_SLAB),
+		global_page_state(NR_SLAB_RECLAIMABLE) +
+			global_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_page_state(NR_FILE_MAPPED),
 		global_page_state(NR_PAGETABLE));
 
 	for_each_zone(zone) {
 		int i;
 
+		if (!populated_zone(zone))
+			continue;
+
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -1333,12 +1383,11 @@ void show_free_areas(void)
 	for_each_zone(zone) {
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
+		if (!populated_zone(zone))
+			continue;
+
 		show_node(zone);
 		printk("%s: ", zone->name);
-		if (!populated_zone(zone)) {
-			printk("empty\n");
-			continue;
-		}
 
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < MAX_ORDER; order++) {
@@ -1360,39 +1409,25 @@ void show_free_areas(void)
  * Add all populated zones of a node to the zonelist.
  */
 static int __meminit build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, int zone_type)
+		struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
 {
 	struct zone *zone;
 
-	BUG_ON(zone_type > ZONE_HIGHMEM);
+	BUG_ON(zone_type >= MAX_NR_ZONES);
+	zone_type++;
 
 	do {
+		zone_type--;
 		zone = pgdat->node_zones + zone_type;
 		if (populated_zone(zone)) {
-#ifndef CONFIG_HIGHMEM
-			BUG_ON(zone_type > ZONE_NORMAL);
-#endif
 			zonelist->zones[nr_zones++] = zone;
 			check_highest_zone(zone_type);
 		}
-		zone_type--;
 
-	} while (zone_type >= 0);
+	} while (zone_type);
 	return nr_zones;
 }
 
-static inline int highest_zone(int zone_bits)
-{
-	int res = ZONE_NORMAL;
-	if (zone_bits & (__force int)__GFP_HIGHMEM)
-		res = ZONE_HIGHMEM;
-	if (zone_bits & (__force int)__GFP_DMA32)
-		res = ZONE_DMA32;
-	if (zone_bits & (__force int)__GFP_DMA)
-		res = ZONE_DMA;
-	return res;
-}
-
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
 static int __meminitdata node_load[MAX_NUMNODES];
@@ -1458,13 +1493,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int i, j, k, node, local_node;
+	int j, node, local_node;
+	enum zone_type i;
 	int prev_node, load;
 	struct zonelist *zonelist;
 	nodemask_t used_mask;
 
 	/* initialize zonelists */
-	for (i = 0; i < GFP_ZONETYPES; i++) {
+	for (i = 0; i < MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
@@ -1494,13 +1530,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 		node_load[node] += load;
 		prev_node = node;
 		load--;
-		for (i = 0; i < GFP_ZONETYPES; i++) {
+		for (i = 0; i < MAX_NR_ZONES; i++) {
 			zonelist = pgdat->node_zonelists + i;
 			for (j = 0; zonelist->zones[j] != NULL; j++);
 
-			k = highest_zone(i);
-
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 			zonelist->zones[j] = NULL;
 		}
 	}
@@ -1510,17 +1544,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int i, j, k, node, local_node;
+	int node, local_node;
+	enum zone_type i, j;
 
 	local_node = pgdat->node_id;
-	for (i = 0; i < GFP_ZONETYPES; i++) {
+	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zonelist *zonelist;
 
 		zonelist = pgdat->node_zonelists + i;
 
-		j = 0;
-		k = highest_zone(i);
-		j = build_zonelists_node(pgdat, zonelist, j, k);
+		j = build_zonelists_node(pgdat, zonelist, 0, i);
 		/*
 		 * Now we build the zonelist so that it contains the zones
 		 * of all the other nodes.
@@ -1532,12 +1565,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 	}
 	for (node = 0; node < local_node; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 	}
 
 	zonelist->zones[j] = NULL;
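[Editor's illustration, not in the patch: with zonelists now indexed by zone type instead of GFP_ZONETYPES, a single-node machine with populated DMA, Normal and HighMem zones ends up with fallback lists shaped like the sketch below; build_zonelists_node() walks from the requested zone type down to zone 0, appending each populated zone.]

/* Conceptual zonelist contents after build_zonelists(), highest zone first:
 *
 *	node_zonelists[ZONE_HIGHMEM].zones = { HighMem, Normal, DMA, NULL }
 *	node_zonelists[ZONE_NORMAL].zones  = { Normal, DMA, NULL }
 *	node_zonelists[ZONE_DMA].zones     = { DMA, NULL }
 *
 * so an allocation constrained to a given zone type can still fall back
 * to any lower zone on the same node (and then to other nodes on NUMA).
 */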
@@ -1558,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy)
 void __meminit build_all_zonelists(void)
 {
 	if (system_state == SYSTEM_BOOTING) {
-		__build_all_zonelists(0);
+		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
 	} else {
 		/* we have to stop all cpus to guaranntee there is no user
@@ -1639,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
-static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long realtotalpages, totalpages = 0;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		totalpages += zones_size[i];
-	pgdat->node_spanned_pages = totalpages;
-
-	realtotalpages = totalpages;
-	if (zholes_size)
-		for (i = 0; i < MAX_NR_ZONES; i++)
-			realtotalpages -= zholes_size[i];
-	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
@@ -1698,8 +1712,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 }
 
 #define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-		unsigned long size)
+void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
+		unsigned long pfn, unsigned long size)
 {
 	unsigned long snum = pfn_to_section_nr(pfn);
 	unsigned long end = pfn_to_section_nr(pfn + size);
@@ -1815,6 +1829,9 @@ static int __cpuinit process_zones(int cpu)
 
 	for_each_zone(zone) {
 
+		if (!populated_zone(zone))
+			continue;
+
 		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
 					 GFP_KERNEL, cpu_to_node(cpu));
 		if (!zone_pcp(zone, cpu))
@@ -1845,8 +1862,10 @@ static inline void free_zone_pagesets(int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
 
+		/* Free per_cpu_pageset if it is slab allocated */
+		if (pset != &boot_pageset[cpu])
+			kfree(pset);
 		zone_pcp(zone, cpu) = NULL;
-		kfree(pset);
 	}
 }
 
@@ -1972,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone,
 	return 0;
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * Basic iterator support. Return the first range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns first region regardless of node
+ */
+static int __init first_active_region_index_in_nid(int nid)
+{
+	int i;
+
+	for (i = 0; i < nr_nodemap_entries; i++)
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+
+	return -1;
+}
+
+/*
+ * Basic iterator support. Return the next active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __init next_active_region_index_in_nid(int index, int nid)
+{
+	for (index = index + 1; index < nr_nodemap_entries; index++)
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+
+	return -1;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ * Architectures may implement their own version but if add_active_range()
+ * was used and there are no special requirements, this is a convenient
+ * alternative
+ */
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; i < nr_nodemap_entries; i++) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if (start_pfn <= pfn && pfn < end_pfn)
+			return early_node_map[i].nid;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+/* Basic iterator support to walk early_node_map[] */
+#define for_each_active_range_index_in_nid(i, nid) \
+	for (i = first_active_region_index_in_nid(nid); i != -1; \
+		i = next_active_region_index_in_nid(i, nid))
+
+/**
+ * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this
+ * function may be used instead of calling free_bootmem() manually.
+ */
+void __init free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long size_pages = 0;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if (early_node_map[i].start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		size_pages = end_pfn - early_node_map[i].start_pfn;
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(early_node_map[i].start_pfn),
+				size_pages << PAGE_SHIFT);
+	}
+}
+
+/**
+ * sparse_memory_present_with_active_regions - Call memory_present for each active range
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this
+ * function may be used instead of calling memory_present() manually.
+ */
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid)
+		memory_present(early_node_map[i].nid,
+				early_node_map[i].start_pfn,
+				early_node_map[i].end_pfn);
+}
+
+/**
+ * push_node_boundaries - Push node boundaries to at least the requested boundary
+ * @nid: The nid of the node to push the boundary for
+ * @start_pfn: The start pfn of the node
+ * @end_pfn: The end pfn of the node
+ *
+ * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
+ * time. Specifically, on x86_64, SRAT will report ranges that can potentially
+ * be hotplugged even though no physical memory exists. This function allows
+ * an arch to push out the node boundaries so mem_map is allocated that can
+ * be used later.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+void __init push_node_boundaries(unsigned int nid,
+		unsigned long start_pfn, unsigned long end_pfn)
+{
+	printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
+			nid, start_pfn, end_pfn);
+
+	/* Initialise the boundary for this node if necessary */
+	if (node_boundary_end_pfn[nid] == 0)
+		node_boundary_start_pfn[nid] = -1UL;
+
+	/* Update the boundaries */
+	if (node_boundary_start_pfn[nid] > start_pfn)
+		node_boundary_start_pfn[nid] = start_pfn;
+	if (node_boundary_end_pfn[nid] < end_pfn)
+		node_boundary_end_pfn[nid] = end_pfn;
+}
+
+/* If necessary, push the node boundary out for reserve hotadd */
+static void __init account_node_boundary(unsigned int nid,
+		unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
+			nid, *start_pfn, *end_pfn);
+
+	/* Return if boundary information has not been provided */
+	if (node_boundary_end_pfn[nid] == 0)
+		return;
+
+	/* Check the boundaries and update if necessary */
+	if (node_boundary_start_pfn[nid] < *start_pfn)
+		*start_pfn = node_boundary_start_pfn[nid];
+	if (node_boundary_end_pfn[nid] > *end_pfn)
+		*end_pfn = node_boundary_end_pfn[nid];
+}
+#else
+void __init push_node_boundaries(unsigned int nid,
+		unsigned long start_pfn, unsigned long end_pfn) {}
+
+static void __init account_node_boundary(unsigned int nid,
+		unsigned long *start_pfn, unsigned long *end_pfn) {}
+#endif
+
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by an arch calling add_active_range(). If called for a node
+ * with no available memory, a warning is printed and the start and end
+ * PFNs will be 0.
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	int i;
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+	}
+
+	if (*start_pfn == -1UL) {
+		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+		*start_pfn = 0;
+	}
+
+	/* Push the node boundaries out if requested */
+	account_node_boundary(nid, start_pfn, end_pfn);
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+unsigned long __init zone_spanned_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the node and zone */
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+	/* Return the spanned pages */
+	return zone_end_pfn - zone_start_pfn;
+}
+
+/*
+ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * then all holes in the requested range will be accounted for
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	int i = 0;
+	unsigned long prev_end_pfn = 0, hole_pages = 0;
+	unsigned long start_pfn;
+
+	/* Find the end_pfn of the first active range of pfns in the node */
+	i = first_active_region_index_in_nid(nid);
+	if (i == -1)
+		return 0;
+
+	/* Account for ranges before physical memory on this node */
+	if (early_node_map[i].start_pfn > range_start_pfn)
+		hole_pages = early_node_map[i].start_pfn - range_start_pfn;
+
+	prev_end_pfn = early_node_map[i].start_pfn;
+
+	/* Find all holes for the zone within the node */
+	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
+
+		/* No need to continue if prev_end_pfn is outside the zone */
+		if (prev_end_pfn >= range_end_pfn)
+			break;
+
+		/* Make sure the end of the zone is not within the hole */
+		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+		/* Update the hole size count and move on */
+		if (start_pfn > range_start_pfn) {
+			BUG_ON(prev_end_pfn > start_pfn);
+			hole_pages += start_pfn - prev_end_pfn;
+		}
+		prev_end_pfn = early_node_map[i].end_pfn;
+	}
+
+	/* Account for ranges past physical memory on this node */
+	if (range_end_pfn > prev_end_pfn)
+		hole_pages += range_end_pfn -
+				max(range_start_pfn, prev_end_pfn);
+
+	return hole_pages;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * It returns the number of page frames in memory holes within a range
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
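[Editor's addition, a worked example of the hole accounting above; the ranges are invented.]

/* Suppose node 0 registered two active ranges:
 *	early_node_map[0] = { .nid = 0, .start_pfn = 0x100, .end_pfn = 0x200 }
 *	early_node_map[1] = { .nid = 0, .start_pfn = 0x400, .end_pfn = 0x800 }
 *
 * Then for __absent_pages_in_range(0, 0x0, 0x800):
 *	pages before the first range:	0x100 - 0x0   = 0x100
 *	gap between the two ranges:	0x400 - 0x200 = 0x200
 *	pages past the last range:	none (it ends at 0x800)
 * so absent_pages_in_range(0x0, 0x800) == 0x300.
 */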
+/* Return the number of page frames in holes in a zone on a node */
+unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
+							node_start_pfn);
+	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
+							node_end_pfn);
+
+	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+}
+
+/* Return the zone index a PFN is in */
+int memmap_zone_idx(struct page *lmem_map)
+{
+	int i;
+	unsigned long phys_addr = virt_to_phys(lmem_map);
+	unsigned long pfn = phys_addr >> PAGE_SHIFT;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		if (pfn < arch_zone_highest_possible_pfn[i])
+			break;
+
+	return i;
+}
+#else
+static inline unsigned long zone_spanned_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zones_size)
+{
+	return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zholes_size)
+{
+	if (!zholes_size)
+		return 0;
+
+	return zholes_size[zone_type];
+}
+
+static inline int memmap_zone_idx(struct page *lmem_map)
+{
+	return MAX_NR_ZONES;
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	enum zone_type i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
+								zones_size);
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		realtotalpages -=
+			zone_absent_pages_in_node(pgdat->node_id, i,
+								zholes_size);
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							realtotalpages);
+}
+
 /*
  * Set up the zone data structures:
  * - mark all pages reserved
@@ -1981,7 +2360,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
 static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
-	unsigned long j;
+	enum zone_type j;
 	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 	int ret;
@@ -1993,21 +2372,46 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize;
+		unsigned long size, realsize, memmap_pages;
 
-		realsize = size = zones_size[j];
-		if (zholes_size)
-			realsize -= zholes_size[j];
+		size = zone_spanned_pages_in_node(nid, j, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j,
+								zholes_size);
 
-		if (j < ZONE_HIGHMEM)
+		/*
+		 * Adjust realsize so that it accounts for how much memory
+		 * is used by this zone for memmap. This affects the watermark
+		 * and per-cpu initialisations
+		 */
+		memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
+		if (realsize >= memmap_pages) {
+			realsize -= memmap_pages;
+			printk(KERN_DEBUG
+				"  %s zone: %lu pages used for memmap\n",
+				zone_names[j], memmap_pages);
+		} else
+			printk(KERN_WARNING
+				"  %s zone: %lu pages exceeds realsize %lu\n",
+				zone_names[j], memmap_pages, realsize);
+
+		/* Account for reserved DMA pages */
+		if (j == ZONE_DMA && realsize > dma_reserve) {
+			realsize -= dma_reserve;
+			printk(KERN_DEBUG "  DMA zone: %lu pages reserved\n",
+							dma_reserve);
+		}
+
+		if (!is_highmem_idx(j))
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
 #ifdef CONFIG_NUMA
-		zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio)
+		zone->node = nid;
+		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
+		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
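[Editor's addition, to make the memmap accounting concrete; assumes 4KB pages (PAGE_SHIFT == 12) and a 32-byte struct page.]

/* A zone spanning 262144 PFNs (1GB of address space with 4KB pages) needs
 *	memmap_pages = (262144 * 32) >> 12 = 2048 pages (8MB)
 * of mem_map, so realsize drops from 262144 to 260096 and the zone's
 * watermarks and per-cpu batch sizes are derived from the smaller figure.
 */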
@@ -2067,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 	/*
 	 * With no DISCONTIG, the global mem_map is just set as node 0's
 	 */
-	if (pgdat == NODE_DATA(0))
+	if (pgdat == NODE_DATA(0)) {
 		mem_map = NODE_DATA(0)->node_mem_map;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+			mem_map -= pgdat->node_start_pfn;
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+	}
 #endif
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
@@ -2079,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
 {
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
-	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/**
+ * add_active_range - Register a range of PFNs backed by physical memory
+ * @nid: The node ID the range resides on
+ * @start_pfn: The start PFN of the available physical memory
+ * @end_pfn: The end PFN of the available physical memory
+ *
+ * These ranges are stored in an early_node_map[] and later used by
+ * free_area_init_nodes() to calculate zone sizes and holes. If the
+ * range spans a memory hole, it is up to the architecture to ensure
+ * the memory is not freed by the bootmem allocator. If possible
+ * the range being registered will be merged with existing ranges.
+ */
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	int i;
+
+	printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
+			  "%d entries of %d used\n",
+			  nid, start_pfn, end_pfn,
+			  nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; i < nr_nodemap_entries; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/* Merge backward if suitable */
+		if (start_pfn < early_node_map[i].end_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Check that early_node_map is large enough */
+	if (i >= MAX_ACTIVE_REGIONS) {
+		printk(KERN_CRIT "More than %d memory regions, truncating\n",
+						MAX_ACTIVE_REGIONS);
+		return;
+	}
+
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+	nr_nodemap_entries = i + 1;
+}
+
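[Editor's addition, an illustrative call sequence for the merge rules above; the PFNs are invented.]

/* add_active_range(0, 0x000, 0x100);	new entry	 [0x000, 0x100)
 * add_active_range(0, 0x0c0, 0x180);	merges forward	 [0x000, 0x180)
 * add_active_range(0, 0x200, 0x300);	new entry	 [0x200, 0x300)
 * add_active_range(0, 0x1c0, 0x280);	merges backward	 [0x1c0, 0x300)
 * add_active_range(0, 0x040, 0x080);	already covered, ignored
 */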
+/**
+ * shrink_active_range - Shrink an existing registered range of PFNs
+ * @nid: The node id the range is on that should be shrunk
+ * @old_end_pfn: The old end PFN of the range
+ * @new_end_pfn: The new PFN of the range
+ *
+ * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
+ * The map is kept at the end of the physical page range that has already
+ * been registered with add_active_range(). This function allows an arch to
+ * shrink an existing registered range.
+ */
+void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+						unsigned long new_end_pfn)
+{
+	int i;
+
+	/* Find the old active region end and shrink */
+	for_each_active_range_index_in_nid(i, nid)
+		if (early_node_map[i].end_pfn == old_end_pfn) {
+			early_node_map[i].end_pfn = new_end_pfn;
+			break;
+		}
+}
+
+/**
+ * remove_all_active_ranges - Remove all currently registered regions
+ *
+ * During discovery, it may be found that a table like SRAT is invalid
+ * and an alternative discovery method must be used. This function removes
+ * all currently registered regions.
+ */
+void __init remove_all_active_ranges(void)
+{
+	memset(early_node_map, 0, sizeof(early_node_map));
+	nr_nodemap_entries = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+	memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
+	memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *arange = (struct node_active_region *)a;
+	struct node_active_region *brange = (struct node_active_region *)b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+	sort(early_node_map, (size_t)nr_nodemap_entries,
+			sizeof(struct node_active_region),
+			cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+	int i;
+
+	/* Assuming a sorted map, the first range found has the starting pfn */
+	for_each_active_range_index_in_nid(i, nid)
+		return early_node_map[i].start_pfn;
+
+	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+	return 0;
+}
+
+/**
+ * find_min_pfn_with_active_regions - Find the minimum PFN registered
+ *
+ * It returns the minimum PFN based on information provided via
+ * add_active_range().
+ */
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+	return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+/**
+ * find_max_pfn_with_active_regions - Find the maximum PFN registered
+ *
+ * It returns the maximum PFN based on information provided via
+ * add_active_range().
+ */
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+	int i;
+	unsigned long max_pfn = 0;
+
+	for (i = 0; i < nr_nodemap_entries; i++)
+		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+	return max_pfn;
+}
+
+/**
+ * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by add_active_range(), the size of each
+ * zone in each node and their holes is calculated. If the maximum PFNs of
+ * two adjacent zones match, it is assumed that the upper zone is empty.
+ * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
+ * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+{
+	unsigned long nid;
+	enum zone_type i;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
+	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
+	for (i = 1; i < MAX_NR_ZONES; i++) {
+		arch_zone_lowest_possible_pfn[i] =
+			arch_zone_highest_possible_pfn[i-1];
+		arch_zone_highest_possible_pfn[i] =
+			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+	}
+
+	/* Regions in the early_node_map can be in any order */
+	sort_node_map();
+
+	/* Print out the zone ranges */
+	printk("Zone PFN ranges:\n");
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		printk("  %-8s %8lu -> %8lu\n",
+				zone_names[i],
+				arch_zone_lowest_possible_pfn[i],
+				arch_zone_highest_possible_pfn[i]);
+
+	/* Print out the early_node_map[] */
+	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
+	for (i = 0; i < nr_nodemap_entries; i++)
+		printk("  %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+			early_node_map[i].start_pfn,
+			early_node_map[i].end_pfn);
+
+	/* Initialise every node */
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_min_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/**
+ * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in
+ * ZONE_DMA. The effect will be lower watermarks and a smaller per-cpu
+ * batchsize.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+	dma_reserve = new_dma_reserve;
+}
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
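[Editor's addition, a hedged usage sketch for set_dma_reserve(); the 4MB figure is invented. An architecture that knows the kernel image and early boot allocations pin down part of ZONE_DMA would call, during its memory setup:]

/* Tell the allocator that 4MB of ZONE_DMA can never be freed, so zone
 * watermarks and per-cpu batch sizes are not computed from pages that
 * will never become available. */
set_dma_reserve(4 * 1024 * 1024 / PAGE_SIZE);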
@@ -2129,7 +2780,7 @@ static void calculate_totalreserve_pages(void)
 {
 	struct pglist_data *pgdat;
 	unsigned long reserve_pages = 0;
-	int i, j;
+	enum zone_type i, j;
 
 	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -2162,7 +2813,7 @@ static void calculate_totalreserve_pages(void)
 static void setup_per_zone_lowmem_reserve(void)
 {
 	struct pglist_data *pgdat;
-	int j, idx;
+	enum zone_type j, idx;
 
 	for_each_online_pgdat(pgdat) {
 		for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -2171,9 +2822,12 @@ static void setup_per_zone_lowmem_reserve(void)
 
 			zone->lowmem_reserve[j] = 0;
 
-			for (idx = j-1; idx >= 0; idx--) {
+			idx = j;
+			while (idx) {
 				struct zone *lower_zone;
 
+				idx--;
+
 				if (sysctl_lowmem_reserve_ratio[idx] < 1)
 					sysctl_lowmem_reserve_ratio[idx] = 1;
 
@@ -2314,10 +2968,26 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 		return rc;
 
 	for_each_zone(zone)
-		zone->min_unmapped_ratio = (zone->present_pages *
+		zone->min_unmapped_pages = (zone->present_pages *
 				sysctl_min_unmapped_ratio) / 100;
 	return 0;
 }
+
+int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->min_slab_pages = (zone->present_pages *
+				sysctl_min_slab_ratio) / 100;
+	return 0;
+}
 #endif
 
 /*