author     Dave Jones <davej@redhat.com>  2006-12-12 18:13:32 -0500
committer  Dave Jones <davej@redhat.com>  2006-12-12 18:13:32 -0500
commit     f0eef25339f92f7cd4aeea23d9ae97987a5a1e82 (patch)
tree       2472e94d39f43a9580a6d2d5d92de0b749023263 /mm/page_alloc.c
parent     0cfea5dd98205f2fa318836da664a7d7df1afbc1 (diff)
parent     e1036502e5263851259d147771226161e5ccc85a (diff)
Merge ../linus
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  1156
1 file changed, 1030 insertions, 126 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db7..e6b17b2989e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,10 @@ | |||
37 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
38 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> |
39 | #include <linux/stop_machine.h> | 39 | #include <linux/stop_machine.h> |
40 | #include <linux/sort.h> | ||
41 | #include <linux/pfn.h> | ||
42 | #include <linux/backing-dev.h> | ||
43 | #include <linux/fault-inject.h> | ||
40 | 44 | ||
41 | #include <asm/tlbflush.h> | 45 | #include <asm/tlbflush.h> |
42 | #include <asm/div64.h> | 46 | #include <asm/div64.h> |
@@ -80,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | |||
80 | 84 | ||
81 | EXPORT_SYMBOL(totalram_pages); | 85 | EXPORT_SYMBOL(totalram_pages); |
82 | 86 | ||
83 | /* | 87 | static char * const zone_names[MAX_NR_ZONES] = { |
84 | * Used by page_zone() to look up the address of the struct zone whose | ||
85 | * id is encoded in the upper bits of page->flags | ||
86 | */ | ||
87 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | ||
88 | EXPORT_SYMBOL(zone_table); | ||
89 | |||
90 | static char *zone_names[MAX_NR_ZONES] = { | ||
91 | "DMA", | 88 | "DMA", |
92 | #ifdef CONFIG_ZONE_DMA32 | 89 | #ifdef CONFIG_ZONE_DMA32 |
93 | "DMA32", | 90 | "DMA32", |
@@ -102,6 +99,38 @@ int min_free_kbytes = 1024; | |||
102 | 99 | ||
103 | unsigned long __meminitdata nr_kernel_pages; | 100 | unsigned long __meminitdata nr_kernel_pages; |
104 | unsigned long __meminitdata nr_all_pages; | 101 | unsigned long __meminitdata nr_all_pages; |
102 | static unsigned long __initdata dma_reserve; | ||
103 | |||
104 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
105 | /* | ||
106 | * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct | ||
107 | * ranges of memory (RAM) that may be registered with add_active_range(). | ||
108 | * Ranges passed to add_active_range() will be merged if possible | ||
109 | * so the number of times add_active_range() can be called is | ||
110 | * related to the number of nodes and the number of holes | ||
111 | */ | ||
112 | #ifdef CONFIG_MAX_ACTIVE_REGIONS | ||
113 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ | ||
114 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS | ||
115 | #else | ||
116 | #if MAX_NUMNODES >= 32 | ||
117 | /* If there can be many nodes, allow up to 50 holes per node */ | ||
118 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) | ||
119 | #else | ||
120 | /* By default, allow up to 256 distinct regions */ | ||
121 | #define MAX_ACTIVE_REGIONS 256 | ||
122 | #endif | ||
123 | #endif | ||
124 | |||
125 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | ||
126 | int __initdata nr_nodemap_entries; | ||
127 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | ||
128 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | ||
129 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
130 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
131 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
132 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
133 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
105 | 134 | ||
106 | #ifdef CONFIG_DEBUG_VM | 135 | #ifdef CONFIG_DEBUG_VM |
107 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 136 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
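
A rough, userspace-only sketch of how the MAX_ACTIVE_REGIONS cascade added in the hunk above resolves for one configuration. The MAX_NUMNODES value and the commented-out CONFIG_MAX_ACTIVE_REGIONS override are stand-ins chosen for illustration, not values from this patch.

/* Toy mirror of the preprocessor cascade; build with any C compiler. */
#include <stdio.h>

#define MAX_NUMNODES 64				/* pretend: a large NUMA build */
/* #define CONFIG_MAX_ACTIVE_REGIONS 128 */	/* uncomment to emulate an arch override */

#ifdef CONFIG_MAX_ACTIVE_REGIONS
#define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
#else
#if MAX_NUMNODES >= 32
#define MAX_ACTIVE_REGIONS (MAX_NUMNODES * 50)	/* up to ~50 holes per node */
#else
#define MAX_ACTIVE_REGIONS 256
#endif
#endif

int main(void)
{
	printf("early_node_map[] would hold %d entries\n", MAX_ACTIVE_REGIONS);
	return 0;
}
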
@@ -202,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
202 | int i; | 231 | int i; |
203 | int nr_pages = 1 << order; | 232 | int nr_pages = 1 << order; |
204 | 233 | ||
205 | page[1].lru.next = (void *)free_compound_page; /* set dtor */ | 234 | set_compound_page_dtor(page, free_compound_page); |
206 | page[1].lru.prev = (void *)order; | 235 | page[1].lru.prev = (void *)order; |
207 | for (i = 0; i < nr_pages; i++) { | 236 | for (i = 0; i < nr_pages; i++) { |
208 | struct page *p = page + i; | 237 | struct page *p = page + i; |
@@ -451,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order) | |||
451 | spin_lock(&zone->lock); | 480 | spin_lock(&zone->lock); |
452 | zone->all_unreclaimable = 0; | 481 | zone->all_unreclaimable = 0; |
453 | zone->pages_scanned = 0; | 482 | zone->pages_scanned = 0; |
454 | __free_one_page(page, zone ,order); | 483 | __free_one_page(page, zone, order); |
455 | spin_unlock(&zone->lock); | 484 | spin_unlock(&zone->lock); |
456 | } | 485 | } |
457 | 486 | ||
@@ -461,17 +490,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
461 | int i; | 490 | int i; |
462 | int reserved = 0; | 491 | int reserved = 0; |
463 | 492 | ||
464 | arch_free_page(page, order); | ||
465 | if (!PageHighMem(page)) | ||
466 | debug_check_no_locks_freed(page_address(page), | ||
467 | PAGE_SIZE<<order); | ||
468 | |||
469 | for (i = 0 ; i < (1 << order) ; ++i) | 493 | for (i = 0 ; i < (1 << order) ; ++i) |
470 | reserved += free_pages_check(page + i); | 494 | reserved += free_pages_check(page + i); |
471 | if (reserved) | 495 | if (reserved) |
472 | return; | 496 | return; |
473 | 497 | ||
498 | if (!PageHighMem(page)) | ||
499 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | ||
500 | arch_free_page(page, order); | ||
474 | kernel_map_pages(page, 1 << order, 0); | 501 | kernel_map_pages(page, 1 << order, 0); |
502 | |||
475 | local_irq_save(flags); | 503 | local_irq_save(flags); |
476 | __count_vm_events(PGFREE, 1 << order); | 504 | __count_vm_events(PGFREE, 1 << order); |
477 | free_one_page(page_zone(page), page, order); | 505 | free_one_page(page_zone(page), page, order); |
@@ -571,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
571 | 1 << PG_checked | 1 << PG_mappedtodisk); | 599 | 1 << PG_checked | 1 << PG_mappedtodisk); |
572 | set_page_private(page, 0); | 600 | set_page_private(page, 0); |
573 | set_page_refcounted(page); | 601 | set_page_refcounted(page); |
602 | |||
603 | arch_alloc_page(page, order); | ||
574 | kernel_map_pages(page, 1 << order, 1); | 604 | kernel_map_pages(page, 1 << order, 1); |
575 | 605 | ||
576 | if (gfp_flags & __GFP_ZERO) | 606 | if (gfp_flags & __GFP_ZERO) |
@@ -656,9 +686,15 @@ void drain_node_pages(int nodeid) | |||
656 | 686 | ||
657 | pcp = &pset->pcp[i]; | 687 | pcp = &pset->pcp[i]; |
658 | if (pcp->count) { | 688 | if (pcp->count) { |
689 | int to_drain; | ||
690 | |||
659 | local_irq_save(flags); | 691 | local_irq_save(flags); |
660 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 692 | if (pcp->count >= pcp->batch) |
661 | pcp->count = 0; | 693 | to_drain = pcp->batch; |
694 | else | ||
695 | to_drain = pcp->count; | ||
696 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | ||
697 | pcp->count -= to_drain; | ||
662 | local_irq_restore(flags); | 698 | local_irq_restore(flags); |
663 | } | 699 | } |
664 | } | 700 | } |
@@ -666,7 +702,6 @@ void drain_node_pages(int nodeid) | |||
666 | } | 702 | } |
667 | #endif | 703 | #endif |
668 | 704 | ||
669 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | ||
670 | static void __drain_pages(unsigned int cpu) | 705 | static void __drain_pages(unsigned int cpu) |
671 | { | 706 | { |
672 | unsigned long flags; | 707 | unsigned long flags; |
@@ -688,7 +723,6 @@ static void __drain_pages(unsigned int cpu) | |||
688 | } | 723 | } |
689 | } | 724 | } |
690 | } | 725 | } |
691 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ | ||
692 | 726 | ||
693 | #ifdef CONFIG_PM | 727 | #ifdef CONFIG_PM |
694 | 728 | ||
@@ -747,13 +781,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
747 | struct per_cpu_pages *pcp; | 781 | struct per_cpu_pages *pcp; |
748 | unsigned long flags; | 782 | unsigned long flags; |
749 | 783 | ||
750 | arch_free_page(page, 0); | ||
751 | |||
752 | if (PageAnon(page)) | 784 | if (PageAnon(page)) |
753 | page->mapping = NULL; | 785 | page->mapping = NULL; |
754 | if (free_pages_check(page)) | 786 | if (free_pages_check(page)) |
755 | return; | 787 | return; |
756 | 788 | ||
789 | if (!PageHighMem(page)) | ||
790 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); | ||
791 | arch_free_page(page, 0); | ||
757 | kernel_map_pages(page, 1, 0); | 792 | kernel_map_pages(page, 1, 0); |
758 | 793 | ||
759 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 794 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
@@ -818,7 +853,7 @@ again: | |||
818 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; | 853 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
819 | local_irq_save(flags); | 854 | local_irq_save(flags); |
820 | if (!pcp->count) { | 855 | if (!pcp->count) { |
821 | pcp->count += rmqueue_bulk(zone, 0, | 856 | pcp->count = rmqueue_bulk(zone, 0, |
822 | pcp->batch, &pcp->list); | 857 | pcp->batch, &pcp->list); |
823 | if (unlikely(!pcp->count)) | 858 | if (unlikely(!pcp->count)) |
824 | goto failed; | 859 | goto failed; |
@@ -858,6 +893,91 @@ failed: | |||
858 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 893 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
859 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 894 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
860 | 895 | ||
896 | #ifdef CONFIG_FAIL_PAGE_ALLOC | ||
897 | |||
898 | static struct fail_page_alloc_attr { | ||
899 | struct fault_attr attr; | ||
900 | |||
901 | u32 ignore_gfp_highmem; | ||
902 | u32 ignore_gfp_wait; | ||
903 | |||
904 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
905 | |||
906 | struct dentry *ignore_gfp_highmem_file; | ||
907 | struct dentry *ignore_gfp_wait_file; | ||
908 | |||
909 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
910 | |||
911 | } fail_page_alloc = { | ||
912 | .attr = FAULT_ATTR_INITIALIZER, | ||
913 | .ignore_gfp_wait = 1, | ||
914 | .ignore_gfp_highmem = 1, | ||
915 | }; | ||
916 | |||
917 | static int __init setup_fail_page_alloc(char *str) | ||
918 | { | ||
919 | return setup_fault_attr(&fail_page_alloc.attr, str); | ||
920 | } | ||
921 | __setup("fail_page_alloc=", setup_fail_page_alloc); | ||
922 | |||
923 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
924 | { | ||
925 | if (gfp_mask & __GFP_NOFAIL) | ||
926 | return 0; | ||
927 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | ||
928 | return 0; | ||
929 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | ||
930 | return 0; | ||
931 | |||
932 | return should_fail(&fail_page_alloc.attr, 1 << order); | ||
933 | } | ||
934 | |||
935 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
936 | |||
937 | static int __init fail_page_alloc_debugfs(void) | ||
938 | { | ||
939 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
940 | struct dentry *dir; | ||
941 | int err; | ||
942 | |||
943 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
944 | "fail_page_alloc"); | ||
945 | if (err) | ||
946 | return err; | ||
947 | dir = fail_page_alloc.attr.dentries.dir; | ||
948 | |||
949 | fail_page_alloc.ignore_gfp_wait_file = | ||
950 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
951 | &fail_page_alloc.ignore_gfp_wait); | ||
952 | |||
953 | fail_page_alloc.ignore_gfp_highmem_file = | ||
954 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
955 | &fail_page_alloc.ignore_gfp_highmem); | ||
956 | |||
957 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
958 | !fail_page_alloc.ignore_gfp_highmem_file) { | ||
959 | err = -ENOMEM; | ||
960 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
961 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
962 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
963 | } | ||
964 | |||
965 | return err; | ||
966 | } | ||
967 | |||
968 | late_initcall(fail_page_alloc_debugfs); | ||
969 | |||
970 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
971 | |||
972 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | ||
973 | |||
974 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
975 | { | ||
976 | return 0; | ||
977 | } | ||
978 | |||
979 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | ||
980 | |||
861 | /* | 981 | /* |
862 | * Return 1 if free pages are above 'mark'. This takes into account the order | 982 | * Return 1 if free pages are above 'mark'. This takes into account the order |
863 | * of the allocation. | 983 | * of the allocation. |
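
The CONFIG_FAIL_PAGE_ALLOC block added above plugs the page allocator into the generic fault-injection framework. Below is a much-simplified userspace mock of the kind of decision should_fail() makes; the struct and field names are invented for the sketch and are not the kernel's struct fault_attr.

#include <stdio.h>
#include <stdlib.h>

struct fake_fault_attr {
	unsigned int probability;	/* percent of eligible calls to fail */
	unsigned int interval;		/* only every Nth call is eligible */
	int times;			/* remaining failures, -1 = unlimited */
	unsigned long count;		/* calls seen so far */
};

static int fake_should_fail(struct fake_fault_attr *attr)
{
	attr->count++;
	if (attr->times == 0)
		return 0;
	if (attr->interval > 1 && attr->count % attr->interval)
		return 0;
	if ((unsigned int)(rand() % 100) >= attr->probability)
		return 0;
	if (attr->times > 0)
		attr->times--;
	return 1;
}

int main(void)
{
	struct fake_fault_attr attr = { .probability = 10, .interval = 1,
					.times = 5, .count = 0 };
	int i, failures = 0;

	for (i = 0; i < 1000; i++)
		failures += fake_should_fail(&attr);
	printf("injected %d failures in 1000 calls\n", failures);
	return 0;
}
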
@@ -866,7 +986,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
866 | int classzone_idx, int alloc_flags) | 986 | int classzone_idx, int alloc_flags) |
867 | { | 987 | { |
868 | /* free_pages my go negative - that's OK */ | 988 | /* free_pages my go negative - that's OK */ |
869 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; | 989 | unsigned long min = mark; |
990 | long free_pages = z->free_pages - (1 << order) + 1; | ||
870 | int o; | 991 | int o; |
871 | 992 | ||
872 | if (alloc_flags & ALLOC_HIGH) | 993 | if (alloc_flags & ALLOC_HIGH) |
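
For context on the zone_watermark_ok() change above, here is a simplified userspace sketch of the order-aware watermark test the comment describes. It ignores lowmem_reserve and the ALLOC_HIGH/ALLOC_HARDER adjustments, and the numbers in main() are made up.

#include <stdio.h>

#define MAX_ORDER 11

static int watermark_ok(long free_pages, int order, long min,
			const unsigned long *free_by_order)
{
	int o;

	free_pages -= (1L << order) - 1;	/* may go negative, that's OK */
	if (free_pages <= min)
		return 0;
	for (o = 0; o < order; o++) {
		/* blocks of this order cannot satisfy the request ... */
		free_pages -= free_by_order[o] << o;
		/* ... and progressively less slack is required */
		min >>= 1;
		if (free_pages <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	unsigned long free_by_order[MAX_ORDER] = { 128, 64, 32, 16 };
	long free_pages = 128 + 64 * 2 + 32 * 4 + 16 * 8;

	printf("order-3 allocation ok: %d\n",
	       watermark_ok(free_pages, 3, 64, free_by_order));
	return 0;
}
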
@@ -889,31 +1010,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
889 | return 1; | 1010 | return 1; |
890 | } | 1011 | } |
891 | 1012 | ||
1013 | #ifdef CONFIG_NUMA | ||
892 | /* | 1014 | /* |
893 | * get_page_from_freeliest goes through the zonelist trying to allocate | 1015 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to |
1016 | * skip over zones that are not allowed by the cpuset, or that have | ||
1017 | * been recently (in last second) found to be nearly full. See further | ||
1018 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | ||
1019 | * that have to skip over alot of full or unallowed zones. | ||
1020 | * | ||
1021 | * If the zonelist cache is present in the passed in zonelist, then | ||
1022 | * returns a pointer to the allowed node mask (either the current | ||
1023 | * tasks mems_allowed, or node_online_map.) | ||
1024 | * | ||
1025 | * If the zonelist cache is not available for this zonelist, does | ||
1026 | * nothing and returns NULL. | ||
1027 | * | ||
1028 | * If the fullzones BITMAP in the zonelist cache is stale (more than | ||
1029 | * a second since last zap'd) then we zap it out (clear its bits.) | ||
1030 | * | ||
1031 | * We hold off even calling zlc_setup, until after we've checked the | ||
1032 | * first zone in the zonelist, on the theory that most allocations will | ||
1033 | * be satisfied from that first zone, so best to examine that zone as | ||
1034 | * quickly as we can. | ||
1035 | */ | ||
1036 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1037 | { | ||
1038 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1039 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | ||
1040 | |||
1041 | zlc = zonelist->zlcache_ptr; | ||
1042 | if (!zlc) | ||
1043 | return NULL; | ||
1044 | |||
1045 | if (jiffies - zlc->last_full_zap > 1 * HZ) { | ||
1046 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1047 | zlc->last_full_zap = jiffies; | ||
1048 | } | ||
1049 | |||
1050 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | ||
1051 | &cpuset_current_mems_allowed : | ||
1052 | &node_online_map; | ||
1053 | return allowednodes; | ||
1054 | } | ||
1055 | |||
1056 | /* | ||
1057 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | ||
1058 | * if it is worth looking at further for free memory: | ||
1059 | * 1) Check that the zone isn't thought to be full (doesn't have its | ||
1060 | * bit set in the zonelist_cache fullzones BITMAP). | ||
1061 | * 2) Check that the zones node (obtained from the zonelist_cache | ||
1062 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | ||
1063 | * Return true (non-zero) if zone is worth looking at further, or | ||
1064 | * else return false (zero) if it is not. | ||
1065 | * | ||
1066 | * This check -ignores- the distinction between various watermarks, | ||
1067 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | ||
1068 | * found to be full for any variation of these watermarks, it will | ||
1069 | * be considered full for up to one second by all requests, unless | ||
1070 | * we are so low on memory on all allowed nodes that we are forced | ||
1071 | * into the second scan of the zonelist. | ||
1072 | * | ||
1073 | * In the second scan we ignore this zonelist cache and exactly | ||
1074 | * apply the watermarks to all zones, even it is slower to do so. | ||
1075 | * We are low on memory in the second scan, and should leave no stone | ||
1076 | * unturned looking for a free page. | ||
1077 | */ | ||
1078 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1079 | nodemask_t *allowednodes) | ||
1080 | { | ||
1081 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1082 | int i; /* index of *z in zonelist zones */ | ||
1083 | int n; /* node that zone *z is on */ | ||
1084 | |||
1085 | zlc = zonelist->zlcache_ptr; | ||
1086 | if (!zlc) | ||
1087 | return 1; | ||
1088 | |||
1089 | i = z - zonelist->zones; | ||
1090 | n = zlc->z_to_n[i]; | ||
1091 | |||
1092 | /* This zone is worth trying if it is allowed but not full */ | ||
1093 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | ||
1094 | } | ||
1095 | |||
1096 | /* | ||
1097 | * Given 'z' scanning a zonelist, set the corresponding bit in | ||
1098 | * zlc->fullzones, so that subsequent attempts to allocate a page | ||
1099 | * from that zone don't waste time re-examining it. | ||
1100 | */ | ||
1101 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1102 | { | ||
1103 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1104 | int i; /* index of *z in zonelist zones */ | ||
1105 | |||
1106 | zlc = zonelist->zlcache_ptr; | ||
1107 | if (!zlc) | ||
1108 | return; | ||
1109 | |||
1110 | i = z - zonelist->zones; | ||
1111 | |||
1112 | set_bit(i, zlc->fullzones); | ||
1113 | } | ||
1114 | |||
1115 | #else /* CONFIG_NUMA */ | ||
1116 | |||
1117 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1118 | { | ||
1119 | return NULL; | ||
1120 | } | ||
1121 | |||
1122 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1123 | nodemask_t *allowednodes) | ||
1124 | { | ||
1125 | return 1; | ||
1126 | } | ||
1127 | |||
1128 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1129 | { | ||
1130 | } | ||
1131 | #endif /* CONFIG_NUMA */ | ||
1132 | |||
1133 | /* | ||
1134 | * get_page_from_freelist goes through the zonelist trying to allocate | ||
894 | * a page. | 1135 | * a page. |
895 | */ | 1136 | */ |
896 | static struct page * | 1137 | static struct page * |
897 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1138 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, |
898 | struct zonelist *zonelist, int alloc_flags) | 1139 | struct zonelist *zonelist, int alloc_flags) |
899 | { | 1140 | { |
900 | struct zone **z = zonelist->zones; | 1141 | struct zone **z; |
901 | struct page *page = NULL; | 1142 | struct page *page = NULL; |
902 | int classzone_idx = zone_idx(*z); | 1143 | int classzone_idx = zone_idx(zonelist->zones[0]); |
903 | struct zone *zone; | 1144 | struct zone *zone; |
1145 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | ||
1146 | int zlc_active = 0; /* set if using zonelist_cache */ | ||
1147 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | ||
904 | 1148 | ||
1149 | zonelist_scan: | ||
905 | /* | 1150 | /* |
906 | * Go through the zonelist once, looking for a zone with enough free. | 1151 | * Scan zonelist, looking for a zone with enough free. |
907 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1152 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
908 | */ | 1153 | */ |
1154 | z = zonelist->zones; | ||
1155 | |||
909 | do { | 1156 | do { |
1157 | if (NUMA_BUILD && zlc_active && | ||
1158 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1159 | continue; | ||
910 | zone = *z; | 1160 | zone = *z; |
911 | if (unlikely((gfp_mask & __GFP_THISNODE) && | 1161 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && |
912 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 1162 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
913 | break; | 1163 | break; |
914 | if ((alloc_flags & ALLOC_CPUSET) && | 1164 | if ((alloc_flags & ALLOC_CPUSET) && |
915 | !cpuset_zone_allowed(zone, gfp_mask)) | 1165 | !cpuset_zone_allowed(zone, gfp_mask)) |
916 | continue; | 1166 | goto try_next_zone; |
917 | 1167 | ||
918 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1168 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
919 | unsigned long mark; | 1169 | unsigned long mark; |
@@ -923,18 +1173,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
923 | mark = zone->pages_low; | 1173 | mark = zone->pages_low; |
924 | else | 1174 | else |
925 | mark = zone->pages_high; | 1175 | mark = zone->pages_high; |
926 | if (!zone_watermark_ok(zone , order, mark, | 1176 | if (!zone_watermark_ok(zone, order, mark, |
927 | classzone_idx, alloc_flags)) | 1177 | classzone_idx, alloc_flags)) { |
928 | if (!zone_reclaim_mode || | 1178 | if (!zone_reclaim_mode || |
929 | !zone_reclaim(zone, gfp_mask, order)) | 1179 | !zone_reclaim(zone, gfp_mask, order)) |
930 | continue; | 1180 | goto this_zone_full; |
1181 | } | ||
931 | } | 1182 | } |
932 | 1183 | ||
933 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 1184 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); |
934 | if (page) { | 1185 | if (page) |
935 | break; | 1186 | break; |
1187 | this_zone_full: | ||
1188 | if (NUMA_BUILD) | ||
1189 | zlc_mark_zone_full(zonelist, z); | ||
1190 | try_next_zone: | ||
1191 | if (NUMA_BUILD && !did_zlc_setup) { | ||
1192 | /* we do zlc_setup after the first zone is tried */ | ||
1193 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1194 | zlc_active = 1; | ||
1195 | did_zlc_setup = 1; | ||
936 | } | 1196 | } |
937 | } while (*(++z) != NULL); | 1197 | } while (*(++z) != NULL); |
1198 | |||
1199 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | ||
1200 | /* Disable zlc cache for second zonelist scan */ | ||
1201 | zlc_active = 0; | ||
1202 | goto zonelist_scan; | ||
1203 | } | ||
938 | return page; | 1204 | return page; |
939 | } | 1205 | } |
940 | 1206 | ||
@@ -956,6 +1222,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
956 | 1222 | ||
957 | might_sleep_if(wait); | 1223 | might_sleep_if(wait); |
958 | 1224 | ||
1225 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1226 | return NULL; | ||
1227 | |||
959 | restart: | 1228 | restart: |
960 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1229 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
961 | 1230 | ||
@@ -969,9 +1238,19 @@ restart: | |||
969 | if (page) | 1238 | if (page) |
970 | goto got_pg; | 1239 | goto got_pg; |
971 | 1240 | ||
972 | do { | 1241 | /* |
1242 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | ||
1243 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | ||
1244 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | ||
1245 | * using a larger set of nodes after it has established that the | ||
1246 | * allowed per node queues are empty and that nodes are | ||
1247 | * over allocated. | ||
1248 | */ | ||
1249 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
1250 | goto nopage; | ||
1251 | |||
1252 | for (z = zonelist->zones; *z; z++) | ||
973 | wakeup_kswapd(*z, order); | 1253 | wakeup_kswapd(*z, order); |
974 | } while (*(++z)); | ||
975 | 1254 | ||
976 | /* | 1255 | /* |
977 | * OK, we're below the kswapd watermark and have kicked background | 1256 | * OK, we're below the kswapd watermark and have kicked background |
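
The GFP_THISNODE early-exit added above keys off all three bits of the composite mask being set, as the comment spells out. A minimal standalone illustration of that mask test, with invented flag values, is:

#include <stdio.h>

#define FAKE__GFP_THISNODE 0x1u
#define FAKE__GFP_NORETRY  0x2u
#define FAKE__GFP_NOWARN   0x4u
#define FAKE_GFP_THISNODE  (FAKE__GFP_THISNODE | FAKE__GFP_NORETRY | FAKE__GFP_NOWARN)

static int is_gfp_thisnode(unsigned int gfp_mask)
{
	/* true only when every bit of the composite mask is set */
	return (gfp_mask & FAKE_GFP_THISNODE) == FAKE_GFP_THISNODE;
}

int main(void)
{
	printf("%d %d\n", is_gfp_thisnode(FAKE__GFP_THISNODE),
	       is_gfp_thisnode(FAKE_GFP_THISNODE));	/* prints 0 1 */
	return 0;
}
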
@@ -1005,6 +1284,7 @@ restart: | |||
1005 | 1284 | ||
1006 | /* This allocation should allow future memory freeing. */ | 1285 | /* This allocation should allow future memory freeing. */ |
1007 | 1286 | ||
1287 | rebalance: | ||
1008 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1288 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
1009 | && !in_interrupt()) { | 1289 | && !in_interrupt()) { |
1010 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1290 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
@@ -1015,7 +1295,7 @@ nofail_alloc: | |||
1015 | if (page) | 1295 | if (page) |
1016 | goto got_pg; | 1296 | goto got_pg; |
1017 | if (gfp_mask & __GFP_NOFAIL) { | 1297 | if (gfp_mask & __GFP_NOFAIL) { |
1018 | blk_congestion_wait(WRITE, HZ/50); | 1298 | congestion_wait(WRITE, HZ/50); |
1019 | goto nofail_alloc; | 1299 | goto nofail_alloc; |
1020 | } | 1300 | } |
1021 | } | 1301 | } |
@@ -1026,7 +1306,6 @@ nofail_alloc: | |||
1026 | if (!wait) | 1306 | if (!wait) |
1027 | goto nopage; | 1307 | goto nopage; |
1028 | 1308 | ||
1029 | rebalance: | ||
1030 | cond_resched(); | 1309 | cond_resched(); |
1031 | 1310 | ||
1032 | /* We now go into synchronous reclaim */ | 1311 | /* We now go into synchronous reclaim */ |
@@ -1078,7 +1357,7 @@ rebalance: | |||
1078 | do_retry = 1; | 1357 | do_retry = 1; |
1079 | } | 1358 | } |
1080 | if (do_retry) { | 1359 | if (do_retry) { |
1081 | blk_congestion_wait(WRITE, HZ/50); | 1360 | congestion_wait(WRITE, HZ/50); |
1082 | goto rebalance; | 1361 | goto rebalance; |
1083 | } | 1362 | } |
1084 | 1363 | ||
@@ -1222,14 +1501,12 @@ unsigned int nr_free_pagecache_pages(void) | |||
1222 | { | 1501 | { |
1223 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 1502 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); |
1224 | } | 1503 | } |
1225 | #ifdef CONFIG_NUMA | 1504 | |
1226 | static void show_node(struct zone *zone) | 1505 | static inline void show_node(struct zone *zone) |
1227 | { | 1506 | { |
1228 | printk("Node %ld ", zone_to_nid(zone)); | 1507 | if (NUMA_BUILD) |
1508 | printk("Node %d ", zone_to_nid(zone)); | ||
1229 | } | 1509 | } |
1230 | #else | ||
1231 | #define show_node(zone) do { } while (0) | ||
1232 | #endif | ||
1233 | 1510 | ||
1234 | void si_meminfo(struct sysinfo *val) | 1511 | void si_meminfo(struct sysinfo *val) |
1235 | { | 1512 | { |
@@ -1271,34 +1548,30 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
1271 | */ | 1548 | */ |
1272 | void show_free_areas(void) | 1549 | void show_free_areas(void) |
1273 | { | 1550 | { |
1274 | int cpu, temperature; | 1551 | int cpu; |
1275 | unsigned long active; | 1552 | unsigned long active; |
1276 | unsigned long inactive; | 1553 | unsigned long inactive; |
1277 | unsigned long free; | 1554 | unsigned long free; |
1278 | struct zone *zone; | 1555 | struct zone *zone; |
1279 | 1556 | ||
1280 | for_each_zone(zone) { | 1557 | for_each_zone(zone) { |
1281 | show_node(zone); | 1558 | if (!populated_zone(zone)) |
1282 | printk("%s per-cpu:", zone->name); | ||
1283 | |||
1284 | if (!populated_zone(zone)) { | ||
1285 | printk(" empty\n"); | ||
1286 | continue; | 1559 | continue; |
1287 | } else | 1560 | |
1288 | printk("\n"); | 1561 | show_node(zone); |
1562 | printk("%s per-cpu:\n", zone->name); | ||
1289 | 1563 | ||
1290 | for_each_online_cpu(cpu) { | 1564 | for_each_online_cpu(cpu) { |
1291 | struct per_cpu_pageset *pageset; | 1565 | struct per_cpu_pageset *pageset; |
1292 | 1566 | ||
1293 | pageset = zone_pcp(zone, cpu); | 1567 | pageset = zone_pcp(zone, cpu); |
1294 | 1568 | ||
1295 | for (temperature = 0; temperature < 2; temperature++) | 1569 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " |
1296 | printk("cpu %d %s: high %d, batch %d used:%d\n", | 1570 | "Cold: hi:%5d, btch:%4d usd:%4d\n", |
1297 | cpu, | 1571 | cpu, pageset->pcp[0].high, |
1298 | temperature ? "cold" : "hot", | 1572 | pageset->pcp[0].batch, pageset->pcp[0].count, |
1299 | pageset->pcp[temperature].high, | 1573 | pageset->pcp[1].high, pageset->pcp[1].batch, |
1300 | pageset->pcp[temperature].batch, | 1574 | pageset->pcp[1].count); |
1301 | pageset->pcp[temperature].count); | ||
1302 | } | 1575 | } |
1303 | } | 1576 | } |
1304 | 1577 | ||
@@ -1320,6 +1593,9 @@ void show_free_areas(void) | |||
1320 | for_each_zone(zone) { | 1593 | for_each_zone(zone) { |
1321 | int i; | 1594 | int i; |
1322 | 1595 | ||
1596 | if (!populated_zone(zone)) | ||
1597 | continue; | ||
1598 | |||
1323 | show_node(zone); | 1599 | show_node(zone); |
1324 | printk("%s" | 1600 | printk("%s" |
1325 | " free:%lukB" | 1601 | " free:%lukB" |
@@ -1352,12 +1628,11 @@ void show_free_areas(void) | |||
1352 | for_each_zone(zone) { | 1628 | for_each_zone(zone) { |
1353 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 1629 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1354 | 1630 | ||
1631 | if (!populated_zone(zone)) | ||
1632 | continue; | ||
1633 | |||
1355 | show_node(zone); | 1634 | show_node(zone); |
1356 | printk("%s: ", zone->name); | 1635 | printk("%s: ", zone->name); |
1357 | if (!populated_zone(zone)) { | ||
1358 | printk("empty\n"); | ||
1359 | continue; | ||
1360 | } | ||
1361 | 1636 | ||
1362 | spin_lock_irqsave(&zone->lock, flags); | 1637 | spin_lock_irqsave(&zone->lock, flags); |
1363 | for (order = 0; order < MAX_ORDER; order++) { | 1638 | for (order = 0; order < MAX_ORDER; order++) { |
@@ -1510,6 +1785,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1510 | } | 1785 | } |
1511 | } | 1786 | } |
1512 | 1787 | ||
1788 | /* Construct the zonelist performance cache - see further mmzone.h */ | ||
1789 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1790 | { | ||
1791 | int i; | ||
1792 | |||
1793 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1794 | struct zonelist *zonelist; | ||
1795 | struct zonelist_cache *zlc; | ||
1796 | struct zone **z; | ||
1797 | |||
1798 | zonelist = pgdat->node_zonelists + i; | ||
1799 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | ||
1800 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1801 | for (z = zonelist->zones; *z; z++) | ||
1802 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | ||
1803 | } | ||
1804 | } | ||
1805 | |||
1513 | #else /* CONFIG_NUMA */ | 1806 | #else /* CONFIG_NUMA */ |
1514 | 1807 | ||
1515 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1808 | static void __meminit build_zonelists(pg_data_t *pgdat) |
@@ -1547,21 +1840,33 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1547 | } | 1840 | } |
1548 | } | 1841 | } |
1549 | 1842 | ||
1843 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | ||
1844 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1845 | { | ||
1846 | int i; | ||
1847 | |||
1848 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1849 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | ||
1850 | } | ||
1851 | |||
1550 | #endif /* CONFIG_NUMA */ | 1852 | #endif /* CONFIG_NUMA */ |
1551 | 1853 | ||
1552 | /* return values int ....just for stop_machine_run() */ | 1854 | /* return values int ....just for stop_machine_run() */ |
1553 | static int __meminit __build_all_zonelists(void *dummy) | 1855 | static int __meminit __build_all_zonelists(void *dummy) |
1554 | { | 1856 | { |
1555 | int nid; | 1857 | int nid; |
1556 | for_each_online_node(nid) | 1858 | |
1859 | for_each_online_node(nid) { | ||
1557 | build_zonelists(NODE_DATA(nid)); | 1860 | build_zonelists(NODE_DATA(nid)); |
1861 | build_zonelist_cache(NODE_DATA(nid)); | ||
1862 | } | ||
1558 | return 0; | 1863 | return 0; |
1559 | } | 1864 | } |
1560 | 1865 | ||
1561 | void __meminit build_all_zonelists(void) | 1866 | void __meminit build_all_zonelists(void) |
1562 | { | 1867 | { |
1563 | if (system_state == SYSTEM_BOOTING) { | 1868 | if (system_state == SYSTEM_BOOTING) { |
1564 | __build_all_zonelists(0); | 1869 | __build_all_zonelists(NULL); |
1565 | cpuset_init_current_mems_allowed(); | 1870 | cpuset_init_current_mems_allowed(); |
1566 | } else { | 1871 | } else { |
1567 | /* we have to stop all cpus to guaranntee there is no user | 1872 | /* we have to stop all cpus to guaranntee there is no user |
@@ -1642,25 +1947,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
1642 | 1947 | ||
1643 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 1948 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
1644 | 1949 | ||
1645 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | ||
1646 | unsigned long *zones_size, unsigned long *zholes_size) | ||
1647 | { | ||
1648 | unsigned long realtotalpages, totalpages = 0; | ||
1649 | enum zone_type i; | ||
1650 | |||
1651 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1652 | totalpages += zones_size[i]; | ||
1653 | pgdat->node_spanned_pages = totalpages; | ||
1654 | |||
1655 | realtotalpages = totalpages; | ||
1656 | if (zholes_size) | ||
1657 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1658 | realtotalpages -= zholes_size[i]; | ||
1659 | pgdat->node_present_pages = realtotalpages; | ||
1660 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); | ||
1661 | } | ||
1662 | |||
1663 | |||
1664 | /* | 1950 | /* |
1665 | * Initially all pages are reserved - free ones are freed | 1951 | * Initially all pages are reserved - free ones are freed |
1666 | * up by free_all_bootmem() once the early boot process is | 1952 | * up by free_all_bootmem() once the early boot process is |
@@ -1676,6 +1962,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1676 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1962 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1677 | if (!early_pfn_valid(pfn)) | 1963 | if (!early_pfn_valid(pfn)) |
1678 | continue; | 1964 | continue; |
1965 | if (!early_pfn_in_nid(pfn, nid)) | ||
1966 | continue; | ||
1679 | page = pfn_to_page(pfn); | 1967 | page = pfn_to_page(pfn); |
1680 | set_page_links(page, zone, nid, pfn); | 1968 | set_page_links(page, zone, nid, pfn); |
1681 | init_page_count(page); | 1969 | init_page_count(page); |
@@ -1700,20 +1988,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1700 | } | 1988 | } |
1701 | } | 1989 | } |
1702 | 1990 | ||
1703 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | ||
1704 | void zonetable_add(struct zone *zone, int nid, enum zone_type zid, | ||
1705 | unsigned long pfn, unsigned long size) | ||
1706 | { | ||
1707 | unsigned long snum = pfn_to_section_nr(pfn); | ||
1708 | unsigned long end = pfn_to_section_nr(pfn + size); | ||
1709 | |||
1710 | if (FLAGS_HAS_NODE) | ||
1711 | zone_table[ZONETABLE_INDEX(nid, zid)] = zone; | ||
1712 | else | ||
1713 | for (; snum <= end; snum++) | ||
1714 | zone_table[ZONETABLE_INDEX(snum, zid)] = zone; | ||
1715 | } | ||
1716 | |||
1717 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 1991 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1718 | #define memmap_init(size, nid, zone, start_pfn) \ | 1992 | #define memmap_init(size, nid, zone, start_pfn) \ |
1719 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 1993 | memmap_init_zone((size), (nid), (zone), (start_pfn)) |
@@ -1818,6 +2092,9 @@ static int __cpuinit process_zones(int cpu) | |||
1818 | 2092 | ||
1819 | for_each_zone(zone) { | 2093 | for_each_zone(zone) { |
1820 | 2094 | ||
2095 | if (!populated_zone(zone)) | ||
2096 | continue; | ||
2097 | |||
1821 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2098 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
1822 | GFP_KERNEL, cpu_to_node(cpu)); | 2099 | GFP_KERNEL, cpu_to_node(cpu)); |
1823 | if (!zone_pcp(zone, cpu)) | 2100 | if (!zone_pcp(zone, cpu)) |
@@ -1863,16 +2140,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
1863 | int ret = NOTIFY_OK; | 2140 | int ret = NOTIFY_OK; |
1864 | 2141 | ||
1865 | switch (action) { | 2142 | switch (action) { |
1866 | case CPU_UP_PREPARE: | 2143 | case CPU_UP_PREPARE: |
1867 | if (process_zones(cpu)) | 2144 | if (process_zones(cpu)) |
1868 | ret = NOTIFY_BAD; | 2145 | ret = NOTIFY_BAD; |
1869 | break; | 2146 | break; |
1870 | case CPU_UP_CANCELED: | 2147 | case CPU_UP_CANCELED: |
1871 | case CPU_DEAD: | 2148 | case CPU_DEAD: |
1872 | free_zone_pagesets(cpu); | 2149 | free_zone_pagesets(cpu); |
1873 | break; | 2150 | break; |
1874 | default: | 2151 | default: |
1875 | break; | 2152 | break; |
1876 | } | 2153 | } |
1877 | return ret; | 2154 | return ret; |
1878 | } | 2155 | } |
@@ -1977,6 +2254,349 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
1977 | return 0; | 2254 | return 0; |
1978 | } | 2255 | } |
1979 | 2256 | ||
2257 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2258 | /* | ||
2259 | * Basic iterator support. Return the first range of PFNs for a node | ||
2260 | * Note: nid == MAX_NUMNODES returns first region regardless of node | ||
2261 | */ | ||
2262 | static int __init first_active_region_index_in_nid(int nid) | ||
2263 | { | ||
2264 | int i; | ||
2265 | |||
2266 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2267 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | ||
2268 | return i; | ||
2269 | |||
2270 | return -1; | ||
2271 | } | ||
2272 | |||
2273 | /* | ||
2274 | * Basic iterator support. Return the next active range of PFNs for a node | ||
2275 | * Note: nid == MAX_NUMNODES returns next region regardles of node | ||
2276 | */ | ||
2277 | static int __init next_active_region_index_in_nid(int index, int nid) | ||
2278 | { | ||
2279 | for (index = index + 1; index < nr_nodemap_entries; index++) | ||
2280 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | ||
2281 | return index; | ||
2282 | |||
2283 | return -1; | ||
2284 | } | ||
2285 | |||
2286 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | ||
2287 | /* | ||
2288 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | ||
2289 | * Architectures may implement their own version but if add_active_range() | ||
2290 | * was used and there are no special requirements, this is a convenient | ||
2291 | * alternative | ||
2292 | */ | ||
2293 | int __init early_pfn_to_nid(unsigned long pfn) | ||
2294 | { | ||
2295 | int i; | ||
2296 | |||
2297 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
2298 | unsigned long start_pfn = early_node_map[i].start_pfn; | ||
2299 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
2300 | |||
2301 | if (start_pfn <= pfn && pfn < end_pfn) | ||
2302 | return early_node_map[i].nid; | ||
2303 | } | ||
2304 | |||
2305 | return 0; | ||
2306 | } | ||
2307 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | ||
2308 | |||
2309 | /* Basic iterator support to walk early_node_map[] */ | ||
2310 | #define for_each_active_range_index_in_nid(i, nid) \ | ||
2311 | for (i = first_active_region_index_in_nid(nid); i != -1; \ | ||
2312 | i = next_active_region_index_in_nid(i, nid)) | ||
2313 | |||
2314 | /** | ||
2315 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | ||
2316 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | ||
2317 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | ||
2318 | * | ||
2319 | * If an architecture guarantees that all ranges registered with | ||
2320 | * add_active_ranges() contain no holes and may be freed, this | ||
2321 | * this function may be used instead of calling free_bootmem() manually. | ||
2322 | */ | ||
2323 | void __init free_bootmem_with_active_regions(int nid, | ||
2324 | unsigned long max_low_pfn) | ||
2325 | { | ||
2326 | int i; | ||
2327 | |||
2328 | for_each_active_range_index_in_nid(i, nid) { | ||
2329 | unsigned long size_pages = 0; | ||
2330 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
2331 | |||
2332 | if (early_node_map[i].start_pfn >= max_low_pfn) | ||
2333 | continue; | ||
2334 | |||
2335 | if (end_pfn > max_low_pfn) | ||
2336 | end_pfn = max_low_pfn; | ||
2337 | |||
2338 | size_pages = end_pfn - early_node_map[i].start_pfn; | ||
2339 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), | ||
2340 | PFN_PHYS(early_node_map[i].start_pfn), | ||
2341 | size_pages << PAGE_SHIFT); | ||
2342 | } | ||
2343 | } | ||
2344 | |||
2345 | /** | ||
2346 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | ||
2347 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | ||
2348 | * | ||
2349 | * If an architecture guarantees that all ranges registered with | ||
2350 | * add_active_ranges() contain no holes and may be freed, this | ||
2351 | * function may be used instead of calling memory_present() manually. | ||
2352 | */ | ||
2353 | void __init sparse_memory_present_with_active_regions(int nid) | ||
2354 | { | ||
2355 | int i; | ||
2356 | |||
2357 | for_each_active_range_index_in_nid(i, nid) | ||
2358 | memory_present(early_node_map[i].nid, | ||
2359 | early_node_map[i].start_pfn, | ||
2360 | early_node_map[i].end_pfn); | ||
2361 | } | ||
2362 | |||
2363 | /** | ||
2364 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
2365 | * @nid: The nid of the node to push the boundary for | ||
2366 | * @start_pfn: The start pfn of the node | ||
2367 | * @end_pfn: The end pfn of the node | ||
2368 | * | ||
2369 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | ||
2370 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
2371 | * be hotplugged even though no physical memory exists. This function allows | ||
2372 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
2373 | * be used later. | ||
2374 | */ | ||
2375 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
2376 | void __init push_node_boundaries(unsigned int nid, | ||
2377 | unsigned long start_pfn, unsigned long end_pfn) | ||
2378 | { | ||
2379 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
2380 | nid, start_pfn, end_pfn); | ||
2381 | |||
2382 | /* Initialise the boundary for this node if necessary */ | ||
2383 | if (node_boundary_end_pfn[nid] == 0) | ||
2384 | node_boundary_start_pfn[nid] = -1UL; | ||
2385 | |||
2386 | /* Update the boundaries */ | ||
2387 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
2388 | node_boundary_start_pfn[nid] = start_pfn; | ||
2389 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
2390 | node_boundary_end_pfn[nid] = end_pfn; | ||
2391 | } | ||
2392 | |||
2393 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
2394 | static void __init account_node_boundary(unsigned int nid, | ||
2395 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
2396 | { | ||
2397 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
2398 | nid, *start_pfn, *end_pfn); | ||
2399 | |||
2400 | /* Return if boundary information has not been provided */ | ||
2401 | if (node_boundary_end_pfn[nid] == 0) | ||
2402 | return; | ||
2403 | |||
2404 | /* Check the boundaries and update if necessary */ | ||
2405 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
2406 | *start_pfn = node_boundary_start_pfn[nid]; | ||
2407 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
2408 | *end_pfn = node_boundary_end_pfn[nid]; | ||
2409 | } | ||
2410 | #else | ||
2411 | void __init push_node_boundaries(unsigned int nid, | ||
2412 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
2413 | |||
2414 | static void __init account_node_boundary(unsigned int nid, | ||
2415 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
2416 | #endif | ||
2417 | |||
2418 | |||
2419 | /** | ||
2420 | * get_pfn_range_for_nid - Return the start and end page frames for a node | ||
2421 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | ||
2422 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | ||
2423 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | ||
2424 | * | ||
2425 | * It returns the start and end page frame of a node based on information | ||
2426 | * provided by an arch calling add_active_range(). If called for a node | ||
2427 | * with no available memory, a warning is printed and the start and end | ||
2428 | * PFNs will be 0. | ||
2429 | */ | ||
2430 | void __init get_pfn_range_for_nid(unsigned int nid, | ||
2431 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
2432 | { | ||
2433 | int i; | ||
2434 | *start_pfn = -1UL; | ||
2435 | *end_pfn = 0; | ||
2436 | |||
2437 | for_each_active_range_index_in_nid(i, nid) { | ||
2438 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); | ||
2439 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | ||
2440 | } | ||
2441 | |||
2442 | if (*start_pfn == -1UL) { | ||
2443 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
2444 | *start_pfn = 0; | ||
2445 | } | ||
2446 | |||
2447 | /* Push the node boundaries out if requested */ | ||
2448 | account_node_boundary(nid, start_pfn, end_pfn); | ||
2449 | } | ||
2450 | |||
2451 | /* | ||
2452 | * Return the number of pages a zone spans in a node, including holes | ||
2453 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | ||
2454 | */ | ||
2455 | unsigned long __init zone_spanned_pages_in_node(int nid, | ||
2456 | unsigned long zone_type, | ||
2457 | unsigned long *ignored) | ||
2458 | { | ||
2459 | unsigned long node_start_pfn, node_end_pfn; | ||
2460 | unsigned long zone_start_pfn, zone_end_pfn; | ||
2461 | |||
2462 | /* Get the start and end of the node and zone */ | ||
2463 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
2464 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | ||
2465 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | ||
2466 | |||
2467 | /* Check that this node has pages within the zone's required range */ | ||
2468 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | ||
2469 | return 0; | ||
2470 | |||
2471 | /* Move the zone boundaries inside the node if necessary */ | ||
2472 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | ||
2473 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | ||
2474 | |||
2475 | /* Return the spanned pages */ | ||
2476 | return zone_end_pfn - zone_start_pfn; | ||
2477 | } | ||
2478 | |||
2479 | /* | ||
2480 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | ||
2481 | * then all holes in the requested range will be accounted for. | ||
2482 | */ | ||
2483 | unsigned long __init __absent_pages_in_range(int nid, | ||
2484 | unsigned long range_start_pfn, | ||
2485 | unsigned long range_end_pfn) | ||
2486 | { | ||
2487 | int i = 0; | ||
2488 | unsigned long prev_end_pfn = 0, hole_pages = 0; | ||
2489 | unsigned long start_pfn; | ||
2490 | |||
2491 | /* Find the end_pfn of the first active range of pfns in the node */ | ||
2492 | i = first_active_region_index_in_nid(nid); | ||
2493 | if (i == -1) | ||
2494 | return 0; | ||
2495 | |||
2496 | /* Account for ranges before physical memory on this node */ | ||
2497 | if (early_node_map[i].start_pfn > range_start_pfn) | ||
2498 | hole_pages = early_node_map[i].start_pfn - range_start_pfn; | ||
2499 | |||
2500 | prev_end_pfn = early_node_map[i].start_pfn; | ||
2501 | |||
2502 | /* Find all holes for the zone within the node */ | ||
2503 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { | ||
2504 | |||
2505 | /* No need to continue if prev_end_pfn is outside the zone */ | ||
2506 | if (prev_end_pfn >= range_end_pfn) | ||
2507 | break; | ||
2508 | |||
2509 | /* Make sure the end of the zone is not within the hole */ | ||
2510 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | ||
2511 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); | ||
2512 | |||
2513 | /* Update the hole size cound and move on */ | ||
2514 | if (start_pfn > range_start_pfn) { | ||
2515 | BUG_ON(prev_end_pfn > start_pfn); | ||
2516 | hole_pages += start_pfn - prev_end_pfn; | ||
2517 | } | ||
2518 | prev_end_pfn = early_node_map[i].end_pfn; | ||
2519 | } | ||
2520 | |||
2521 | /* Account for ranges past physical memory on this node */ | ||
2522 | if (range_end_pfn > prev_end_pfn) | ||
2523 | hole_pages += range_end_pfn - | ||
2524 | max(range_start_pfn, prev_end_pfn); | ||
2525 | |||
2526 | return hole_pages; | ||
2527 | } | ||
2528 | |||
2529 | /** | ||
2530 | * absent_pages_in_range - Return number of page frames in holes within a range | ||
2531 | * @start_pfn: The start PFN to start searching for holes | ||
2532 | * @end_pfn: The end PFN to stop searching for holes | ||
2533 | * | ||
2534 | * It returns the number of pages frames in memory holes within a range. | ||
2535 | */ | ||
2536 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | ||
2537 | unsigned long end_pfn) | ||
2538 | { | ||
2539 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | ||
2540 | } | ||
2541 | |||
2542 | /* Return the number of page frames in holes in a zone on a node */ | ||
2543 | unsigned long __init zone_absent_pages_in_node(int nid, | ||
2544 | unsigned long zone_type, | ||
2545 | unsigned long *ignored) | ||
2546 | { | ||
2547 | unsigned long node_start_pfn, node_end_pfn; | ||
2548 | unsigned long zone_start_pfn, zone_end_pfn; | ||
2549 | |||
2550 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
2551 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], | ||
2552 | node_start_pfn); | ||
2553 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], | ||
2554 | node_end_pfn); | ||
2555 | |||
2556 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | ||
2557 | } | ||
2558 | |||
2559 | #else | ||
2560 | static inline unsigned long zone_spanned_pages_in_node(int nid, | ||
2561 | unsigned long zone_type, | ||
2562 | unsigned long *zones_size) | ||
2563 | { | ||
2564 | return zones_size[zone_type]; | ||
2565 | } | ||
2566 | |||
2567 | static inline unsigned long zone_absent_pages_in_node(int nid, | ||
2568 | unsigned long zone_type, | ||
2569 | unsigned long *zholes_size) | ||
2570 | { | ||
2571 | if (!zholes_size) | ||
2572 | return 0; | ||
2573 | |||
2574 | return zholes_size[zone_type]; | ||
2575 | } | ||
2576 | |||
2577 | #endif | ||
2578 | |||
2579 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | ||
2580 | unsigned long *zones_size, unsigned long *zholes_size) | ||
2581 | { | ||
2582 | unsigned long realtotalpages, totalpages = 0; | ||
2583 | enum zone_type i; | ||
2584 | |||
2585 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2586 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | ||
2587 | zones_size); | ||
2588 | pgdat->node_spanned_pages = totalpages; | ||
2589 | |||
2590 | realtotalpages = totalpages; | ||
2591 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2592 | realtotalpages -= | ||
2593 | zone_absent_pages_in_node(pgdat->node_id, i, | ||
2594 | zholes_size); | ||
2595 | pgdat->node_present_pages = realtotalpages; | ||
2596 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | ||
2597 | realtotalpages); | ||
2598 | } | ||
2599 | |||
1980 | /* | 2600 | /* |
1981 | * Set up the zone data structures: | 2601 | * Set up the zone data structures: |
1982 | * - mark all pages reserved | 2602 | * - mark all pages reserved |
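
A worked userspace example, with made-up PFN ranges, of the accounting done by the zone_spanned_pages_in_node()/zone_absent_pages_in_node() helpers added above: clamp the zone's PFN range to the node's, then subtract whatever no registered active range covers to get present_pages.

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

int main(void)
{
	/* two active ranges on one node, with a hole in between */
	struct range map[] = { { 0, 4096 }, { 8192, 16384 } };
	unsigned long zone_start = 0, zone_end = 16384;	/* zone's possible span */
	unsigned long spanned = zone_end - zone_start;
	unsigned long present = 0;
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		unsigned long s = map[i].start_pfn > zone_start ?
					map[i].start_pfn : zone_start;
		unsigned long e = map[i].end_pfn < zone_end ?
					map[i].end_pfn : zone_end;
		if (e > s)
			present += e - s;
	}

	printf("spanned %lu pages, absent %lu, present %lu\n",
	       spanned, spanned - present, present);
	return 0;
}
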
@@ -1998,11 +2618,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
1998 | 2618 | ||
1999 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2619 | for (j = 0; j < MAX_NR_ZONES; j++) { |
2000 | struct zone *zone = pgdat->node_zones + j; | 2620 | struct zone *zone = pgdat->node_zones + j; |
2001 | unsigned long size, realsize; | 2621 | unsigned long size, realsize, memmap_pages; |
2002 | 2622 | ||
2003 | realsize = size = zones_size[j]; | 2623 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
2004 | if (zholes_size) | 2624 | realsize = size - zone_absent_pages_in_node(nid, j, |
2005 | realsize -= zholes_size[j]; | 2625 | zholes_size); |
2626 | |||
2627 | /* | ||
2628 | * Adjust realsize so that it accounts for how much memory | ||
2629 | * is used by this zone for memmap. This affects the watermark | ||
2630 | * and per-cpu initialisations | ||
2631 | */ | ||
2632 | memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; | ||
2633 | if (realsize >= memmap_pages) { | ||
2634 | realsize -= memmap_pages; | ||
2635 | printk(KERN_DEBUG | ||
2636 | " %s zone: %lu pages used for memmap\n", | ||
2637 | zone_names[j], memmap_pages); | ||
2638 | } else | ||
2639 | printk(KERN_WARNING | ||
2640 | " %s zone: %lu pages exceeds realsize %lu\n", | ||
2641 | zone_names[j], memmap_pages, realsize); | ||
2642 | |||
2643 | /* Account for reserved DMA pages */ | ||
2644 | if (j == ZONE_DMA && realsize > dma_reserve) { | ||
2645 | realsize -= dma_reserve; | ||
2646 | printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", | ||
2647 | dma_reserve); | ||
2648 | } | ||
2006 | 2649 | ||
2007 | if (!is_highmem_idx(j)) | 2650 | if (!is_highmem_idx(j)) |
2008 | nr_kernel_pages += realsize; | 2651 | nr_kernel_pages += realsize; |
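
A worked example of the memmap_pages adjustment above. The page size and sizeof(struct page) used here are assumptions for illustration only; both vary by architecture and configuration.

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;			/* 4 KiB pages */
	unsigned long sizeof_struct_page = 56;		/* assumed */
	unsigned long size = 262144;			/* zone spans 1 GiB */
	unsigned long memmap_pages =
		(size * sizeof_struct_page) >> page_shift;

	/* 3584 pages (14 MiB) of the zone go to its own mem_map */
	printf("memmap_pages = %lu\n", memmap_pages);
	return 0;
}
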
@@ -2011,6 +2654,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2011 | zone->spanned_pages = size; | 2654 | zone->spanned_pages = size; |
2012 | zone->present_pages = realsize; | 2655 | zone->present_pages = realsize; |
2013 | #ifdef CONFIG_NUMA | 2656 | #ifdef CONFIG_NUMA |
2657 | zone->node = nid; | ||
2014 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 2658 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
2015 | / 100; | 2659 | / 100; |
2016 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 2660 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; |
@@ -2022,7 +2666,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2022 | zone->zone_pgdat = pgdat; | 2666 | zone->zone_pgdat = pgdat; |
2023 | zone->free_pages = 0; | 2667 | zone->free_pages = 0; |
2024 | 2668 | ||
2025 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | 2669 | zone->prev_priority = DEF_PRIORITY; |
2026 | 2670 | ||
2027 | zone_pcp_init(zone); | 2671 | zone_pcp_init(zone); |
2028 | INIT_LIST_HEAD(&zone->active_list); | 2672 | INIT_LIST_HEAD(&zone->active_list); |
@@ -2036,7 +2680,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2036 | if (!size) | 2680 | if (!size) |
2037 | continue; | 2681 | continue; |
2038 | 2682 | ||
2039 | zonetable_add(zone, nid, j, zone_start_pfn, size); | ||
2040 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 2683 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
2041 | BUG_ON(ret); | 2684 | BUG_ON(ret); |
2042 | zone_start_pfn += size; | 2685 | zone_start_pfn += size; |
@@ -2073,8 +2716,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2073 | /* | 2716 | /* |
2074 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2717 | * With no DISCONTIG, the global mem_map is just set as node 0's |
2075 | */ | 2718 | */ |
2076 | if (pgdat == NODE_DATA(0)) | 2719 | if (pgdat == NODE_DATA(0)) { |
2077 | mem_map = NODE_DATA(0)->node_mem_map; | 2720 | mem_map = NODE_DATA(0)->node_mem_map; |
2721 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2722 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | ||
2723 | mem_map -= pgdat->node_start_pfn; | ||
2724 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
2725 | } | ||
2078 | #endif | 2726 | #endif |
2079 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2727 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2080 | } | 2728 | } |
@@ -2085,13 +2733,254 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | |||
2085 | { | 2733 | { |
2086 | pgdat->node_id = nid; | 2734 | pgdat->node_id = nid; |
2087 | pgdat->node_start_pfn = node_start_pfn; | 2735 | pgdat->node_start_pfn = node_start_pfn; |
2088 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); | 2736 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
2089 | 2737 | ||
2090 | alloc_node_mem_map(pgdat); | 2738 | alloc_node_mem_map(pgdat); |
2091 | 2739 | ||
2092 | free_area_init_core(pgdat, zones_size, zholes_size); | 2740 | free_area_init_core(pgdat, zones_size, zholes_size); |
2093 | } | 2741 | } |
2094 | 2742 | ||
2743 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2744 | /** | ||
2745 | * add_active_range - Register a range of PFNs backed by physical memory | ||
2746 | * @nid: The node ID the range resides on | ||
2747 | * @start_pfn: The start PFN of the available physical memory | ||
2748 | * @end_pfn: The end PFN of the available physical memory | ||
2749 | * | ||
2750 | * These ranges are stored in an early_node_map[] and later used by | ||
2751 | * free_area_init_nodes() to calculate zone sizes and holes. If the | ||
2752 | * range spans a memory hole, it is up to the architecture to ensure | ||
2753 | * the memory is not freed by the bootmem allocator. If possible | ||
2754 | * the range being registered will be merged with existing ranges. | ||
2755 | */ | ||
2756 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, | ||
2757 | unsigned long end_pfn) | ||
2758 | { | ||
2759 | int i; | ||
2760 | |||
2761 | printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " | ||
2762 | "%d entries of %d used\n", | ||
2763 | nid, start_pfn, end_pfn, | ||
2764 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | ||
2765 | |||
2766 | /* Merge with existing active regions if possible */ | ||
2767 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
2768 | if (early_node_map[i].nid != nid) | ||
2769 | continue; | ||
2770 | |||
2771 | /* Skip if an existing region covers this new one */ | ||
2772 | if (start_pfn >= early_node_map[i].start_pfn && | ||
2773 | end_pfn <= early_node_map[i].end_pfn) | ||
2774 | return; | ||
2775 | |||
2776 | /* Merge forward if suitable */ | ||
2777 | if (start_pfn <= early_node_map[i].end_pfn && | ||
2778 | end_pfn > early_node_map[i].end_pfn) { | ||
2779 | early_node_map[i].end_pfn = end_pfn; | ||
2780 | return; | ||
2781 | } | ||
2782 | |||
2783 | /* Merge backward if suitable */ | ||
2784 | if (start_pfn < early_node_map[i].end_pfn && | ||
2785 | end_pfn >= early_node_map[i].start_pfn) { | ||
2786 | early_node_map[i].start_pfn = start_pfn; | ||
2787 | return; | ||
2788 | } | ||
2789 | } | ||
2790 | |||
2791 | /* Check that early_node_map is large enough */ | ||
2792 | if (i >= MAX_ACTIVE_REGIONS) { | ||
2793 | printk(KERN_CRIT "More than %d memory regions, truncating\n", | ||
2794 | MAX_ACTIVE_REGIONS); | ||
2795 | return; | ||
2796 | } | ||
2797 | |||
2798 | early_node_map[i].nid = nid; | ||
2799 | early_node_map[i].start_pfn = start_pfn; | ||
2800 | early_node_map[i].end_pfn = end_pfn; | ||
2801 | nr_nodemap_entries = i + 1; | ||
2802 | } | ||
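For orientation, a minimal sketch of how an architecture's early boot code might feed its firmware memory map into add_active_range(); fw_range[], nr_fw_ranges and FW_RAM are hypothetical stand-ins for whatever table the arch actually parses (e820, device tree, SRAT), not existing kernel symbols:

    /* Hypothetical early-boot helper: walk a firmware-provided memory
     * table and register every usable RAM range for node nid.
     * Adjacent or overlapping ranges are merged by add_active_range(). */
    static void __init register_ram_with_node_map(int nid)
    {
            int i;

            for (i = 0; i < nr_fw_ranges; i++) {
                    if (fw_range[i].type != FW_RAM)
                            continue;

                    add_active_range(nid,
                                     PFN_UP(fw_range[i].start),
                                     PFN_DOWN(fw_range[i].end));
            }
    }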
2803 | |||
2804 | /** | ||
2805 | * shrink_active_range - Shrink an existing registered range of PFNs | ||
2806 | * @nid: The node ID the range to be shrunk resides on | ||
2807 | * @old_end_pfn: The old end PFN of the range | ||
2808 | * @new_end_pfn: The new end PFN of the range | ||
2809 | * | ||
2810 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. | ||
2811 | * The map is kept at the end of the physical page range that has already been | ||
2812 | * registered with add_active_range(). This function allows an arch to shrink | ||
2813 | * an existing registered range. | ||
2814 | */ | ||
2815 | void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, | ||
2816 | unsigned long new_end_pfn) | ||
2817 | { | ||
2818 | int i; | ||
2819 | |||
2820 | /* Find the old active region end and shrink */ | ||
2821 | for_each_active_range_index_in_nid(i, nid) | ||
2822 | if (early_node_map[i].end_pfn == old_end_pfn) { | ||
2823 | early_node_map[i].end_pfn = new_end_pfn; | ||
2824 | break; | ||
2825 | } | ||
2826 | } | ||
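A sketch of the i386-style use described above, assuming the arch has just carved map_pages off the tail of a node's range for a locally remapped node_mem_map; node_end_pfn[] is an illustrative arch array, not a generic kernel interface:

    /* Hypothetical: trim a node's registered range after stealing its
     * tail pages for the remapped node_mem_map. */
    static void __init trim_node_for_memmap(int nid, unsigned long map_pages)
    {
            unsigned long old_end = node_end_pfn[nid];

            shrink_active_range(nid, old_end, old_end - map_pages);
    }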
2827 | |||
2828 | /** | ||
2829 | * remove_all_active_ranges - Remove all currently registered regions | ||
2830 | * | ||
2831 | * During discovery, it may be found that a table like SRAT is invalid | ||
2832 | * and an alternative discovery method must be used. This function removes | ||
2833 | * all currently registered regions. | ||
2834 | */ | ||
2835 | void __init remove_all_active_ranges(void) | ||
2836 | { | ||
2837 | memset(early_node_map, 0, sizeof(early_node_map)); | ||
2838 | nr_nodemap_entries = 0; | ||
2839 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
2840 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
2841 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
2842 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
2843 | } | ||
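The intended fallback pattern, sketched with hypothetical helpers (srat_parse_ok() and the flat single-node registration below are assumptions for illustration, not existing functions):

    /* Hypothetical discovery path: if the ACPI SRAT turns out to be bogus,
     * discard everything registered from it and fall back to one flat
     * node covering all of RAM. */
    static void __init setup_node_ranges(unsigned long max_ram_pfn)
    {
            if (!srat_parse_ok()) {
                    remove_all_active_ranges();
                    add_active_range(0, 0, max_ram_pfn);
            }
    }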
2844 | |||
2845 | /* Compare two active node_active_regions */ | ||
2846 | static int __init cmp_node_active_region(const void *a, const void *b) | ||
2847 | { | ||
2848 | struct node_active_region *arange = (struct node_active_region *)a; | ||
2849 | struct node_active_region *brange = (struct node_active_region *)b; | ||
2850 | |||
2851 | /* Done this way to avoid overflows */ | ||
2852 | if (arange->start_pfn > brange->start_pfn) | ||
2853 | return 1; | ||
2854 | if (arange->start_pfn < brange->start_pfn) | ||
2855 | return -1; | ||
2856 | |||
2857 | return 0; | ||
2858 | } | ||
2859 | |||
2860 | /* sort the node_map by start_pfn */ | ||
2861 | static void __init sort_node_map(void) | ||
2862 | { | ||
2863 | sort(early_node_map, (size_t)nr_nodemap_entries, | ||
2864 | sizeof(struct node_active_region), | ||
2865 | cmp_node_active_region, NULL); | ||
2866 | } | ||
2867 | |||
2868 | /* Find the lowest pfn for a node. This depends on a sorted early_node_map */ | ||
2869 | unsigned long __init find_min_pfn_for_node(unsigned long nid) | ||
2870 | { | ||
2871 | int i; | ||
2872 | |||
2873 | /* Regions in the early_node_map can be in any order */ | ||
2874 | sort_node_map(); | ||
2875 | |||
2876 | /* Assuming a sorted map, the first range found has the starting pfn */ | ||
2877 | for_each_active_range_index_in_nid(i, nid) | ||
2878 | return early_node_map[i].start_pfn; | ||
2879 | |||
2880 | printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); | ||
2881 | return 0; | ||
2882 | } | ||
2883 | |||
2884 | /** | ||
2885 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | ||
2886 | * | ||
2887 | * It returns the minimum PFN based on information provided via | ||
2888 | * add_active_range(). | ||
2889 | */ | ||
2890 | unsigned long __init find_min_pfn_with_active_regions(void) | ||
2891 | { | ||
2892 | return find_min_pfn_for_node(MAX_NUMNODES); | ||
2893 | } | ||
2894 | |||
2895 | /** | ||
2896 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
2897 | * | ||
2898 | * It returns the maximum PFN based on information provided via | ||
2899 | * add_active_range(). | ||
2900 | */ | ||
2901 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
2902 | { | ||
2903 | int i; | ||
2904 | unsigned long max_pfn = 0; | ||
2905 | |||
2906 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2907 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | ||
2908 | |||
2909 | return max_pfn; | ||
2910 | } | ||
2911 | |||
2912 | /** | ||
2913 | * free_area_init_nodes - Initialise all pg_data_t and zone data | ||
2914 | * @max_zone_pfn: an array of max PFNs for each zone | ||
2915 | * | ||
2916 | * This will call free_area_init_node() for each active node in the system. | ||
2917 | * Using the page ranges provided by add_active_range(), the size of each | ||
2918 | * zone in each node, together with its holes, is calculated. If the maximum | ||
2919 | * PFNs of two adjacent zones match, the higher zone is assumed to be empty. | ||
2920 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | ||
2921 | * that ZONE_DMA32 has no pages. It is also assumed that a zone | ||
2922 | * starts where the previous one ended. For example, ZONE_DMA32 starts | ||
2923 | * at arch_max_dma_pfn. | ||
2924 | */ | ||
2925 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | ||
2926 | { | ||
2927 | unsigned long nid; | ||
2928 | enum zone_type i; | ||
2929 | |||
2930 | /* Record where the zone boundaries are */ | ||
2931 | memset(arch_zone_lowest_possible_pfn, 0, | ||
2932 | sizeof(arch_zone_lowest_possible_pfn)); | ||
2933 | memset(arch_zone_highest_possible_pfn, 0, | ||
2934 | sizeof(arch_zone_highest_possible_pfn)); | ||
2935 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | ||
2936 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | ||
2937 | for (i = 1; i < MAX_NR_ZONES; i++) { | ||
2938 | arch_zone_lowest_possible_pfn[i] = | ||
2939 | arch_zone_highest_possible_pfn[i-1]; | ||
2940 | arch_zone_highest_possible_pfn[i] = | ||
2941 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | ||
2942 | } | ||
2943 | |||
2944 | /* Print out the zone ranges */ | ||
2945 | printk("Zone PFN ranges:\n"); | ||
2946 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2947 | printk(" %-8s %8lu -> %8lu\n", | ||
2948 | zone_names[i], | ||
2949 | arch_zone_lowest_possible_pfn[i], | ||
2950 | arch_zone_highest_possible_pfn[i]); | ||
2951 | |||
2952 | /* Print out the early_node_map[] */ | ||
2953 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); | ||
2954 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2955 | printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, | ||
2956 | early_node_map[i].start_pfn, | ||
2957 | early_node_map[i].end_pfn); | ||
2958 | |||
2959 | /* Initialise every node */ | ||
2960 | for_each_online_node(nid) { | ||
2961 | pg_data_t *pgdat = NODE_DATA(nid); | ||
2962 | free_area_init_node(nid, pgdat, NULL, | ||
2963 | find_min_pfn_for_node(nid), NULL); | ||
2964 | } | ||
2965 | } | ||
2966 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
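A minimal sketch of the calling convention the comment above describes: the architecture fills in only the maximum PFN of each zone and lets free_area_init_nodes() derive per-node zone sizes and holes from the registered active ranges. MAX_DMA_PFN, MAX_DMA32_PFN and end_pfn stand in for arch-specific limits and are assumptions here:

    static void __init arch_paging_init(void)
    {
            unsigned long max_zone_pfns[MAX_NR_ZONES];

            memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
            max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;       /* arch-specific limit */
    #ifdef CONFIG_ZONE_DMA32
            max_zone_pfns[ZONE_DMA32]  = MAX_DMA32_PFN;     /* arch-specific limit */
    #endif
            max_zone_pfns[ZONE_NORMAL] = end_pfn;           /* highest RAM pfn, arch-provided */

            free_area_init_nodes(max_zone_pfns);
    }

Because a zone is assumed to start where the previous one ends, an empty zone is expressed simply by giving it the same maximum PFN as its predecessor.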
2967 | |||
2968 | /** | ||
2969 | * set_dma_reserve - set the specified number of pages reserved in the first zone | ||
2970 | * @new_dma_reserve: The number of pages to mark reserved | ||
2971 | * | ||
2972 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | ||
2973 | * In the DMA zone, a significant percentage may be consumed by kernel image | ||
2974 | * and other unfreeable allocations which can skew the watermarks badly. This | ||
2975 | * function may optionally be used to account for unfreeable pages in the | ||
2976 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and | ||
2977 | * smaller per-cpu batchsize. | ||
2978 | */ | ||
2979 | void __init set_dma_reserve(unsigned long new_dma_reserve) | ||
2980 | { | ||
2981 | dma_reserve = new_dma_reserve; | ||
2982 | } | ||
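A sketch of how an architecture might use this; dma_unfreeable_pages() is a hypothetical helper that would count the kernel image and early reservations sitting in ZONE_DMA:

    static void __init account_dma_reservations(void)
    {
            /* Pages in ZONE_DMA the kernel will never free: image text/data,
             * early bootmem reservations, etc. (hypothetical helper). */
            set_dma_reserve(dma_unfreeable_pages());
    }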
2983 | |||
2095 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 2984 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
2096 | static bootmem_data_t contig_bootmem_data; | 2985 | static bootmem_data_t contig_bootmem_data; |
2097 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 2986 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; |
@@ -2105,7 +2994,6 @@ void __init free_area_init(unsigned long *zones_size) | |||
2105 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2994 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2106 | } | 2995 | } |
2107 | 2996 | ||
2108 | #ifdef CONFIG_HOTPLUG_CPU | ||
2109 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2997 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2110 | unsigned long action, void *hcpu) | 2998 | unsigned long action, void *hcpu) |
2111 | { | 2999 | { |
@@ -2120,7 +3008,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
2120 | } | 3008 | } |
2121 | return NOTIFY_OK; | 3009 | return NOTIFY_OK; |
2122 | } | 3010 | } |
2123 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2124 | 3011 | ||
2125 | void __init page_alloc_init(void) | 3012 | void __init page_alloc_init(void) |
2126 | { | 3013 | { |
@@ -2198,10 +3085,11 @@ static void setup_per_zone_lowmem_reserve(void) | |||
2198 | calculate_totalreserve_pages(); | 3085 | calculate_totalreserve_pages(); |
2199 | } | 3086 | } |
2200 | 3087 | ||
2201 | /* | 3088 | /** |
2202 | * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures | 3089 | * setup_per_zone_pages_min - called when min_free_kbytes changes. |
2203 | * that the pages_{min,low,high} values for each zone are set correctly | 3090 | * |
2204 | * with respect to min_free_kbytes. | 3091 | * Ensures that the pages_{min,low,high} values for each zone are set correctly |
3092 | * with respect to min_free_kbytes. | ||
2205 | */ | 3093 | */ |
2206 | void setup_per_zone_pages_min(void) | 3094 | void setup_per_zone_pages_min(void) |
2207 | { | 3095 | { |
@@ -2423,7 +3311,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2423 | /* allow the kernel cmdline to have a say */ | 3311 | /* allow the kernel cmdline to have a say */ |
2424 | if (!numentries) { | 3312 | if (!numentries) { |
2425 | /* round applicable memory size up to nearest megabyte */ | 3313 | /* round applicable memory size up to nearest megabyte */ |
2426 | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | 3314 | numentries = nr_kernel_pages; |
2427 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 3315 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
2428 | numentries >>= 20 - PAGE_SHIFT; | 3316 | numentries >>= 20 - PAGE_SHIFT; |
2429 | numentries <<= 20 - PAGE_SHIFT; | 3317 | numentries <<= 20 - PAGE_SHIFT; |
@@ -2445,7 +3333,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2445 | if (numentries > max) | 3333 | if (numentries > max) |
2446 | numentries = max; | 3334 | numentries = max; |
2447 | 3335 | ||
2448 | log2qty = long_log2(numentries); | 3336 | log2qty = ilog2(numentries); |
2449 | 3337 | ||
2450 | do { | 3338 | do { |
2451 | size = bucketsize << log2qty; | 3339 | size = bucketsize << log2qty; |
@@ -2467,7 +3355,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2467 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3355 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", |
2468 | tablename, | 3356 | tablename, |
2469 | (1U << log2qty), | 3357 | (1U << log2qty), |
2470 | long_log2(size) - PAGE_SHIFT, | 3358 | ilog2(size) - PAGE_SHIFT, |
2471 | size); | 3359 | size); |
2472 | 3360 | ||
2473 | if (_hash_shift) | 3361 | if (_hash_shift) |
@@ -2490,3 +3378,19 @@ unsigned long page_to_pfn(struct page *page) | |||
2490 | EXPORT_SYMBOL(pfn_to_page); | 3378 | EXPORT_SYMBOL(pfn_to_page); |
2491 | EXPORT_SYMBOL(page_to_pfn); | 3379 | EXPORT_SYMBOL(page_to_pfn); |
2492 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 3380 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
3381 | |||
3382 | #if MAX_NUMNODES > 1 | ||
3383 | /* | ||
3384 | * Find the highest possible node id. | ||
3385 | */ | ||
3386 | int highest_possible_node_id(void) | ||
3387 | { | ||
3388 | unsigned int node; | ||
3389 | unsigned int highest = 0; | ||
3390 | |||
3391 | for_each_node_mask(node, node_possible_map) | ||
3392 | highest = node; | ||
3393 | return highest; | ||
3394 | } | ||
3395 | EXPORT_SYMBOL(highest_possible_node_id); | ||
3396 | #endif | ||