path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/memory.c      117
-rw-r--r--  mm/mempolicy.c     4
-rw-r--r--  mm/nommu.c       245
-rw-r--r--  mm/page_alloc.c  749
-rw-r--r--  mm/shmem.c         4
-rw-r--r--  mm/slab.c        126
-rw-r--r--  mm/slob.c          3
-rw-r--r--  mm/truncate.c     25
-rw-r--r--  mm/vmalloc.c      30
-rw-r--r--  mm/vmscan.c       30
-rw-r--r--  mm/vmstat.c        3
11 files changed, 1136 insertions(+), 200 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 92a3ebd8d795..601159a46ab6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2256,6 +2256,54 @@ oom:
2256} 2256}
2257 2257
2258/* 2258/*
2259 * do_no_pfn() tries to create a new page mapping for a page without
2260 * a struct_page backing it
2261 *
2262 * As this is called only for pages that do not currently exist, we
2263 * do not need to flush old virtual caches or the TLB.
2264 *
2265 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2266 * but allow concurrent faults), and pte mapped but not yet locked.
2267 * We return with mmap_sem still held, but pte unmapped and unlocked.
2268 *
2269 * It is expected that the ->nopfn handler always returns the same pfn
2270 * for a given virtual mapping.
2271 *
2272 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2273 */
2274static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2275 unsigned long address, pte_t *page_table, pmd_t *pmd,
2276 int write_access)
2277{
2278 spinlock_t *ptl;
2279 pte_t entry;
2280 unsigned long pfn;
2281 int ret = VM_FAULT_MINOR;
2282
2283 pte_unmap(page_table);
2284 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
2285 BUG_ON(is_cow_mapping(vma->vm_flags));
2286
2287 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2288 if (pfn == NOPFN_OOM)
2289 return VM_FAULT_OOM;
2290 if (pfn == NOPFN_SIGBUS)
2291 return VM_FAULT_SIGBUS;
2292
2293 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2294
2295 /* Only go through if we didn't race with anybody else... */
2296 if (pte_none(*page_table)) {
2297 entry = pfn_pte(pfn, vma->vm_page_prot);
2298 if (write_access)
2299 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2300 set_pte_at(mm, address, page_table, entry);
2301 }
2302 pte_unmap_unlock(page_table, ptl);
2303 return ret;
2304}
2305
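
To illustrate how this new fault path would be exercised, below is a minimal, hypothetical driver-side ->nopfn handler for a device that maps a contiguous physical region; mydev_phys_base and MYDEV_REGION_SIZE are illustrative names, not part of this patch. The handler returns the same pfn for a given address on every call, or NOPFN_SIGBUS/NOPFN_OOM on error, which is what do_no_pfn() above expects.

static unsigned long mydev_nopfn(struct vm_area_struct *vma,
				 unsigned long address)
{
	unsigned long offset = address - vma->vm_start;

	/* hypothetical bounds check against the device region size */
	if (offset >= MYDEV_REGION_SIZE)
		return NOPFN_SIGBUS;

	/* same pfn for a given address on every fault, as required */
	return (mydev_phys_base + offset) >> PAGE_SHIFT;
}

static struct vm_operations_struct mydev_vm_ops = {
	.nopfn	= mydev_nopfn,
};

The driver's mmap() would also be expected to have set VM_PFNMAP on the vma, since do_no_pfn() BUG_ONs otherwise.
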
2306/*
2259 * Fault of a previously existing named mapping. Repopulate the pte 2307 * Fault of a previously existing named mapping. Repopulate the pte
2260 * from the encoded file_pte if possible. This enables swappable 2308 * from the encoded file_pte if possible. This enables swappable
2261 * nonlinear vmas. 2309 * nonlinear vmas.
@@ -2317,11 +2365,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2317 old_entry = entry = *pte; 2365 old_entry = entry = *pte;
2318 if (!pte_present(entry)) { 2366 if (!pte_present(entry)) {
2319 if (pte_none(entry)) { 2367 if (pte_none(entry)) {
2320 if (!vma->vm_ops || !vma->vm_ops->nopage) 2368 if (vma->vm_ops) {
2321 return do_anonymous_page(mm, vma, address, 2369 if (vma->vm_ops->nopage)
2322 pte, pmd, write_access); 2370 return do_no_page(mm, vma, address,
2323 return do_no_page(mm, vma, address, 2371 pte, pmd,
2324 pte, pmd, write_access); 2372 write_access);
2373 if (unlikely(vma->vm_ops->nopfn))
2374 return do_no_pfn(mm, vma, address, pte,
2375 pmd, write_access);
2376 }
2377 return do_anonymous_page(mm, vma, address,
2378 pte, pmd, write_access);
2325 } 2379 }
2326 if (pte_file(entry)) 2380 if (pte_file(entry))
2327 return do_file_page(mm, vma, address, 2381 return do_file_page(mm, vma, address,
@@ -2550,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr)
2550} 2604}
2551 2605
2552#endif /* __HAVE_ARCH_GATE_AREA */ 2606#endif /* __HAVE_ARCH_GATE_AREA */
2607
2608/*
2609 * Access another process' address space.
2610 * Source/target buffer must be kernel space,
2611 * Do not walk the page table directly, use get_user_pages
2612 */
2613int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2614{
2615 struct mm_struct *mm;
2616 struct vm_area_struct *vma;
2617 struct page *page;
2618 void *old_buf = buf;
2619
2620 mm = get_task_mm(tsk);
2621 if (!mm)
2622 return 0;
2623
2624 down_read(&mm->mmap_sem);
 2625 /* ignore errors, just check how much was successfully transferred */
2626 while (len) {
2627 int bytes, ret, offset;
2628 void *maddr;
2629
2630 ret = get_user_pages(tsk, mm, addr, 1,
2631 write, 1, &page, &vma);
2632 if (ret <= 0)
2633 break;
2634
2635 bytes = len;
2636 offset = addr & (PAGE_SIZE-1);
2637 if (bytes > PAGE_SIZE-offset)
2638 bytes = PAGE_SIZE-offset;
2639
2640 maddr = kmap(page);
2641 if (write) {
2642 copy_to_user_page(vma, page, addr,
2643 maddr + offset, buf, bytes);
2644 set_page_dirty_lock(page);
2645 } else {
2646 copy_from_user_page(vma, page, addr,
2647 buf, maddr + offset, bytes);
2648 }
2649 kunmap(page);
2650 page_cache_release(page);
2651 len -= bytes;
2652 buf += bytes;
2653 addr += bytes;
2654 }
2655 up_read(&mm->mmap_sem);
2656 mmput(mm);
2657
2658 return buf - old_buf;
2659}
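
As a rough usage sketch (not part of the patch), a ptrace-style caller reading another task's memory would do something like the following; child and user_addr are assumed to be supplied by the caller:

	char kbuf[64];
	int copied;

	/* read up to 64 bytes from the child's address space */
	copied = access_process_vm(child, user_addr, kbuf, sizeof(kbuf), 0);
	if (copied < (int)sizeof(kbuf)) {
		/* partial result: the rest of the range was not mapped or accessible */
	}
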
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 38f89650bc84..cf18f0942553 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1136,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1136 */ 1136 */
1137unsigned slab_node(struct mempolicy *policy) 1137unsigned slab_node(struct mempolicy *policy)
1138{ 1138{
1139 switch (policy->policy) { 1139 int pol = policy ? policy->policy : MPOL_DEFAULT;
1140
1141 switch (pol) {
1140 case MPOL_INTERLEAVE: 1142 case MPOL_INTERLEAVE:
1141 return interleave_nodes(policy); 1143 return interleave_nodes(policy);
1142 1144
diff --git a/mm/nommu.c b/mm/nommu.c
index d99dea31e443..564540662192 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp)
122} 122}
123 123
124/* 124/*
125 * The nommu dodgy version :-) 125 * get a list of pages in an address range belonging to the specified process
126 * and indicate the VMA that covers each page
 127 * - this is potentially dodgy as we may end up incrementing the page count of a
128 * slab page or a secondary page from a compound page
129 * - don't permit access to VMAs that don't support it, such as I/O mappings
126 */ 130 */
127int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 131int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int len, int write, int force, 132 unsigned long start, int len, int write, int force,
129 struct page **pages, struct vm_area_struct **vmas) 133 struct page **pages, struct vm_area_struct **vmas)
130{ 134{
135 struct vm_area_struct *vma;
136 unsigned long vm_flags;
131 int i; 137 int i;
132 static struct vm_area_struct dummy_vma; 138
139 /* calculate required read or write permissions.
140 * - if 'force' is set, we only require the "MAY" flags.
141 */
142 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
143 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
133 144
134 for (i = 0; i < len; i++) { 145 for (i = 0; i < len; i++) {
146 vma = find_vma(mm, start);
147 if (!vma)
148 goto finish_or_fault;
149
150 /* protect what we can, including chardevs */
151 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
152 !(vm_flags & vma->vm_flags))
153 goto finish_or_fault;
154
135 if (pages) { 155 if (pages) {
136 pages[i] = virt_to_page(start); 156 pages[i] = virt_to_page(start);
137 if (pages[i]) 157 if (pages[i])
138 page_cache_get(pages[i]); 158 page_cache_get(pages[i]);
139 } 159 }
140 if (vmas) 160 if (vmas)
141 vmas[i] = &dummy_vma; 161 vmas[i] = vma;
142 start += PAGE_SIZE; 162 start += PAGE_SIZE;
143 } 163 }
144 return(i); 164
165 return i;
166
167finish_or_fault:
168 return i ? : -EFAULT;
145} 169}
146 170
147EXPORT_SYMBOL(get_user_pages); 171EXPORT_SYMBOL(get_user_pages);
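
The two-step mask computation above resolves to a single required flag per case; a worked table (editorial, not in the patch itself) makes the intent clearer:

	write=0, force=0  ->  vma must have VM_READ
	write=0, force=1  ->  vma must have VM_MAYREAD
	write=1, force=0  ->  vma must have VM_WRITE
	write=1, force=1  ->  vma must have VM_MAYWRITE

In every case VM_IO/VM_PFNMAP mappings are refused outright.
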
@@ -286,6 +310,77 @@ static void show_process_blocks(void)
286} 310}
287#endif /* DEBUG */ 311#endif /* DEBUG */
288 312
313/*
314 * add a VMA into a process's mm_struct in the appropriate place in the list
315 * - should be called with mm->mmap_sem held writelocked
316 */
317static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
318{
319 struct vm_list_struct **ppv;
320
321 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
322 if ((*ppv)->vma->vm_start > vml->vma->vm_start)
323 break;
324
325 vml->next = *ppv;
326 *ppv = vml;
327}
328
329/*
330 * look up the first VMA in which addr resides, NULL if none
331 * - should be called with mm->mmap_sem at least held readlocked
332 */
333struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
334{
335 struct vm_list_struct *loop, *vml;
336
337 /* search the vm_start ordered list */
338 vml = NULL;
339 for (loop = mm->context.vmlist; loop; loop = loop->next) {
340 if (loop->vma->vm_start > addr)
341 break;
342 vml = loop;
343 }
344
345 if (vml && vml->vma->vm_end > addr)
346 return vml->vma;
347
348 return NULL;
349}
350EXPORT_SYMBOL(find_vma);
351
352/*
353 * find a VMA
354 * - we don't extend stack VMAs under NOMMU conditions
355 */
356struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
357{
358 return find_vma(mm, addr);
359}
360
361/*
 362 * look up the first VMA that exactly matches addr
363 * - should be called with mm->mmap_sem at least held readlocked
364 */
365static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
366 unsigned long addr)
367{
368 struct vm_list_struct *vml;
369
370 /* search the vm_start ordered list */
371 for (vml = mm->context.vmlist; vml; vml = vml->next) {
372 if (vml->vma->vm_start == addr)
373 return vml->vma;
374 if (vml->vma->vm_start > addr)
375 break;
376 }
377
378 return NULL;
379}
380
381/*
382 * find a VMA in the global tree
383 */
289static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 384static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
290{ 385{
291 struct vm_area_struct *vma; 386 struct vm_area_struct *vma;
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
305 return NULL; 400 return NULL;
306} 401}
307 402
403/*
404 * add a VMA in the global tree
405 */
308static void add_nommu_vma(struct vm_area_struct *vma) 406static void add_nommu_vma(struct vm_area_struct *vma)
309{ 407{
310 struct vm_area_struct *pvma; 408 struct vm_area_struct *pvma;
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
351 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 449 rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
352} 450}
353 451
452/*
453 * delete a VMA from the global list
454 */
354static void delete_nommu_vma(struct vm_area_struct *vma) 455static void delete_nommu_vma(struct vm_area_struct *vma)
355{ 456{
356 struct address_space *mapping; 457 struct address_space *mapping;
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file,
828 realalloc += kobjsize(vml); 929 realalloc += kobjsize(vml);
829 askedalloc += sizeof(*vml); 930 askedalloc += sizeof(*vml);
830 931
831 vml->next = current->mm->context.vmlist; 932 add_vma_to_mm(current->mm, vml);
832 current->mm->context.vmlist = vml;
833 933
834 up_write(&nommu_vma_sem); 934 up_write(&nommu_vma_sem);
835 935
@@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma)
908 } 1008 }
909} 1009}
910 1010
1011/*
1012 * release a mapping
1013 * - under NOMMU conditions the parameters must match exactly to the mapping to
1014 * be removed
1015 */
911int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1016int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
912{ 1017{
913 struct vm_list_struct *vml, **parent; 1018 struct vm_list_struct *vml, **parent;
@@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
917 printk("do_munmap:\n"); 1022 printk("do_munmap:\n");
918#endif 1023#endif
919 1024
920 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) 1025 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
1026 if ((*parent)->vma->vm_start > addr)
1027 break;
921 if ((*parent)->vma->vm_start == addr && 1028 if ((*parent)->vma->vm_start == addr &&
922 ((len == 0) || ((*parent)->vma->vm_end == end))) 1029 ((len == 0) || ((*parent)->vma->vm_end == end)))
923 goto found; 1030 goto found;
1031 }
924 1032
925 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1033 printk("munmap of non-mmaped memory by process %d (%s): %p\n",
926 current->pid, current->comm, (void *) addr); 1034 current->pid, current->comm, (void *) addr);
@@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
946 return 0; 1054 return 0;
947} 1055}
948 1056
949/* Release all mmaps. */ 1057asmlinkage long sys_munmap(unsigned long addr, size_t len)
1058{
1059 int ret;
1060 struct mm_struct *mm = current->mm;
1061
1062 down_write(&mm->mmap_sem);
1063 ret = do_munmap(mm, addr, len);
1064 up_write(&mm->mmap_sem);
1065 return ret;
1066}
1067
1068/*
1069 * Release all mappings
1070 */
950void exit_mmap(struct mm_struct * mm) 1071void exit_mmap(struct mm_struct * mm)
951{ 1072{
952 struct vm_list_struct *tmp; 1073 struct vm_list_struct *tmp;
@@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm)
973 } 1094 }
974} 1095}
975 1096
976asmlinkage long sys_munmap(unsigned long addr, size_t len)
977{
978 int ret;
979 struct mm_struct *mm = current->mm;
980
981 down_write(&mm->mmap_sem);
982 ret = do_munmap(mm, addr, len);
983 up_write(&mm->mmap_sem);
984 return ret;
985}
986
987unsigned long do_brk(unsigned long addr, unsigned long len) 1097unsigned long do_brk(unsigned long addr, unsigned long len)
988{ 1098{
989 return -ENOMEM; 1099 return -ENOMEM;
990} 1100}
991 1101
992/* 1102/*
993 * Expand (or shrink) an existing mapping, potentially moving it at the 1103 * expand (or shrink) an existing mapping, potentially moving it at the same
994 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1104 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
995 * 1105 *
996 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 1106 * under NOMMU conditions, we only permit changing a mapping's size, and only
997 * This option implies MREMAP_MAYMOVE. 1107 * as long as it stays within the hole allocated by the kmalloc() call in
1108 * do_mmap_pgoff() and the block is not shareable
998 * 1109 *
999 * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the 1110 * MREMAP_FIXED is not supported under NOMMU conditions
1000 * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
1001 */ 1111 */
1002unsigned long do_mremap(unsigned long addr, 1112unsigned long do_mremap(unsigned long addr,
1003 unsigned long old_len, unsigned long new_len, 1113 unsigned long old_len, unsigned long new_len,
1004 unsigned long flags, unsigned long new_addr) 1114 unsigned long flags, unsigned long new_addr)
1005{ 1115{
1006 struct vm_list_struct *vml = NULL; 1116 struct vm_area_struct *vma;
1007 1117
1008 /* insanity checks first */ 1118 /* insanity checks first */
1009 if (new_len == 0) 1119 if (new_len == 0)
@@ -1012,58 +1122,46 @@ unsigned long do_mremap(unsigned long addr,
1012 if (flags & MREMAP_FIXED && new_addr != addr) 1122 if (flags & MREMAP_FIXED && new_addr != addr)
1013 return (unsigned long) -EINVAL; 1123 return (unsigned long) -EINVAL;
1014 1124
1015 for (vml = current->mm->context.vmlist; vml; vml = vml->next) 1125 vma = find_vma_exact(current->mm, addr);
1016 if (vml->vma->vm_start == addr) 1126 if (!vma)
1017 goto found; 1127 return (unsigned long) -EINVAL;
1018
1019 return (unsigned long) -EINVAL;
1020 1128
1021 found: 1129 if (vma->vm_end != vma->vm_start + old_len)
1022 if (vml->vma->vm_end != vml->vma->vm_start + old_len)
1023 return (unsigned long) -EFAULT; 1130 return (unsigned long) -EFAULT;
1024 1131
1025 if (vml->vma->vm_flags & VM_MAYSHARE) 1132 if (vma->vm_flags & VM_MAYSHARE)
1026 return (unsigned long) -EPERM; 1133 return (unsigned long) -EPERM;
1027 1134
1028 if (new_len > kobjsize((void *) addr)) 1135 if (new_len > kobjsize((void *) addr))
1029 return (unsigned long) -ENOMEM; 1136 return (unsigned long) -ENOMEM;
1030 1137
1031 /* all checks complete - do it */ 1138 /* all checks complete - do it */
1032 vml->vma->vm_end = vml->vma->vm_start + new_len; 1139 vma->vm_end = vma->vm_start + new_len;
1033 1140
1034 askedalloc -= old_len; 1141 askedalloc -= old_len;
1035 askedalloc += new_len; 1142 askedalloc += new_len;
1036 1143
1037 return vml->vma->vm_start; 1144 return vma->vm_start;
1038} 1145}
1039 1146
1040/* 1147asmlinkage unsigned long sys_mremap(unsigned long addr,
1041 * Look up the first VMA which satisfies addr < vm_end, NULL if none 1148 unsigned long old_len, unsigned long new_len,
1042 */ 1149 unsigned long flags, unsigned long new_addr)
1043struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1044{ 1150{
1045 struct vm_list_struct *vml; 1151 unsigned long ret;
1046
1047 for (vml = mm->context.vmlist; vml; vml = vml->next)
1048 if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
1049 return vml->vma;
1050 1152
1051 return NULL; 1153 down_write(&current->mm->mmap_sem);
1154 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1155 up_write(&current->mm->mmap_sem);
1156 return ret;
1052} 1157}
1053 1158
1054EXPORT_SYMBOL(find_vma);
1055
1056struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1159struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1057 unsigned int foll_flags) 1160 unsigned int foll_flags)
1058{ 1161{
1059 return NULL; 1162 return NULL;
1060} 1163}
1061 1164
1062struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
1063{
1064 return NULL;
1065}
1066
1067int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1165int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1068 unsigned long to, unsigned long size, pgprot_t prot) 1166 unsigned long to, unsigned long size, pgprot_t prot)
1069{ 1167{
@@ -1206,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area,
1206 BUG(); 1304 BUG();
1207 return NULL; 1305 return NULL;
1208} 1306}
1307
1308/*
1309 * Access another process' address space.
1310 * - source/target buffer must be kernel space
1311 */
1312int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1313{
1314 struct vm_area_struct *vma;
1315 struct mm_struct *mm;
1316
1317 if (addr + len < addr)
1318 return 0;
1319
1320 mm = get_task_mm(tsk);
1321 if (!mm)
1322 return 0;
1323
1324 down_read(&mm->mmap_sem);
1325
1326 /* the access must start within one of the target process's mappings */
1327 vma = find_vma(mm, addr);
1328 if (vma) {
1329 /* don't overrun this mapping */
1330 if (addr + len >= vma->vm_end)
1331 len = vma->vm_end - addr;
1332
1333 /* only read or write mappings where it is permitted */
1334 if (write && vma->vm_flags & VM_MAYWRITE)
1335 len -= copy_to_user((void *) addr, buf, len);
1336 else if (!write && vma->vm_flags & VM_MAYREAD)
1337 len -= copy_from_user(buf, (void *) addr, len);
1338 else
1339 len = 0;
1340 } else {
1341 len = 0;
1342 }
1343
1344 up_read(&mm->mmap_sem);
1345 mmput(mm);
1346 return len;
1347}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db7..4f59d90b81e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,8 @@
37#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
38#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
39#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h>
41#include <linux/pfn.h>
40 42
41#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
42#include <asm/div64.h> 44#include <asm/div64.h>
@@ -102,6 +104,38 @@ int min_free_kbytes = 1024;
102 104
103unsigned long __meminitdata nr_kernel_pages; 105unsigned long __meminitdata nr_kernel_pages;
104unsigned long __meminitdata nr_all_pages; 106unsigned long __meminitdata nr_all_pages;
107static unsigned long __initdata dma_reserve;
108
109#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
110 /*
 111 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
112 * ranges of memory (RAM) that may be registered with add_active_range().
113 * Ranges passed to add_active_range() will be merged if possible
114 * so the number of times add_active_range() can be called is
115 * related to the number of nodes and the number of holes
116 */
117 #ifdef CONFIG_MAX_ACTIVE_REGIONS
118 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
119 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
120 #else
121 #if MAX_NUMNODES >= 32
122 /* If there can be many nodes, allow up to 50 holes per node */
123 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
124 #else
125 /* By default, allow up to 256 distinct regions */
126 #define MAX_ACTIVE_REGIONS 256
127 #endif
128 #endif
129
130 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
131 int __initdata nr_nodemap_entries;
132 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
133 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
134#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
135 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
136 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
137#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
138#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
105 139
106#ifdef CONFIG_DEBUG_VM 140#ifdef CONFIG_DEBUG_VM
107static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 141static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -908,7 +942,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
908 */ 942 */
909 do { 943 do {
910 zone = *z; 944 zone = *z;
911 if (unlikely((gfp_mask & __GFP_THISNODE) && 945 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
912 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 946 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
913 break; 947 break;
914 if ((alloc_flags & ALLOC_CPUSET) && 948 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1222,14 +1256,12 @@ unsigned int nr_free_pagecache_pages(void)
1222{ 1256{
1223 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1257 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1224} 1258}
1225#ifdef CONFIG_NUMA 1259
1226static void show_node(struct zone *zone) 1260static inline void show_node(struct zone *zone)
1227{ 1261{
1228 printk("Node %ld ", zone_to_nid(zone)); 1262 if (NUMA_BUILD)
1263 printk("Node %ld ", zone_to_nid(zone));
1229} 1264}
1230#else
1231#define show_node(zone) do { } while (0)
1232#endif
1233 1265
1234void si_meminfo(struct sysinfo *val) 1266void si_meminfo(struct sysinfo *val)
1235{ 1267{
@@ -1271,34 +1303,30 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1271 */ 1303 */
1272void show_free_areas(void) 1304void show_free_areas(void)
1273{ 1305{
1274 int cpu, temperature; 1306 int cpu;
1275 unsigned long active; 1307 unsigned long active;
1276 unsigned long inactive; 1308 unsigned long inactive;
1277 unsigned long free; 1309 unsigned long free;
1278 struct zone *zone; 1310 struct zone *zone;
1279 1311
1280 for_each_zone(zone) { 1312 for_each_zone(zone) {
1281 show_node(zone); 1313 if (!populated_zone(zone))
1282 printk("%s per-cpu:", zone->name);
1283
1284 if (!populated_zone(zone)) {
1285 printk(" empty\n");
1286 continue; 1314 continue;
1287 } else 1315
1288 printk("\n"); 1316 show_node(zone);
1317 printk("%s per-cpu:\n", zone->name);
1289 1318
1290 for_each_online_cpu(cpu) { 1319 for_each_online_cpu(cpu) {
1291 struct per_cpu_pageset *pageset; 1320 struct per_cpu_pageset *pageset;
1292 1321
1293 pageset = zone_pcp(zone, cpu); 1322 pageset = zone_pcp(zone, cpu);
1294 1323
1295 for (temperature = 0; temperature < 2; temperature++) 1324 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1296 printk("cpu %d %s: high %d, batch %d used:%d\n", 1325 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1297 cpu, 1326 cpu, pageset->pcp[0].high,
1298 temperature ? "cold" : "hot", 1327 pageset->pcp[0].batch, pageset->pcp[0].count,
1299 pageset->pcp[temperature].high, 1328 pageset->pcp[1].high, pageset->pcp[1].batch,
1300 pageset->pcp[temperature].batch, 1329 pageset->pcp[1].count);
1301 pageset->pcp[temperature].count);
1302 } 1330 }
1303 } 1331 }
1304 1332
@@ -1320,6 +1348,9 @@ void show_free_areas(void)
1320 for_each_zone(zone) { 1348 for_each_zone(zone) {
1321 int i; 1349 int i;
1322 1350
1351 if (!populated_zone(zone))
1352 continue;
1353
1323 show_node(zone); 1354 show_node(zone);
1324 printk("%s" 1355 printk("%s"
1325 " free:%lukB" 1356 " free:%lukB"
@@ -1352,12 +1383,11 @@ void show_free_areas(void)
1352 for_each_zone(zone) { 1383 for_each_zone(zone) {
1353 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1384 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1354 1385
1386 if (!populated_zone(zone))
1387 continue;
1388
1355 show_node(zone); 1389 show_node(zone);
1356 printk("%s: ", zone->name); 1390 printk("%s: ", zone->name);
1357 if (!populated_zone(zone)) {
1358 printk("empty\n");
1359 continue;
1360 }
1361 1391
1362 spin_lock_irqsave(&zone->lock, flags); 1392 spin_lock_irqsave(&zone->lock, flags);
1363 for (order = 0; order < MAX_ORDER; order++) { 1393 for (order = 0; order < MAX_ORDER; order++) {
@@ -1561,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy)
1561void __meminit build_all_zonelists(void) 1591void __meminit build_all_zonelists(void)
1562{ 1592{
1563 if (system_state == SYSTEM_BOOTING) { 1593 if (system_state == SYSTEM_BOOTING) {
1564 __build_all_zonelists(0); 1594 __build_all_zonelists(NULL);
1565 cpuset_init_current_mems_allowed(); 1595 cpuset_init_current_mems_allowed();
1566 } else { 1596 } else {
1567 /* we have to stop all cpus to guarantee there is no user 1597
@@ -1642,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
1642 1672
1643#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1673#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1644 1674
1645static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1646 unsigned long *zones_size, unsigned long *zholes_size)
1647{
1648 unsigned long realtotalpages, totalpages = 0;
1649 enum zone_type i;
1650
1651 for (i = 0; i < MAX_NR_ZONES; i++)
1652 totalpages += zones_size[i];
1653 pgdat->node_spanned_pages = totalpages;
1654
1655 realtotalpages = totalpages;
1656 if (zholes_size)
1657 for (i = 0; i < MAX_NR_ZONES; i++)
1658 realtotalpages -= zholes_size[i];
1659 pgdat->node_present_pages = realtotalpages;
1660 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1661}
1662
1663
1664/* 1675/*
1665 * Initially all pages are reserved - free ones are freed 1676 * Initially all pages are reserved - free ones are freed
1666 * up by free_all_bootmem() once the early boot process is 1677 * up by free_all_bootmem() once the early boot process is
@@ -1818,6 +1829,9 @@ static int __cpuinit process_zones(int cpu)
1818 1829
1819 for_each_zone(zone) { 1830 for_each_zone(zone) {
1820 1831
1832 if (!populated_zone(zone))
1833 continue;
1834
1821 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 1835 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1822 GFP_KERNEL, cpu_to_node(cpu)); 1836 GFP_KERNEL, cpu_to_node(cpu));
1823 if (!zone_pcp(zone, cpu)) 1837 if (!zone_pcp(zone, cpu))
@@ -1977,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1977 return 0; 1991 return 0;
1978} 1992}
1979 1993
1994#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
1995/*
1996 * Basic iterator support. Return the first range of PFNs for a node
1997 * Note: nid == MAX_NUMNODES returns first region regardless of node
1998 */
1999static int __init first_active_region_index_in_nid(int nid)
2000{
2001 int i;
2002
2003 for (i = 0; i < nr_nodemap_entries; i++)
2004 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2005 return i;
2006
2007 return -1;
2008}
2009
2010/*
2011 * Basic iterator support. Return the next active range of PFNs for a node
 2012 * Note: nid == MAX_NUMNODES returns next region regardless of node
2013 */
2014static int __init next_active_region_index_in_nid(int index, int nid)
2015{
2016 for (index = index + 1; index < nr_nodemap_entries; index++)
2017 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2018 return index;
2019
2020 return -1;
2021}
2022
2023#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2024/*
2025 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2026 * Architectures may implement their own version but if add_active_range()
2027 * was used and there are no special requirements, this is a convenient
2028 * alternative
2029 */
2030int __init early_pfn_to_nid(unsigned long pfn)
2031{
2032 int i;
2033
2034 for (i = 0; i < nr_nodemap_entries; i++) {
2035 unsigned long start_pfn = early_node_map[i].start_pfn;
2036 unsigned long end_pfn = early_node_map[i].end_pfn;
2037
2038 if (start_pfn <= pfn && pfn < end_pfn)
2039 return early_node_map[i].nid;
2040 }
2041
2042 return 0;
2043}
2044#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2045
2046/* Basic iterator support to walk early_node_map[] */
2047#define for_each_active_range_index_in_nid(i, nid) \
2048 for (i = first_active_region_index_in_nid(nid); i != -1; \
2049 i = next_active_region_index_in_nid(i, nid))
2050
2051/**
2052 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2053 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
 2054 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2055 *
2056 * If an architecture guarantees that all ranges registered with
2057 * add_active_ranges() contain no holes and may be freed, this
 2058 * function may be used instead of calling free_bootmem() manually.
2059 */
2060void __init free_bootmem_with_active_regions(int nid,
2061 unsigned long max_low_pfn)
2062{
2063 int i;
2064
2065 for_each_active_range_index_in_nid(i, nid) {
2066 unsigned long size_pages = 0;
2067 unsigned long end_pfn = early_node_map[i].end_pfn;
2068
2069 if (early_node_map[i].start_pfn >= max_low_pfn)
2070 continue;
2071
2072 if (end_pfn > max_low_pfn)
2073 end_pfn = max_low_pfn;
2074
2075 size_pages = end_pfn - early_node_map[i].start_pfn;
2076 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2077 PFN_PHYS(early_node_map[i].start_pfn),
2078 size_pages << PAGE_SHIFT);
2079 }
2080}
2081
2082/**
2083 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2084 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
2085 *
2086 * If an architecture guarantees that all ranges registered with
2087 * add_active_ranges() contain no holes and may be freed, this
 2088 * function may be used instead of calling memory_present() manually.
2089 */
2090void __init sparse_memory_present_with_active_regions(int nid)
2091{
2092 int i;
2093
2094 for_each_active_range_index_in_nid(i, nid)
2095 memory_present(early_node_map[i].nid,
2096 early_node_map[i].start_pfn,
2097 early_node_map[i].end_pfn);
2098}
2099
2100/**
2101 * push_node_boundaries - Push node boundaries to at least the requested boundary
2102 * @nid: The nid of the node to push the boundary for
2103 * @start_pfn: The start pfn of the node
2104 * @end_pfn: The end pfn of the node
2105 *
2106 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
2107 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2108 * be hotplugged even though no physical memory exists. This function allows
2109 * an arch to push out the node boundaries so mem_map is allocated that can
2110 * be used later.
2111 */
2112#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2113void __init push_node_boundaries(unsigned int nid,
2114 unsigned long start_pfn, unsigned long end_pfn)
2115{
2116 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2117 nid, start_pfn, end_pfn);
2118
2119 /* Initialise the boundary for this node if necessary */
2120 if (node_boundary_end_pfn[nid] == 0)
2121 node_boundary_start_pfn[nid] = -1UL;
2122
2123 /* Update the boundaries */
2124 if (node_boundary_start_pfn[nid] > start_pfn)
2125 node_boundary_start_pfn[nid] = start_pfn;
2126 if (node_boundary_end_pfn[nid] < end_pfn)
2127 node_boundary_end_pfn[nid] = end_pfn;
2128}
2129
2130/* If necessary, push the node boundary out for reserve hotadd */
2131static void __init account_node_boundary(unsigned int nid,
2132 unsigned long *start_pfn, unsigned long *end_pfn)
2133{
2134 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2135 nid, *start_pfn, *end_pfn);
2136
2137 /* Return if boundary information has not been provided */
2138 if (node_boundary_end_pfn[nid] == 0)
2139 return;
2140
2141 /* Check the boundaries and update if necessary */
2142 if (node_boundary_start_pfn[nid] < *start_pfn)
2143 *start_pfn = node_boundary_start_pfn[nid];
2144 if (node_boundary_end_pfn[nid] > *end_pfn)
2145 *end_pfn = node_boundary_end_pfn[nid];
2146}
2147#else
2148void __init push_node_boundaries(unsigned int nid,
2149 unsigned long start_pfn, unsigned long end_pfn) {}
2150
2151static void __init account_node_boundary(unsigned int nid,
2152 unsigned long *start_pfn, unsigned long *end_pfn) {}
2153#endif
2154
2155
2156/**
2157 * get_pfn_range_for_nid - Return the start and end page frames for a node
2158 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
2159 * @start_pfn: Passed by reference. On return, it will have the node start_pfn
2160 * @end_pfn: Passed by reference. On return, it will have the node end_pfn
2161 *
2162 * It returns the start and end page frame of a node based on information
2163 * provided by an arch calling add_active_range(). If called for a node
2164 * with no available memory, a warning is printed and the start and end
2165 * PFNs will be 0
2166 */
2167void __init get_pfn_range_for_nid(unsigned int nid,
2168 unsigned long *start_pfn, unsigned long *end_pfn)
2169{
2170 int i;
2171 *start_pfn = -1UL;
2172 *end_pfn = 0;
2173
2174 for_each_active_range_index_in_nid(i, nid) {
2175 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2176 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2177 }
2178
2179 if (*start_pfn == -1UL) {
2180 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2181 *start_pfn = 0;
2182 }
2183
2184 /* Push the node boundaries out if requested */
2185 account_node_boundary(nid, start_pfn, end_pfn);
2186}
2187
2188/*
2189 * Return the number of pages a zone spans in a node, including holes
2190 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2191 */
2192unsigned long __init zone_spanned_pages_in_node(int nid,
2193 unsigned long zone_type,
2194 unsigned long *ignored)
2195{
2196 unsigned long node_start_pfn, node_end_pfn;
2197 unsigned long zone_start_pfn, zone_end_pfn;
2198
2199 /* Get the start and end of the node and zone */
2200 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2201 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2202 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2203
2204 /* Check that this node has pages within the zone's required range */
2205 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2206 return 0;
2207
2208 /* Move the zone boundaries inside the node if necessary */
2209 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2210 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2211
2212 /* Return the spanned pages */
2213 return zone_end_pfn - zone_start_pfn;
2214}
2215
2216/*
2217 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2218 * then all holes in the requested range will be accounted for
2219 */
2220unsigned long __init __absent_pages_in_range(int nid,
2221 unsigned long range_start_pfn,
2222 unsigned long range_end_pfn)
2223{
2224 int i = 0;
2225 unsigned long prev_end_pfn = 0, hole_pages = 0;
2226 unsigned long start_pfn;
2227
2228 /* Find the end_pfn of the first active range of pfns in the node */
2229 i = first_active_region_index_in_nid(nid);
2230 if (i == -1)
2231 return 0;
2232
2233 /* Account for ranges before physical memory on this node */
2234 if (early_node_map[i].start_pfn > range_start_pfn)
2235 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2236
2237 prev_end_pfn = early_node_map[i].start_pfn;
2238
2239 /* Find all holes for the zone within the node */
2240 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2241
2242 /* No need to continue if prev_end_pfn is outside the zone */
2243 if (prev_end_pfn >= range_end_pfn)
2244 break;
2245
2246 /* Make sure the end of the zone is not within the hole */
2247 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2248 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2249
 2250 /* Update the hole size count and move on */
2251 if (start_pfn > range_start_pfn) {
2252 BUG_ON(prev_end_pfn > start_pfn);
2253 hole_pages += start_pfn - prev_end_pfn;
2254 }
2255 prev_end_pfn = early_node_map[i].end_pfn;
2256 }
2257
2258 /* Account for ranges past physical memory on this node */
2259 if (range_end_pfn > prev_end_pfn)
2260 hole_pages = range_end_pfn -
2261 max(range_start_pfn, prev_end_pfn);
2262
2263 return hole_pages;
2264}
2265
2266/**
2267 * absent_pages_in_range - Return number of page frames in holes within a range
2268 * @start_pfn: The start PFN to start searching for holes
2269 * @end_pfn: The end PFN to stop searching for holes
2270 *
2271 * It returns the number of pages frames in memory holes within a range
2272 */
2273unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2274 unsigned long end_pfn)
2275{
2276 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2277}
2278
2279/* Return the number of page frames in holes in a zone on a node */
2280unsigned long __init zone_absent_pages_in_node(int nid,
2281 unsigned long zone_type,
2282 unsigned long *ignored)
2283{
2284 unsigned long node_start_pfn, node_end_pfn;
2285 unsigned long zone_start_pfn, zone_end_pfn;
2286
2287 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2288 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2289 node_start_pfn);
2290 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2291 node_end_pfn);
2292
2293 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2294}
2295
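
A small worked example (hypothetical numbers) of how these two helpers combine, per the present_pages formula noted above:

	node 0 registered ranges: [0x1000, 0x4000) and [0x6000, 0x9000)
	zone limits:              [0x0000, 0x10000)

	zone_spanned_pages_in_node = min(0x10000, 0x9000) - max(0x0000, 0x1000)
	                           = 0x8000 pages
	zone_absent_pages_in_node  = 0x6000 - 0x4000 = 0x2000 pages (the hole)
	present_pages              = 0x8000 - 0x2000 = 0x6000 pages
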
2296/* Return the zone index a PFN is in */
2297int memmap_zone_idx(struct page *lmem_map)
2298{
2299 int i;
2300 unsigned long phys_addr = virt_to_phys(lmem_map);
2301 unsigned long pfn = phys_addr >> PAGE_SHIFT;
2302
2303 for (i = 0; i < MAX_NR_ZONES; i++)
2304 if (pfn < arch_zone_highest_possible_pfn[i])
2305 break;
2306
2307 return i;
2308}
2309#else
2310static inline unsigned long zone_spanned_pages_in_node(int nid,
2311 unsigned long zone_type,
2312 unsigned long *zones_size)
2313{
2314 return zones_size[zone_type];
2315}
2316
2317static inline unsigned long zone_absent_pages_in_node(int nid,
2318 unsigned long zone_type,
2319 unsigned long *zholes_size)
2320{
2321 if (!zholes_size)
2322 return 0;
2323
2324 return zholes_size[zone_type];
2325}
2326
2327static inline int memmap_zone_idx(struct page *lmem_map)
2328{
2329 return MAX_NR_ZONES;
2330}
2331#endif
2332
2333static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2334 unsigned long *zones_size, unsigned long *zholes_size)
2335{
2336 unsigned long realtotalpages, totalpages = 0;
2337 enum zone_type i;
2338
2339 for (i = 0; i < MAX_NR_ZONES; i++)
2340 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2341 zones_size);
2342 pgdat->node_spanned_pages = totalpages;
2343
2344 realtotalpages = totalpages;
2345 for (i = 0; i < MAX_NR_ZONES; i++)
2346 realtotalpages -=
2347 zone_absent_pages_in_node(pgdat->node_id, i,
2348 zholes_size);
2349 pgdat->node_present_pages = realtotalpages;
2350 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2351 realtotalpages);
2352}
2353
1980/* 2354/*
1981 * Set up the zone data structures: 2355 * Set up the zone data structures:
1982 * - mark all pages reserved 2356 * - mark all pages reserved
@@ -1998,11 +2372,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
1998 2372
1999 for (j = 0; j < MAX_NR_ZONES; j++) { 2373 for (j = 0; j < MAX_NR_ZONES; j++) {
2000 struct zone *zone = pgdat->node_zones + j; 2374 struct zone *zone = pgdat->node_zones + j;
2001 unsigned long size, realsize; 2375 unsigned long size, realsize, memmap_pages;
2002 2376
2003 realsize = size = zones_size[j]; 2377 size = zone_spanned_pages_in_node(nid, j, zones_size);
2004 if (zholes_size) 2378 realsize = size - zone_absent_pages_in_node(nid, j,
2005 realsize -= zholes_size[j]; 2379 zholes_size);
2380
2381 /*
2382 * Adjust realsize so that it accounts for how much memory
2383 * is used by this zone for memmap. This affects the watermark
2384 * and per-cpu initialisations
2385 */
2386 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2387 if (realsize >= memmap_pages) {
2388 realsize -= memmap_pages;
2389 printk(KERN_DEBUG
2390 " %s zone: %lu pages used for memmap\n",
2391 zone_names[j], memmap_pages);
2392 } else
2393 printk(KERN_WARNING
2394 " %s zone: %lu pages exceeds realsize %lu\n",
2395 zone_names[j], memmap_pages, realsize);
2396
2397 /* Account for reserved DMA pages */
2398 if (j == ZONE_DMA && realsize > dma_reserve) {
2399 realsize -= dma_reserve;
2400 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2401 dma_reserve);
2402 }
2006 2403
2007 if (!is_highmem_idx(j)) 2404 if (!is_highmem_idx(j))
2008 nr_kernel_pages += realsize; 2405 nr_kernel_pages += realsize;
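
As a rough illustration of the memmap adjustment above (assumed values: 4 KiB pages, a 32-byte struct page, no holes):

	size          = 262144 pages            (a 1 GiB zone)
	memmap_pages  = (262144 * 32) >> 12     = 2048 pages
	realsize      = 262144 - 2048           = 260096 pages

i.e. about 8 MiB of the zone is charged to its own mem_map before watermarks and per-cpu batch sizes are derived from realsize.
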
@@ -2011,6 +2408,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2011 zone->spanned_pages = size; 2408 zone->spanned_pages = size;
2012 zone->present_pages = realsize; 2409 zone->present_pages = realsize;
2013#ifdef CONFIG_NUMA 2410#ifdef CONFIG_NUMA
2411 zone->node = nid;
2014 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2412 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2015 / 100; 2413 / 100;
2016 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2414 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
@@ -2073,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2073 /* 2471 /*
2074 * With no DISCONTIG, the global mem_map is just set as node 0's 2472 * With no DISCONTIG, the global mem_map is just set as node 0's
2075 */ 2473 */
2076 if (pgdat == NODE_DATA(0)) 2474 if (pgdat == NODE_DATA(0)) {
2077 mem_map = NODE_DATA(0)->node_mem_map; 2475 mem_map = NODE_DATA(0)->node_mem_map;
2476#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2477 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2478 mem_map -= pgdat->node_start_pfn;
2479#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2480 }
2078#endif 2481#endif
2079#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2482#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2080} 2483}
@@ -2085,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2085{ 2488{
2086 pgdat->node_id = nid; 2489 pgdat->node_id = nid;
2087 pgdat->node_start_pfn = node_start_pfn; 2490 pgdat->node_start_pfn = node_start_pfn;
2088 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2491 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2089 2492
2090 alloc_node_mem_map(pgdat); 2493 alloc_node_mem_map(pgdat);
2091 2494
2092 free_area_init_core(pgdat, zones_size, zholes_size); 2495 free_area_init_core(pgdat, zones_size, zholes_size);
2093} 2496}
2094 2497
2498#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2499/**
2500 * add_active_range - Register a range of PFNs backed by physical memory
2501 * @nid: The node ID the range resides on
2502 * @start_pfn: The start PFN of the available physical memory
2503 * @end_pfn: The end PFN of the available physical memory
2504 *
2505 * These ranges are stored in an early_node_map[] and later used by
2506 * free_area_init_nodes() to calculate zone sizes and holes. If the
2507 * range spans a memory hole, it is up to the architecture to ensure
2508 * the memory is not freed by the bootmem allocator. If possible
2509 * the range being registered will be merged with existing ranges.
2510 */
2511void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2512 unsigned long end_pfn)
2513{
2514 int i;
2515
2516 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2517 "%d entries of %d used\n",
2518 nid, start_pfn, end_pfn,
2519 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2520
2521 /* Merge with existing active regions if possible */
2522 for (i = 0; i < nr_nodemap_entries; i++) {
2523 if (early_node_map[i].nid != nid)
2524 continue;
2525
2526 /* Skip if an existing region covers this new one */
2527 if (start_pfn >= early_node_map[i].start_pfn &&
2528 end_pfn <= early_node_map[i].end_pfn)
2529 return;
2530
2531 /* Merge forward if suitable */
2532 if (start_pfn <= early_node_map[i].end_pfn &&
2533 end_pfn > early_node_map[i].end_pfn) {
2534 early_node_map[i].end_pfn = end_pfn;
2535 return;
2536 }
2537
2538 /* Merge backward if suitable */
2539 if (start_pfn < early_node_map[i].end_pfn &&
2540 end_pfn >= early_node_map[i].start_pfn) {
2541 early_node_map[i].start_pfn = start_pfn;
2542 return;
2543 }
2544 }
2545
2546 /* Check that early_node_map is large enough */
2547 if (i >= MAX_ACTIVE_REGIONS) {
2548 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2549 MAX_ACTIVE_REGIONS);
2550 return;
2551 }
2552
2553 early_node_map[i].nid = nid;
2554 early_node_map[i].start_pfn = start_pfn;
2555 early_node_map[i].end_pfn = end_pfn;
2556 nr_nodemap_entries = i + 1;
2557}
2558
2559/**
2560 * shrink_active_range - Shrink an existing registered range of PFNs
2561 * @nid: The node id the range is on that should be shrunk
2562 * @old_end_pfn: The old end PFN of the range
2563 * @new_end_pfn: The new PFN of the range
2564 *
2565 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
2566 * The map is kept at the end physical page range that has already been
2567 * registered with add_active_range(). This function allows an arch to shrink
2568 * an existing registered range.
2569 */
2570void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2571 unsigned long new_end_pfn)
2572{
2573 int i;
2574
2575 /* Find the old active region end and shrink */
2576 for_each_active_range_index_in_nid(i, nid)
2577 if (early_node_map[i].end_pfn == old_end_pfn) {
2578 early_node_map[i].end_pfn = new_end_pfn;
2579 break;
2580 }
2581}
2582
2583/**
2584 * remove_all_active_ranges - Remove all currently registered regions
2585 * During discovery, it may be found that a table like SRAT is invalid
2586 * and an alternative discovery method must be used. This function removes
2587 * all currently registered regions.
2588 */
2589void __init remove_all_active_ranges()
2590{
2591 memset(early_node_map, 0, sizeof(early_node_map));
2592 nr_nodemap_entries = 0;
2593#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2594 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2595 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2596#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2597}
2598
2599/* Compare two active node_active_regions */
2600static int __init cmp_node_active_region(const void *a, const void *b)
2601{
2602 struct node_active_region *arange = (struct node_active_region *)a;
2603 struct node_active_region *brange = (struct node_active_region *)b;
2604
2605 /* Done this way to avoid overflows */
2606 if (arange->start_pfn > brange->start_pfn)
2607 return 1;
2608 if (arange->start_pfn < brange->start_pfn)
2609 return -1;
2610
2611 return 0;
2612}
2613
2614/* sort the node_map by start_pfn */
2615static void __init sort_node_map(void)
2616{
2617 sort(early_node_map, (size_t)nr_nodemap_entries,
2618 sizeof(struct node_active_region),
2619 cmp_node_active_region, NULL);
2620}
2621
2622/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2623unsigned long __init find_min_pfn_for_node(unsigned long nid)
2624{
2625 int i;
2626
2627 /* Assuming a sorted map, the first range found has the starting pfn */
2628 for_each_active_range_index_in_nid(i, nid)
2629 return early_node_map[i].start_pfn;
2630
2631 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2632 return 0;
2633}
2634
2635/**
2636 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2637 *
2638 * It returns the minimum PFN based on information provided via
2639 * add_active_range()
2640 */
2641unsigned long __init find_min_pfn_with_active_regions(void)
2642{
2643 return find_min_pfn_for_node(MAX_NUMNODES);
2644}
2645
2646/**
2647 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2648 *
2649 * It returns the maximum PFN based on information provided via
2650 * add_active_range()
2651 */
2652unsigned long __init find_max_pfn_with_active_regions(void)
2653{
2654 int i;
2655 unsigned long max_pfn = 0;
2656
2657 for (i = 0; i < nr_nodemap_entries; i++)
2658 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2659
2660 return max_pfn;
2661}
2662
2663/**
2664 * free_area_init_nodes - Initialise all pg_data_t and zone data
2665 * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
2666 * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
2667 * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
2668 * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
2669 *
2670 * This will call free_area_init_node() for each active node in the system.
2671 * Using the page ranges provided by add_active_range(), the size of each
2672 * zone in each node and their holes is calculated. If the maximum PFN
2673 * between two adjacent zones match, it is assumed that the zone is empty.
2674 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2675 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2676 * starts where the previous one ended. For example, ZONE_DMA32 starts
2677 * at arch_max_dma_pfn.
2678 */
2679void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2680{
2681 unsigned long nid;
2682 enum zone_type i;
2683
2684 /* Record where the zone boundaries are */
2685 memset(arch_zone_lowest_possible_pfn, 0,
2686 sizeof(arch_zone_lowest_possible_pfn));
2687 memset(arch_zone_highest_possible_pfn, 0,
2688 sizeof(arch_zone_highest_possible_pfn));
2689 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2690 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2691 for (i = 1; i < MAX_NR_ZONES; i++) {
2692 arch_zone_lowest_possible_pfn[i] =
2693 arch_zone_highest_possible_pfn[i-1];
2694 arch_zone_highest_possible_pfn[i] =
2695 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2696 }
2697
2698 /* Regions in the early_node_map can be in any order */
2699 sort_node_map();
2700
2701 /* Print out the zone ranges */
2702 printk("Zone PFN ranges:\n");
2703 for (i = 0; i < MAX_NR_ZONES; i++)
2704 printk(" %-8s %8lu -> %8lu\n",
2705 zone_names[i],
2706 arch_zone_lowest_possible_pfn[i],
2707 arch_zone_highest_possible_pfn[i]);
2708
2709 /* Print out the early_node_map[] */
2710 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2711 for (i = 0; i < nr_nodemap_entries; i++)
2712 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2713 early_node_map[i].start_pfn,
2714 early_node_map[i].end_pfn);
2715
2716 /* Initialise every node */
2717 for_each_online_node(nid) {
2718 pg_data_t *pgdat = NODE_DATA(nid);
2719 free_area_init_node(nid, pgdat, NULL,
2720 find_min_pfn_for_node(nid), NULL);
2721 }
2722}
2723#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2724
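
For orientation, an architecture opting into CONFIG_ARCH_POPULATES_NODE_MAP would use the new interface roughly as sketched below during early boot; the helper name and the PFN ranges are hypothetical, not taken from this patch.

static void __init example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	/* one call per discovered (node, physical range); ranges made up */
	add_active_range(0, 0x00000, 0x0009f);
	add_active_range(0, 0x00100, 0x38000);

	/* hand the per-zone upper limits to the core */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA]    = 0x01000;	/* 16MB on this example box */
	max_zone_pfns[ZONE_NORMAL] = 0x38000;
	free_area_init_nodes(max_zone_pfns);
}
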
2725/**
2726 * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
2727 * @new_dma_reserve - The number of pages to mark reserved
2728 *
2729 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2730 * In the DMA zone, a significant percentage may be consumed by kernel image
2731 * and other unfreeable allocations which can skew the watermarks badly. This
2732 * function may optionally be used to account for unfreeable pages in
2733 * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
2734 */
2735void __init set_dma_reserve(unsigned long new_dma_reserve)
2736{
2737 dma_reserve = new_dma_reserve;
2738}
2739
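
A one-line usage sketch (hypothetical variable): an arch that knows how many ZONE_DMA pages are permanently pinned by the kernel image and early allocations would report them before the zones are sized:

	set_dma_reserve(dma_pages_used_by_kernel);	/* before free_area_init_node(s) */
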
2095#ifndef CONFIG_NEED_MULTIPLE_NODES 2740#ifndef CONFIG_NEED_MULTIPLE_NODES
2096static bootmem_data_t contig_bootmem_data; 2741static bootmem_data_t contig_bootmem_data;
2097struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2742struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
diff --git a/mm/shmem.c b/mm/shmem.c
index 8631be45b40d..eda907c3a86a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1351,7 +1351,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1351 inode->i_mode = mode; 1351 inode->i_mode = mode;
1352 inode->i_uid = current->fsuid; 1352 inode->i_uid = current->fsuid;
1353 inode->i_gid = current->fsgid; 1353 inode->i_gid = current->fsgid;
1354 inode->i_blksize = PAGE_CACHE_SIZE;
1355 inode->i_blocks = 0; 1354 inode->i_blocks = 0;
1356 inode->i_mapping->a_ops = &shmem_aops; 1355 inode->i_mapping->a_ops = &shmem_aops;
1357 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1356 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -2157,8 +2156,7 @@ static int init_inodecache(void)
2157 2156
2158static void destroy_inodecache(void) 2157static void destroy_inodecache(void)
2159{ 2158{
2160 if (kmem_cache_destroy(shmem_inode_cachep)) 2159 kmem_cache_destroy(shmem_inode_cachep);
2161 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2162} 2160}
2163 2161
2164static const struct address_space_operations shmem_aops = { 2162static const struct address_space_operations shmem_aops = {
diff --git a/mm/slab.c b/mm/slab.c
index 7a48eb1a60c8..792bfe320a8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -972,7 +972,39 @@ static int transfer_objects(struct array_cache *to,
972 return nr; 972 return nr;
973} 973}
974 974
975#ifdef CONFIG_NUMA 975#ifndef CONFIG_NUMA
976
977#define drain_alien_cache(cachep, alien) do { } while (0)
978#define reap_alien(cachep, l3) do { } while (0)
979
980static inline struct array_cache **alloc_alien_cache(int node, int limit)
981{
982 return (struct array_cache **)BAD_ALIEN_MAGIC;
983}
984
985static inline void free_alien_cache(struct array_cache **ac_ptr)
986{
987}
988
989static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
990{
991 return 0;
992}
993
994static inline void *alternate_node_alloc(struct kmem_cache *cachep,
995 gfp_t flags)
996{
997 return NULL;
998}
999
1000static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1001 gfp_t flags, int nodeid)
1002{
1003 return NULL;
1004}
1005
1006#else /* CONFIG_NUMA */
1007
976static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1008static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
977static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1009static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
978 1010
@@ -1101,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1101 } 1133 }
1102 return 1; 1134 return 1;
1103} 1135}
1104
1105#else
1106
1107#define drain_alien_cache(cachep, alien) do { } while (0)
1108#define reap_alien(cachep, l3) do { } while (0)
1109
1110static inline struct array_cache **alloc_alien_cache(int node, int limit)
1111{
1112 return (struct array_cache **)BAD_ALIEN_MAGIC;
1113}
1114
1115static inline void free_alien_cache(struct array_cache **ac_ptr)
1116{
1117}
1118
1119static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1120{
1121 return 0;
1122}
1123
1124#endif 1136#endif
1125 1137
1126static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1138static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1564,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1564 */ 1576 */
1565 flags |= __GFP_COMP; 1577 flags |= __GFP_COMP;
1566#endif 1578#endif
1567 flags |= cachep->gfpflags; 1579
1580 /*
1581 * Under NUMA we want memory on the indicated node. We will handle
1582 * the needed fallback ourselves since we want to serve from our
1583 * per node object lists first for other nodes.
1584 */
1585 flags |= cachep->gfpflags | GFP_THISNODE;
1568 1586
1569 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1587 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1570 if (!page) 1588 if (!page)
@@ -2442,7 +2460,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2442 * @cachep: the cache to destroy 2460 * @cachep: the cache to destroy
2443 * 2461 *
2444 * Remove a struct kmem_cache object from the slab cache. 2462 * Remove a struct kmem_cache object from the slab cache.
2445 * Returns 0 on success.
2446 * 2463 *
2447 * It is expected this function will be called by a module when it is 2464 * It is expected this function will be called by a module when it is
2448 * unloaded. This will remove the cache completely, and avoid a duplicate 2465 * unloaded. This will remove the cache completely, and avoid a duplicate
@@ -2454,7 +2471,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2454 * The caller must guarantee that noone will allocate memory from the cache 2471 * The caller must guarantee that noone will allocate memory from the cache
2455 * during the kmem_cache_destroy(). 2472 * during the kmem_cache_destroy().
2456 */ 2473 */
2457int kmem_cache_destroy(struct kmem_cache *cachep) 2474void kmem_cache_destroy(struct kmem_cache *cachep)
2458{ 2475{
2459 BUG_ON(!cachep || in_interrupt()); 2476 BUG_ON(!cachep || in_interrupt());
2460 2477
@@ -2475,7 +2492,7 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2475 list_add(&cachep->next, &cache_chain); 2492 list_add(&cachep->next, &cache_chain);
2476 mutex_unlock(&cache_chain_mutex); 2493 mutex_unlock(&cache_chain_mutex);
2477 unlock_cpu_hotplug(); 2494 unlock_cpu_hotplug();
2478 return 1; 2495 return;
2479 } 2496 }
2480 2497
2481 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2498 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
@@ -2483,7 +2500,6 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2483 2500
2484 __kmem_cache_destroy(cachep); 2501 __kmem_cache_destroy(cachep);
2485 unlock_cpu_hotplug(); 2502 unlock_cpu_hotplug();
2486 return 0;
2487} 2503}
2488EXPORT_SYMBOL(kmem_cache_destroy); 2504EXPORT_SYMBOL(kmem_cache_destroy);
2489 2505
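With kmem_cache_destroy() now returning void, callers lose the rarely useful "not all structures were freed" indication; the shmem and slob hunks in this same patch show the matching caller-side cleanup. A minimal caller sketch, with a hypothetical cache pointer my_cachep:

static struct kmem_cache *my_cachep;

/* Before: if (kmem_cache_destroy(my_cachep))
 *                 printk(KERN_INFO "my_cache: not all structures were freed\n");
 * After: destruction is fire-and-forget; the caller must already have
 * freed every object and must prevent further allocations from the cache. */
static void my_module_cleanup(void)
{
	kmem_cache_destroy(my_cachep);
}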
@@ -3030,14 +3046,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3030 void *objp; 3046 void *objp;
3031 struct array_cache *ac; 3047 struct array_cache *ac;
3032 3048
3033#ifdef CONFIG_NUMA
3034 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3035 objp = alternate_node_alloc(cachep, flags);
3036 if (objp != NULL)
3037 return objp;
3038 }
3039#endif
3040
3041 check_irq_off(); 3049 check_irq_off();
3042 ac = cpu_cache_get(cachep); 3050 ac = cpu_cache_get(cachep);
3043 if (likely(ac->avail)) { 3051 if (likely(ac->avail)) {
@@ -3055,12 +3063,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3055 gfp_t flags, void *caller) 3063 gfp_t flags, void *caller)
3056{ 3064{
3057 unsigned long save_flags; 3065 unsigned long save_flags;
3058 void *objp; 3066 void *objp = NULL;
3059 3067
3060 cache_alloc_debugcheck_before(cachep, flags); 3068 cache_alloc_debugcheck_before(cachep, flags);
3061 3069
3062 local_irq_save(save_flags); 3070 local_irq_save(save_flags);
3063 objp = ____cache_alloc(cachep, flags); 3071
3072 if (unlikely(NUMA_BUILD &&
3073 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3074 objp = alternate_node_alloc(cachep, flags);
3075
3076 if (!objp)
3077 objp = ____cache_alloc(cachep, flags);
3078 /*
3079 * We may just have run out of memory on the local node.
3080 * __cache_alloc_node() knows how to locate memory on other nodes
3081 */
3082 if (NUMA_BUILD && !objp)
3083 objp = __cache_alloc_node(cachep, flags, numa_node_id());
3064 local_irq_restore(save_flags); 3084 local_irq_restore(save_flags);
3065 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3085 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3066 caller); 3086 caller);
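Read as a whole, the rewritten __cache_alloc() tries its allocation sources in a fixed order. A condensed restatement of that ordering, stripped of the debug hooks and IRQ handling shown in the hunk above:

void *objp = NULL;

/* 1. honour slab memory spreading / mempolicy when the task requests it */
if (NUMA_BUILD && (current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
	objp = alternate_node_alloc(cachep, flags);
/* 2. fast path: per-CPU array cache, refilled from local node lists */
if (!objp)
	objp = ____cache_alloc(cachep, flags);
/* 3. local node exhausted: take the per-node slow path, which may in
 *    turn fall back to other nodes via fallback_alloc() */
if (NUMA_BUILD && !objp)
	objp = __cache_alloc_node(cachep, flags, numa_node_id());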
@@ -3079,7 +3099,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3079{ 3099{
3080 int nid_alloc, nid_here; 3100 int nid_alloc, nid_here;
3081 3101
3082 if (in_interrupt()) 3102 if (in_interrupt() || (flags & __GFP_THISNODE))
3083 return NULL; 3103 return NULL;
3084 nid_alloc = nid_here = numa_node_id(); 3104 nid_alloc = nid_here = numa_node_id();
3085 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3105 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3092,6 +3112,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3092} 3112}
3093 3113
3094/* 3114/*
3115 * Fallback function if there was no memory available and no objects on a
3116 * certain node and we are allowed to fall back. We mimic the behavior of
3117 * the page allocator. We fall back according to a zonelist determined by
3118 * the policy layer while obeying cpuset constraints.
3119 */
3120void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3121{
3122 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3123 ->node_zonelists[gfp_zone(flags)];
3124 struct zone **z;
3125 void *obj = NULL;
3126
3127 for (z = zonelist->zones; *z && !obj; z++)
3128 if (zone_idx(*z) <= ZONE_NORMAL &&
3129 cpuset_zone_allowed(*z, flags))
3130 obj = __cache_alloc_node(cache,
3131 flags | __GFP_THISNODE,
3132 zone_to_nid(*z));
3133 return obj;
3134}
3135
3136/*
3095 * A interface to enable slab creation on nodeid 3137 * A interface to enable slab creation on nodeid
3096 */ 3138 */
3097static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3139static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3144,11 +3186,15 @@ retry:
3144must_grow: 3186must_grow:
3145 spin_unlock(&l3->list_lock); 3187 spin_unlock(&l3->list_lock);
3146 x = cache_grow(cachep, flags, nodeid); 3188 x = cache_grow(cachep, flags, nodeid);
3189 if (x)
3190 goto retry;
3147 3191
3148 if (!x) 3192 if (!(flags & __GFP_THISNODE))
3149 return NULL; 3193 /* Unable to grow the cache. Fall back to other nodes. */
3194 return fallback_alloc(cachep, flags);
3195
3196 return NULL;
3150 3197
3151 goto retry;
3152done: 3198done:
3153 return obj; 3199 return obj;
3154} 3200}
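fallback_alloc() and __cache_alloc_node() are mutually recursive in principle, so __GFP_THISNODE doubles as the recursion guard: fallback_alloc() sets it when it retries a specific node, and __cache_alloc_node() only falls back when it is clear. A sketch of that contract (try_this_node is an illustrative stand-in for the grow-and-retry logic above):

static void *alloc_on_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
	void *obj = try_this_node(cachep, flags, nodeid);	/* illustrative */

	if (obj)
		return obj;
	if (flags & __GFP_THISNODE)
		return NULL;		/* caller pinned the node; do not wander */
	/* walk the mempolicy zonelist, pinning each candidate node in turn */
	return fallback_alloc(cachep, flags);
}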
diff --git a/mm/slob.c b/mm/slob.c
index 20188627347c..542394184a58 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
270} 270}
271EXPORT_SYMBOL(kmem_cache_create); 271EXPORT_SYMBOL(kmem_cache_create);
272 272
273int kmem_cache_destroy(struct kmem_cache *c) 273void kmem_cache_destroy(struct kmem_cache *c)
274{ 274{
275 slob_free(c, sizeof(struct kmem_cache)); 275 slob_free(c, sizeof(struct kmem_cache));
276 return 0;
277} 276}
278EXPORT_SYMBOL(kmem_cache_destroy); 277EXPORT_SYMBOL(kmem_cache_destroy);
279 278
diff --git a/mm/truncate.c b/mm/truncate.c
index c6ab55ec6883..a654928323dc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/swap.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/pagevec.h> 15#include <linux/pagevec.h>
@@ -52,36 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
52/* 53/*
53 * This is for invalidate_inode_pages(). That function can be called at 54 * This is for invalidate_inode_pages(). That function can be called at
54 * any time, and is not supposed to throw away dirty pages. But pages can 55 * any time, and is not supposed to throw away dirty pages. But pages can
55 * be marked dirty at any time too. So we re-check the dirtiness inside 56 * be marked dirty at any time too, so use remove_mapping which safely
56 * ->tree_lock. That provides exclusion against the __set_page_dirty 57 * discards clean, unused pages.
57 * functions.
58 * 58 *
59 * Returns non-zero if the page was successfully invalidated. 59 * Returns non-zero if the page was successfully invalidated.
60 */ 60 */
61static int 61static int
62invalidate_complete_page(struct address_space *mapping, struct page *page) 62invalidate_complete_page(struct address_space *mapping, struct page *page)
63{ 63{
64 int ret;
65
64 if (page->mapping != mapping) 66 if (page->mapping != mapping)
65 return 0; 67 return 0;
66 68
67 if (PagePrivate(page) && !try_to_release_page(page, 0)) 69 if (PagePrivate(page) && !try_to_release_page(page, 0))
68 return 0; 70 return 0;
69 71
70 write_lock_irq(&mapping->tree_lock); 72 ret = remove_mapping(mapping, page);
71 if (PageDirty(page))
72 goto failed;
73 if (page_count(page) != 2) /* caller's ref + pagecache ref */
74 goto failed;
75
76 BUG_ON(PagePrivate(page));
77 __remove_from_page_cache(page);
78 write_unlock_irq(&mapping->tree_lock);
79 ClearPageUptodate(page); 73 ClearPageUptodate(page);
80 page_cache_release(page); /* pagecache ref */ 74
81 return 1; 75 return ret;
82failed:
83 write_unlock_irq(&mapping->tree_lock);
84 return 0;
85} 76}
86 77
87/** 78/**
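The rewritten invalidate_complete_page() delegates the "is this page really discardable?" decision to remove_mapping(), which this patch also extends in mm/vmscan.c: the page is detached only when it is clean and holds exactly the pagecache reference plus the caller's. Consolidated, the post-patch function reads roughly as below (a sketch assembled from the hunk, not a new helper):

static int invalidate_complete_page(struct address_space *mapping,
				    struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;			/* already truncated or remapped */
	if (PagePrivate(page) && !try_to_release_page(page, 0))
		return 0;			/* buffers could not be dropped */

	ret = remove_mapping(mapping, page);	/* discards only clean, unused pages */
	ClearPageUptodate(page);
	return ret;
}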
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9aad8b0cc6ee..1ac191ce5641 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -241,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
241 241
242/** 242/**
243 * get_vm_area - reserve a contingous kernel virtual area 243 * get_vm_area - reserve a contingous kernel virtual area
244 *
245 * @size: size of the area 244 * @size: size of the area
246 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 245 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
247 * 246 *
@@ -273,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr)
273} 272}
274 273
275/* Caller must hold vmlist_lock */ 274/* Caller must hold vmlist_lock */
276struct vm_struct *__remove_vm_area(void *addr) 275static struct vm_struct *__remove_vm_area(void *addr)
277{ 276{
278 struct vm_struct **p, *tmp; 277 struct vm_struct **p, *tmp;
279 278
@@ -296,7 +295,6 @@ found:
296 295
297/** 296/**
298 * remove_vm_area - find and remove a contingous kernel virtual area 297 * remove_vm_area - find and remove a contingous kernel virtual area
299 *
300 * @addr: base address 298 * @addr: base address
301 * 299 *
302 * Search for the kernel VM area starting at @addr, and remove it. 300 * Search for the kernel VM area starting at @addr, and remove it.
@@ -355,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages)
355 353
356/** 354/**
357 * vfree - release memory allocated by vmalloc() 355 * vfree - release memory allocated by vmalloc()
358 *
359 * @addr: memory base address 356 * @addr: memory base address
360 * 357 *
361 * Free the virtually contiguous memory area starting at @addr, as 358 * Free the virtually contiguous memory area starting at @addr, as
@@ -373,7 +370,6 @@ EXPORT_SYMBOL(vfree);
373 370
374/** 371/**
375 * vunmap - release virtual mapping obtained by vmap() 372 * vunmap - release virtual mapping obtained by vmap()
376 *
377 * @addr: memory base address 373 * @addr: memory base address
378 * 374 *
379 * Free the virtually contiguous memory area starting at @addr, 375 * Free the virtually contiguous memory area starting at @addr,
@@ -390,7 +386,6 @@ EXPORT_SYMBOL(vunmap);
390 386
391/** 387/**
392 * vmap - map an array of pages into virtually contiguous space 388 * vmap - map an array of pages into virtually contiguous space
393 *
394 * @pages: array of page pointers 389 * @pages: array of page pointers
395 * @count: number of pages to map 390 * @count: number of pages to map
396 * @flags: vm_area->flags 391 * @flags: vm_area->flags
@@ -471,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
471 466
472/** 467/**
473 * __vmalloc_node - allocate virtually contiguous memory 468 * __vmalloc_node - allocate virtually contiguous memory
474 *
475 * @size: allocation size 469 * @size: allocation size
476 * @gfp_mask: flags for the page level allocator 470 * @gfp_mask: flags for the page level allocator
477 * @prot: protection mask for the allocated pages 471 * @prot: protection mask for the allocated pages
@@ -505,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc);
505 499
506/** 500/**
507 * vmalloc - allocate virtually contiguous memory 501 * vmalloc - allocate virtually contiguous memory
508 *
509 * @size: allocation size 502 * @size: allocation size
510 *
511 * Allocate enough pages to cover @size from the page level 503 * Allocate enough pages to cover @size from the page level
512 * allocator and map them into contiguous kernel virtual space. 504 * allocator and map them into contiguous kernel virtual space.
513 * 505 *
@@ -521,11 +513,11 @@ void *vmalloc(unsigned long size)
521EXPORT_SYMBOL(vmalloc); 513EXPORT_SYMBOL(vmalloc);
522 514
523/** 515/**
524 * vmalloc_user - allocate virtually contiguous memory which has 516 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
525 * been zeroed so it can be mapped to userspace without 517 * @size: allocation size
526 * leaking data.
527 * 518 *
528 * @size: allocation size 519 * The resulting memory area is zeroed so it can be mapped to userspace
520 * without leaking data.
529 */ 521 */
530void *vmalloc_user(unsigned long size) 522void *vmalloc_user(unsigned long size)
531{ 523{
@@ -544,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user);
544 536
545/** 537/**
546 * vmalloc_node - allocate memory on a specific node 538 * vmalloc_node - allocate memory on a specific node
547 *
548 * @size: allocation size 539 * @size: allocation size
549 * @node: numa node 540 * @node: numa node
550 * 541 *
@@ -566,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node);
566 557
567/** 558/**
568 * vmalloc_exec - allocate virtually contiguous, executable memory 559 * vmalloc_exec - allocate virtually contiguous, executable memory
569 *
570 * @size: allocation size 560 * @size: allocation size
571 * 561 *
572 * Kernel-internal function to allocate enough pages to cover @size 562 * Kernel-internal function to allocate enough pages to cover @size
@@ -584,7 +574,6 @@ void *vmalloc_exec(unsigned long size)
584 574
585/** 575/**
586 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 576 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
587 *
588 * @size: allocation size 577 * @size: allocation size
589 * 578 *
590 * Allocate enough 32bit PA addressable pages to cover @size from the 579 * Allocate enough 32bit PA addressable pages to cover @size from the
@@ -597,11 +586,11 @@ void *vmalloc_32(unsigned long size)
597EXPORT_SYMBOL(vmalloc_32); 586EXPORT_SYMBOL(vmalloc_32);
598 587
599/** 588/**
600 * vmalloc_32_user - allocate virtually contiguous memory (32bit 589 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
601 * addressable) which is zeroed so it can be
602 * mapped to userspace without leaking data.
603 *
604 * @size: allocation size 590 * @size: allocation size
591 *
592 * The resulting memory area is 32bit addressable and zeroed so it can be
593 * mapped to userspace without leaking data.
605 */ 594 */
606void *vmalloc_32_user(unsigned long size) 595void *vmalloc_32_user(unsigned long size)
607{ 596{
@@ -695,7 +684,6 @@ finished:
695 684
696/** 685/**
697 * remap_vmalloc_range - map vmalloc pages to userspace 686 * remap_vmalloc_range - map vmalloc pages to userspace
698 *
699 * @vma: vma to cover (map full range of vma) 687 * @vma: vma to cover (map full range of vma)
700 * @addr: vmalloc memory 688 * @addr: vmalloc memory
701 * @pgoff: number of pages into addr before first page to map 689 * @pgoff: number of pages into addr before first page to map
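Most of the vmalloc.c changes are kerneldoc cleanups; the reworded vmalloc_user() and vmalloc_32_user() comments describe memory that is pre-zeroed precisely so a driver can expose it to userspace. A minimal sketch of that pairing with remap_vmalloc_range() (demo_buf and demo_mmap are illustrative names, not part of this patch):

static void *demo_buf;	/* set up elsewhere: demo_buf = vmalloc_user(len); */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* map the whole buffer, starting at page offset 0 of the vmalloc area */
	return remap_vmalloc_range(vma, demo_buf, 0);
}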
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 87779dda4ec6..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmstat.h>
22#include <linux/file.h> 23#include <linux/file.h>
23#include <linux/writeback.h> 24#include <linux/writeback.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
@@ -370,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
370 /* synchronous write or broken a_ops? */ 371 /* synchronous write or broken a_ops? */
371 ClearPageReclaim(page); 372 ClearPageReclaim(page);
372 } 373 }
373 374 inc_zone_page_state(page, NR_VMSCAN_WRITE);
374 return PAGE_SUCCESS; 375 return PAGE_SUCCESS;
375 } 376 }
376 377
@@ -383,11 +384,30 @@ int remove_mapping(struct address_space *mapping, struct page *page)
383 BUG_ON(mapping != page_mapping(page)); 384 BUG_ON(mapping != page_mapping(page));
384 385
385 write_lock_irq(&mapping->tree_lock); 386 write_lock_irq(&mapping->tree_lock);
386
387 /* 387 /*
388 * The non-racy check for busy page. It is critical to check 388 * The non-racy check for a busy page.
389 * PageDirty _after_ making sure that the page is freeable and 389 *
390 * not in use by anybody. (pagecache + us == 2) 390 * Must be careful with the order of the tests. When someone has
391 * a ref to the page, it may be possible that they dirty it then
392 * drop the reference. So if PageDirty is tested before page_count
393 * here, then the following race may occur:
394 *
395 * get_user_pages(&page);
396 * [user mapping goes away]
397 * write_to(page);
398 * !PageDirty(page) [good]
399 * SetPageDirty(page);
400 * put_page(page);
401 * !page_count(page) [good, discard it]
402 *
403 * [oops, our write_to data is lost]
404 *
405 * Reversing the order of the tests ensures such a situation cannot
406 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
407 * load is not satisfied before that of page->_count.
408 *
409 * Note that if SetPageDirty is always performed via set_page_dirty,
410 * and thus under tree_lock, then this ordering is not required.
391 */ 411 */
392 if (unlikely(page_count(page) != 2)) 412 if (unlikely(page_count(page) != 2))
393 goto cannot_free; 413 goto cannot_free;
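The new comment in remove_mapping() is about test ordering: the reference count must be sampled before the dirty bit, with a read barrier in between, so a racing SetPageDirty() followed by put_page() cannot slip a dirty page past the check. A compressed sketch of that ordering (hypothetical helper; the real code runs under mapping->tree_lock):

static int freeable_and_clean(struct page *page)
{
	if (page_count(page) != 2)	/* pagecache ref + caller's ref */
		return 0;
	smp_rmb();			/* order the ->_count load before the ->flags load */
	if (PageDirty(page))
		return 0;
	return 1;
}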
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 490d8c1a0ded..a2b6a9f96e5c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -371,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
371 __inc_zone_state(z, NUMA_MISS); 371 __inc_zone_state(z, NUMA_MISS);
372 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); 372 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
373 } 373 }
374 if (z->zone_pgdat == NODE_DATA(numa_node_id())) 374 if (z->node == numa_node_id())
375 __inc_zone_state(z, NUMA_LOCAL); 375 __inc_zone_state(z, NUMA_LOCAL);
376 else 376 else
377 __inc_zone_state(z, NUMA_OTHER); 377 __inc_zone_state(z, NUMA_OTHER);
@@ -465,6 +465,7 @@ static char *vmstat_text[] = {
465 "nr_writeback", 465 "nr_writeback",
466 "nr_unstable", 466 "nr_unstable",
467 "nr_bounce", 467 "nr_bounce",
468 "nr_vmscan_write",
468 469
469#ifdef CONFIG_NUMA 470#ifdef CONFIG_NUMA
470 "numa_hit", 471 "numa_hit",