11 files changed, 479 insertions, 145 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8a..9abc6008544b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
+int randomize_va_space __read_mostly = 1;
+/*
+ * Handler for the "norandmaps" boot parameter: clears randomize_va_space.
+ *
+ * Returns 1 so that the boot-parameter code treats the option as consumed.
+ * Returning 0 would report "norandmaps" as not handled, and it would then
+ * be passed on as an unknown argument.
+ */
+static int __init disable_randmaps(char *s)
+{
+        randomize_va_space = 0;
+        return 1;
+}
+__setup("norandmaps", disable_randmaps);
 /*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f3..1fe76d963ac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
                onlined_pages++;
        }
        zone->present_pages += onlined_pages;
+        zone->zone_pgdat->node_present_pages += onlined_pages;
        setup_per_zone_pages_min();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b75..954981b14303 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
        struct zonelist *zl;
-        int num, max, nd;
+        int num, max, nd, k;
        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
-        for_each_node_mask(nd, *nodes)
+        /* First put in the highest zones from all nodes, then all the next 
-                zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+           lower zones etc. Avoid empty zones because the memory allocator
+           doesn't like them. If you implement node hot removal you
+           have to fix that. */
+        for (k = policy_zone; k >= 0; k--) { 
+                for_each_node_mask(nd, *nodes) { 
+                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
+                        if (z->present_pages > 0) 
+                                zl->zones[num++] = z;
+                }
+        }
        zl->zones[num] = NULL;
        return zl;
 }
@@ -187,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
        return policy;
 }
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);
@@ -229,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        continue;
                if (flags & MPOL_MF_STATS)
-                        gather_stats(page, private);
+                        gather_stats(page, private, pte_dirty(*pte));
                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                        migrate_page_add(page, private, flags);
                else
@@ -542,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
         */
        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
                if (isolate_lru_page(page))
-                        list_add(&page->lru, pagelist);
+                        list_add_tail(&page->lru, pagelist);
        }
 }
@@ -559,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
        LIST_HEAD(moved);
        LIST_HEAD(failed);
        int err = 0;
+        unsigned long offset = 0;
        int nr_pages;
        struct page *page;
        struct list_head *p;
@@ -566,8 +577,21 @@ static int migrate_pages_to(struct list_head *pagelist,
 redo:
        nr_pages = 0;
        list_for_each(p, pagelist) {
-                if (vma)
+                if (vma) {
-                        page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+                        /*
+                         * The address passed to alloc_page_vma is used to
+                         * generate the proper interleave behavior. We fake
+                         * the address here by an increasing offset in order
+                         * to get the proper distribution of pages.
+                         *
+                         * No decision has been made as to which page
+                         * a certain old page is moved to so we cannot
+                         * specify the correct address.
+                         */
+                        page = alloc_page_vma(GFP_HIGHUSER, vma,
+                                        offset + vma->vm_start);
+                        offset += PAGE_SIZE;
+                }
                else
                        page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
@@ -575,9 +599,9 @@ redo:
                        err = -ENOMEM;
                        goto out;
                }
-                list_add(&page->lru, &newlist);
+                list_add_tail(&page->lru, &newlist);
                nr_pages++;
-                if (nr_pages > MIGRATE_CHUNK_SIZE);
+                if (nr_pages > MIGRATE_CHUNK_SIZE)
                        break;
        }
        err = migrate_pages(pagelist, &newlist, &moved, &failed);
@@ -798,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;
+        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+                return -EINVAL;
        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
@@ -928,7 +954,8 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                goto out;
        }
-        err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+        err = do_migrate_pages(mm, &old, &new,
+                capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
        mmput(mm);
        return err;
@@ -1726,66 +1753,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 struct numa_maps {
        unsigned long pages;
        unsigned long anon;
-        unsigned long mapped;
+        unsigned long active;
+        unsigned long writeback;
        unsigned long mapcount_max;
+        unsigned long dirty;
+        unsigned long swapcache;
        unsigned long node[MAX_NUMNODES];
 };
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
 {
        struct numa_maps *md = private;
        int count = page_mapcount(page);
-        if (count)
+        md->pages++;
-                md->mapped++;
+        if (pte_dirty || PageDirty(page))
+                md->dirty++;
-        if (count > md->mapcount_max)
+        if (PageSwapCache(page))
-                md->mapcount_max = count;
+                md->swapcache++;
-        md->pages++;
+        if (PageActive(page))
+                md->active++;
+        if (PageWriteback(page))
+                md->writeback++;
        if (PageAnon(page))
                md->anon++;
+        if (count > md->mapcount_max)
+                md->mapcount_max = count;
        md->node[page_to_nid(page)]++;
        cond_resched();
 }
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Walk [start, end) of a huge page vma in HPAGE_SIZE steps and pass every
+ * populated huge page to gather_stats() so that it is accounted in @md.
+ * Addresses without a page table entry, or with an empty one, are skipped.
+ */
+static void check_huge_range(struct vm_area_struct *vma,
+                unsigned long start, unsigned long end,
+                struct numa_maps *md)
+{
+        unsigned long addr;
+        struct page *page;
+        for (addr = start; addr < end; addr += HPAGE_SIZE) {
+                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+                pte_t pte;
+                if (!ptep)
+                        continue;
+                /*
+                 * Take one snapshot of the entry and use it for every
+                 * test below, so the page and its dirty bit come from
+                 * the same read.
+                 */
+                pte = *ptep;
+                if (pte_none(pte))
+                        continue;
+                page = pte_page(pte);
+                if (!page)
+                        continue;
+                gather_stats(page, md, pte_dirty(pte));
+        }
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+                unsigned long start, unsigned long end,
+                struct numa_maps *md)
+{
+}
+#endif
 int show_numa_map(struct seq_file *m, void *v)
 {
        struct task_struct *task = m->private;
        struct vm_area_struct *vma = v;
        struct numa_maps *md;
+        struct file *file = vma->vm_file;
+        struct mm_struct *mm = vma->vm_mm;
        int n;
        char buffer[50];
-        if (!vma->vm_mm)
+        if (!mm)
                return 0;
        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
        if (!md)
                return 0;
-        check_pgd_range(vma, vma->vm_start, vma->vm_end,
+        mpol_to_str(buffer, sizeof(buffer),
-                    &node_online_map, MPOL_MF_STATS, md);
+                        get_vma_policy(task, vma, vma->vm_start));
-        if (md->pages) {
+        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
-                mpol_to_str(buffer, sizeof(buffer),
-                            get_vma_policy(task, vma, vma->vm_start));
-                seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
+        if (file) {
-                           vma->vm_start, buffer, md->pages,
+                seq_printf(m, " file=");
-                           md->mapped, md->mapcount_max);
+                seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+                seq_printf(m, " heap");
+        } else if (vma->vm_start <= mm->start_stack &&
+                        vma->vm_end >= mm->start_stack) {
+                seq_printf(m, " stack");
+        }
-                if (md->anon)
+        if (is_vm_hugetlb_page(vma)) {
-                        seq_printf(m," anon=%lu",md->anon);
+                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+                seq_printf(m, " huge");
+        } else {
+                check_pgd_range(vma, vma->vm_start, vma->vm_end,
+                                &node_online_map, MPOL_MF_STATS, md);
+        }
-                for_each_online_node(n)
+        if (!md->pages)
-                        if (md->node[n])
+                goto out;
-                                seq_printf(m, " N%d=%lu", n, md->node[n]);
-                seq_putc(m, '\n');
+        if (md->anon)
-        }
+                seq_printf(m," anon=%lu",md->anon);
+        if (md->dirty)
+                seq_printf(m," dirty=%lu",md->dirty);
+        if (md->pages != md->anon && md->pages != md->dirty)
+                seq_printf(m, " mapped=%lu", md->pages);
+        if (md->mapcount_max > 1)
+                seq_printf(m, " mapmax=%lu", md->mapcount_max);
+        if (md->swapcache)
+                seq_printf(m," swapcache=%lu", md->swapcache);
+        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+                seq_printf(m," active=%lu", md->active);
+        if (md->writeback)
+                seq_printf(m," writeback=%lu", md->writeback);
+        for_each_online_node(n)
+                if (md->node[n])
+                        seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+        seq_putc(m, '\n');
        kfree(md);
        if (m->count < m->size)
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d68232..4951f4786f28 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,10 +53,11 @@ DECLARE_RWSEM(nommu_vma_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
-EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
 /*
 * Handle all mappings that got truncated by a "truncate()"
@@ -203,6 +204,13 @@ void *vmalloc(unsigned long size)
 {
       return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
+EXPORT_SYMBOL(vmalloc);
+/*
+ * vmalloc_node - node-taking variant of vmalloc() for this (nommu) file.
+ * @size: allocation size, forwarded unchanged
+ * @node: not used here; the request is simply handed to vmalloc()
+ *
+ * Returns whatever vmalloc(size) returns.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+        return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
 /*
 *      vmalloc_32  -  allocate virtually continguos memory (32bit addressable)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f2a562..78747afad6b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
        /*
         * Processes which fork a lot of child processes are likely
-         * a good choice. We add the vmsize of the children if they
+         * a good choice. We add half the vmsize of the children if they
         * have an own mm. This prevents forking servers to flood the
-         * machine with an endless amount of children
+         * machine with an endless amount of children. In case a single
+         * child is eating the vast majority of memory, adding only half
+         * to the parents will make the child our kill candidate of choice.
         */
        list_for_each(tsk, &p->children) {
                struct task_struct *chld;
                chld = list_entry(tsk, struct task_struct, sibling);
                if (chld->mm != p->mm && chld->mm)
-                        points += chld->mm->total_vm;
+                        points += chld->mm->total_vm/2 + 1;
        }
        /*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 }
 /*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+#define CONSTRAINT_NONE 1
+#define CONSTRAINT_MEMORY_POLICY 2
+#define CONSTRAINT_CPUSET 3
+/*
+ * Classify what restricted the nodes this allocation could use.
+ *
+ * Starting from the set of online nodes, every node that owns a zone in
+ * @zonelist is struck off.  A zone the cpuset does not allow for @gfp_mask
+ * ends the walk at once with CONSTRAINT_CPUSET.  If online nodes are left
+ * over afterwards the zonelist did not span them all, which is reported as
+ * CONSTRAINT_MEMORY_POLICY.  Otherwise, and always without CONFIG_NUMA,
+ * the answer is CONSTRAINT_NONE.
+ */
+static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+        nodemask_t uncovered = node_online_map;
+        struct zone **zp = zonelist->zones;
+        while (*zp) {
+                if (!cpuset_zone_allowed(*zp, gfp_mask))
+                        return CONSTRAINT_CPUSET;
+                node_clear((*zp)->zone_pgdat->node_id, uncovered);
+                zp++;
+        }
+        if (!nodes_empty(uncovered))
+                return CONSTRAINT_MEMORY_POLICY;
+#endif
+        return CONSTRAINT_NONE;
+}
+/*
 * Simple selection loop. We chose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
-static struct task_struct * select_bad_process(void)
+static struct task_struct *select_bad_process(unsigned long *ppoints)
 {
-        unsigned long maxpoints = 0;
        struct task_struct *g, *p;
        struct task_struct *chosen = NULL;
        struct timespec uptime;
+        *ppoints = 0;
        do_posix_clock_monotonic_gettime(&uptime);
        do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
                        return p;
                points = badness(p, uptime.tv_sec);
-                if (points > maxpoints || !chosen) {
+                if (points > *ppoints || !chosen) {
                        chosen = p;
-                        maxpoints = points;
+                        *ppoints = points;
                }
        } while_each_thread(g, p);
        return chosen;
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
 * we select a process with CAP_SYS_RAW_IO set).
 */
-static void __oom_kill_task(task_t *p)
+static void __oom_kill_task(task_t *p, const char *message)
 {
        if (p->pid == 1) {
                WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
                return;
        }
        task_unlock(p);
-        printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
+        printk(KERN_ERR "%s: Killed process %d (%s).\n",
-                                                        p->pid, p->comm);
+                                message, p->pid, p->comm);
        /*
         * We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
        force_sig(SIGKILL, p);
 }
-static struct mm_struct *oom_kill_task(task_t *p)
+static struct mm_struct *oom_kill_task(task_t *p, const char *message)
 {
        struct mm_struct *mm = get_task_mm(p);
        task_t * g, * q;
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p)
                return NULL;
        }
-        __oom_kill_task(p);
+        __oom_kill_task(p, message);
        /*
         * kill all processes that share the ->mm (i.e. all threads),
         * but are in a different thread group
         */
        do_each_thread(g, q)
                if (q->mm == mm && q->tgid != p->tgid)
-                        __oom_kill_task(q);
+                        __oom_kill_task(q, message);
        while_each_thread(g, q);
        return mm;
 }
-static struct mm_struct *oom_kill_process(struct task_struct *p)
+/*
+ * Log the selected task and its score, then try to kill one of its
+ * children that does not share its mm.  Only when no child could be killed
+ * is @p itself killed.  @message is handed through to oom_kill_task().
+ * Returns the value oom_kill_task() returned for the task that was killed.
+ */
+static struct mm_struct *oom_kill_process(struct task_struct *p,
+                                unsigned long points, const char *message)
 {
        struct mm_struct *mm;
        struct task_struct *c;
        struct list_head *tsk;
+        /* points is an unsigned long, hence %lu */
+        printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %lu and "
+                "children.\n", p->pid, p->comm, points);
        /* Try to kill a child first */
        list_for_each(tsk, &p->children) {
                c = list_entry(tsk, struct task_struct, sibling);
                if (c->mm == p->mm)
                        continue;
-                mm = oom_kill_task(c);
+                mm = oom_kill_task(c, message);
                if (mm)
                        return mm;
        }
-        return oom_kill_task(p);
+        return oom_kill_task(p, message);
 }
 /**
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
-void out_of_memory(gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
        struct mm_struct *mm = NULL;
-        task_t * p;
+        task_t *p;
+        unsigned long points = 0;
        if (printk_ratelimit()) {
                printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -277,24 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order)
        cpuset_lock();
        read_lock(&tasklist_lock);
+        /*
+         * Check if there were limitations on the allocation (only relevant for
+         * NUMA) that may require different handling.
+         */
+        switch (constrained_alloc(zonelist, gfp_mask)) {
+        case CONSTRAINT_MEMORY_POLICY:
+                mm = oom_kill_process(current, points,
+                                "No available memory (MPOL_BIND)");
+                break;
+        case CONSTRAINT_CPUSET:
+                mm = oom_kill_process(current, points,
+                                "No available memory in cpuset");
+                break;
+        case CONSTRAINT_NONE:
 retry:
-        p = select_bad_process();
+                /*
+                 * Rambo mode: Shoot down a process and hope it solves whatever
+                 * issues we may have.
+                 */
+                p = select_bad_process(&points);
-        if (PTR_ERR(p) == -1UL)
+                if (PTR_ERR(p) == -1UL)
-                goto out;
+                        goto out;
-        /* Found nothing?!?! Either we hang forever, or we panic. */
+                /* Found nothing?!?! Either we hang forever, or we panic. */
-        if (!p) {
+                if (!p) {
-                read_unlock(&tasklist_lock);
+                        read_unlock(&tasklist_lock);
-                cpuset_unlock();
+                        cpuset_unlock();
-                panic("Out of memory and no killable processes...\n");
+                        panic("Out of memory and no killable processes...\n");
-        }
+                }
-        mm = oom_kill_process(p);
+                mm = oom_kill_process(p, points, "Out of memory");
-        if (!mm)
+                if (!mm)
-                goto retry;
+                        goto retry;
+                break;
+        }
- out:
+out:
        read_unlock(&tasklist_lock);
        cpuset_unlock();
        if (mm)
@@ -305,5 +365,5 @@ retry:
         * retry to allocate memory unless "p" is current
         */
        if (!test_thread_flag(TIF_MEMDIE))
-                schedule_timeout_interruptible(1);
+                schedule_timeout_uninterruptible(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62c122528587..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 }
 #ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
+/*
-void drain_remote_pages(void)
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
 {
-        struct zone *zone;
+        int i, z;
-        int i;
        unsigned long flags;
        local_irq_save(flags);
-        for_each_zone(zone) {
+        for (z = 0; z < MAX_NR_ZONES; z++) {
+                struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
                struct per_cpu_pageset *pset;
-                /* Do not drain local pagesets */
-                if (zone->zone_pgdat->node_id == numa_node_id())
-                        continue;
                pset = zone_pcp(zone, smp_processor_id());
                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                        struct per_cpu_pages *pcp;
@@ -1015,7 +1014,7 @@ rebalance:
                if (page)
                        goto got_pg;
-                out_of_memory(gfp_mask, order);
+                out_of_memory(zonelist, gfp_mask, order);
                goto restart;
        }
@@ -1541,29 +1540,29 @@ static int __initdata node_load[MAX_NUMNODES];
 */
 static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
 {
-        int i, n, val;
+        int n, val;
        int min_val = INT_MAX;
        int best_node = -1;
-        for_each_online_node(i) {
+        /* Use the local node if we haven't already */
-                cpumask_t tmp;
+        if (!node_isset(node, *used_node_mask)) {
+                node_set(node, *used_node_mask);
+                return node;
+        }
-                /* Start from local node */
+        for_each_online_node(n) {
-                n = (node+i) % num_online_nodes();
+                cpumask_t tmp;
                /* Don't want a node to appear more than once */
                if (node_isset(n, *used_node_mask))
                        continue;
-                /* Use the local node if we haven't already */
-                if (!node_isset(node, *used_node_mask)) {
-                        best_node = node;
-                        break;
-                }
                /* Use the distance array to find the distance */
                val = node_distance(node, n);
+                /* Penalize nodes under us ("prefer the next node") */
+                val += (n < node);
                /* Give preference to headless and unused nodes */
                tmp = node_to_cpumask(n);
                if (!cpus_empty(tmp))
diff --git a/mm/rmap.c b/mm/rmap.c
index df2c41c2a9a2..67f0e20b101f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -212,25 +212,33 @@ out:
 * through real pte's pointing to valid pages and then releasing
 * the page from the swap cache.
 *
- * Must hold page lock on page.
+ * Must hold page lock on page and mmap_sem of one vma that contains
+ * the page.
 */
 void remove_from_swap(struct page *page)
 {
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
+        unsigned long mapping;
-        if (!PageAnon(page) || !PageSwapCache(page))
+        if (!PageSwapCache(page))
                return;
-        anon_vma = page_lock_anon_vma(page);
+        mapping = (unsigned long)page->mapping;
-        if (!anon_vma)
+        if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
                return;
+        /*
+         * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+         */
+        anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+        spin_lock(&anon_vma->lock);
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
                remove_vma_swap(vma, page);
        spin_unlock(&anon_vma->lock);
        delete_from_swap_cache(page);
 }
 EXPORT_SYMBOL(remove_from_swap);
@@ -529,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
 */
 void page_add_file_rmap(struct page *page)
 {
-        BUG_ON(PageAnon(page));
-        BUG_ON(!pfn_valid(page_to_pfn(page)));
        if (atomic_inc_and_test(&page->_mapcount))
                __inc_page_state(nr_mapped);
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b812f92..7c455fbaff7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
 #include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
+#include <linux/ctype.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
 }
 #ifdef CONFIG_NUMA
+/*
+ * Parse the value of the tmpfs "mpol" mount option, which has the form
+ * "<policy>[:<nodelist>]".
+ *
+ * @value:        option value; split at the first ':' while parsing and
+ *                put back together before returning
+ * @policy:       set to MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND or
+ *                MPOL_INTERLEAVE once the policy name is recognised (this
+ *                happens even if the call then fails on the nodelist rules)
+ * @policy_nodes: filled by nodelist_parse(), or set to node_online_map for
+ *                "interleave" given without a nodelist
+ *
+ * Returns 0 on success.  Returns 1 if the policy name is unknown, the
+ * nodelist fails to parse, or the nodelist does not fit the policy.
+ */
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+        char *nodelist = strchr(value, ':');
+        int err = 1;
+        if (nodelist) {
+                /* NUL-terminate policy string */
+                *nodelist++ = '\0';
+                if (nodelist_parse(nodelist, *policy_nodes))
+                        goto out;
+        }
+        if (!strcmp(value, "default")) {
+                *policy = MPOL_DEFAULT;
+                /* Don't allow a nodelist */
+                if (!nodelist)
+                        err = 0;
+        } else if (!strcmp(value, "prefer")) {
+                *policy = MPOL_PREFERRED;
+                /* Insist on a nodelist of one node only */
+                if (nodelist) {
+                        char *rest = nodelist;
+                        /* digits only: a ',' or '-' leaves *rest non-NUL */
+                        while (isdigit(*rest))
+                                rest++;
+                        if (!*rest)
+                                err = 0;
+                }
+        } else if (!strcmp(value, "bind")) {
+                *policy = MPOL_BIND;
+                /* Insist on a nodelist */
+                if (nodelist)
+                        err = 0;
+        } else if (!strcmp(value, "interleave")) {
+                *policy = MPOL_INTERLEAVE;
+                /* Default to nodes online if no nodelist */
+                if (!nodelist)
+                        *policy_nodes = node_online_map;
+                err = 0;
+        }
+out:
+        /* Restore string for error message */
+        if (nodelist)
+                *--nodelist = ':';
+        return err;
+}
 static struct page *shmem_swapin_async(struct shared_policy *p,
                                       swp_entry_t entry, unsigned long idx)
 {
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
        return page;
 }
 #else
+/*
+ * Variant for builds without CONFIG_NUMA: always returns 1, i.e. the same
+ * value the NUMA version uses to report a bad "mpol" value.
+ */
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+        return 1;
+}
 static inline struct page *
 shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
 {
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 {
        char *this_char, *value, *rest;
-        while ((this_char = strsep(&options, ",")) != NULL) {
+        while (options != NULL) {
+                this_char = options;
+                for (;;) {
+                        /*
+                         * NUL-terminate this option: unfortunately,
+                         * mount options form a comma-separated list,
+                         * but mpol's nodelist may also contain commas.
+                         */
+                        options = strchr(options, ',');
+                        if (options == NULL)
+                                break;
+                        options++;
+                        if (!isdigit(*options)) {
+                                options[-1] = '\0';
+                                break;
+                        }
+                }
                if (!*this_char)
                        continue;
                if ((value = strchr(this_char,'=')) != NULL) {
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
                        if (*rest)
                                goto bad_val;
                } else if (!strcmp(this_char,"mpol")) {
-                        if (!strcmp(value,"default"))
+                        if (shmem_parse_mpol(value,policy,policy_nodes))
-                                *policy = MPOL_DEFAULT;
-                        else if (!strcmp(value,"preferred"))
-                                *policy = MPOL_PREFERRED;
-                        else if (!strcmp(value,"bind"))
-                                *policy = MPOL_BIND;
-                        else if (!strcmp(value,"interleave"))
-                                *policy = MPOL_INTERLEAVE;
-                        else
                                goto bad_val;
-                } else if (!strcmp(this_char,"mpol_nodelist")) {
-                        nodelist_parse(value, *policy_nodes);
                } else {
                        printk(KERN_ERR "tmpfs: Bad mount option %s\n",
                               this_char);
diff --git a/mm/slab.c b/mm/slab.c
index add05d808a4a..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
        dump_stack();
 }
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+/*
+ * Set the starting reap node of @cpu to the online node that follows the
+ * cpu's own node, wrapping round to the first online node.
+ *
+ * @cpu is the CPU whose reap timer is being set up and need not be the CPU
+ * running this code, so its per-cpu slot is addressed with per_cpu() and
+ * not through the current CPU's copy.
+ */
+static void init_reap_node(int cpu)
+{
+        int node;
+        node = next_node(cpu_to_node(cpu), node_online_map);
+        if (node == MAX_NUMNODES)
+                node = first_node(node_online_map);
+        per_cpu(reap_node, cpu) = node;
+}
+/*
+ * Advance this CPU's reap_node to the next online node, wrapping to the
+ * first online node.  Before moving on, the per cpu pages held for the
+ * current reap node are drained unless that node is the local one.
+ */
+static void next_reap_node(void)
+{
+        int cur = __get_cpu_var(reap_node);
+        int next;
+        /* Only remote nodes are drained from here */
+        if (cur != numa_node_id())
+                drain_node_pages(cur);
+        next = next_node(cur, node_online_map);
+        if (unlikely(next >= MAX_NUMNODES))
+                next = first_node(node_online_map);
+        __get_cpu_var(reap_node) = next;
+}
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
 /*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
         * at that time.
         */
        if (keventd_up() && reap_work->func == NULL) {
+                init_reap_node(cpu);
                INIT_WORK(reap_work, cache_reap, NULL);
                schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
        }
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
        }
 }
+/*
+ * Called from cache_reap(): drain the alien cache that @l3 keeps for this
+ * CPU's current reap node, if there is one and it holds any objects.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+        int node = __get_cpu_var(reap_node);
+        struct array_cache *ac;
+        if (!l3->alien)
+                return;
+        ac = l3->alien[node];
+        if (!ac || !ac->avail)
+                return;
+        spin_lock_irq(&ac->lock);
+        __drain_alien_cache(cachep, ac, node);
+        spin_unlock_irq(&ac->lock);
+}
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
        int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
 #else
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void)
        struct cache_sizes *sizes;
        struct cache_names *names;
        int i;
+        int order;
        for (i = 0; i < NUM_INIT_LISTS; i++) {
                kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void)
        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
-        cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
+        for (order = 0; order < MAX_ORDER; order++) {
-                       &left_over, &cache_cache.num);
+                cache_estimate(order, cache_cache.buffer_size,
+                        cache_line_size(), 0, &left_over, &cache_cache.num);
+                if (cache_cache.num)
+                        break;
+        }
        if (!cache_cache.num)
                BUG();
+        cache_cache.gfporder = order;
        cache_cache.colour = left_over / cache_cache.colour_off;
        cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
                                      sizeof(struct slab), cache_line_size());
@@ -1628,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
                        size_t size, size_t align, unsigned long flags)
 {
        size_t left_over = 0;
+        int gfporder;
-        for (;; cachep->gfporder++) {
+        for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
                unsigned int num;
                size_t remainder;
-                if (cachep->gfporder > MAX_GFP_ORDER) {
+                cache_estimate(gfporder, size, align, flags, &remainder, &num);
-                        cachep->num = 0;
-                        break;
-                }
-                cache_estimate(cachep->gfporder, size, align, flags,
-                               &remainder, &num);
                if (!num)
                        continue;
                /* More than offslab_limit objects will cause problems */
-                if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+                if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
                        break;
+                /* Found something acceptable - save it away */
                cachep->num = num;
+                cachep->gfporder = gfporder;
                left_over = remainder;
                /*
+                 * A VFS-reclaimable slab tends to have most allocations
+                 * as GFP_NOFS and we really don't want to have to be allocating
+                 * higher-order pages when we are unable to shrink dcache.
+                 */
+                if (flags & SLAB_RECLAIM_ACCOUNT)
+                        break;
+                /*
                 * Large number of objects is good, but very large slabs are
                 * currently bad for the gfp()s.
                 */
-                if (cachep->gfporder >= slab_break_gfp_order)
+                if (gfporder >= slab_break_gfp_order)
                        break;
-                if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
+                /*
-                        /* Acceptable internal fragmentation */
+                 * Acceptable internal fragmentation?
+                 */
+                if ((left_over * 8) <= (PAGE_SIZE << gfporder))
                        break;
        }
        return left_over;
@@ -1869,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        size = ALIGN(size, align);
-        if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
+        left_over = calculate_slab_order(cachep, size, align, flags);
-                /*
-                 * A VFS-reclaimable slab tends to have most allocations
-                 * as GFP_NOFS and we really don't want to have to be allocating
-                 * higher-order pages when we are unable to shrink dcache.
-                 */
-                cachep->gfporder = 0;
-                cache_estimate(cachep->gfporder, size, align, flags,
-                               &left_over, &cachep->num);
-        } else
-                left_over = calculate_slab_order(cachep, size, align, flags);
        if (!cachep->num) {
                printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -2554,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
                       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
                       cachep->name, cachep->num, slabp, slabp->inuse);
                for (i = 0;
-                     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+                     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
                     i++) {
                        if ((i % 16) == 0)
                                printk("\n%03x:", i);
@@ -3494,8 +3557,7 @@ static void cache_reap(void *unused)
                check_irq_on();
                l3 = searchp->nodelists[numa_node_id()];
-                if (l3->alien)
+                reap_alien(searchp, l3);
-                        drain_alien_cache(searchp, l3->alien);
                spin_lock_irq(&l3->list_lock);
                drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3545,7 +3607,7 @@ static void cache_reap(void *unused)
        }
        check_irq_on();
        mutex_unlock(&cache_chain_mutex);
-        drain_remote_pages();
+        next_reap_node();
        /* Setup the next iteration */
        schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
diff --git a/mm/swap.c b/mm/swap.c
index cce3dda59c59..e9ec06d845e8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -489,13 +489,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
        if (count >= FBC_BATCH || count <= -FBC_BATCH) {
                spin_lock(&fbc->lock);
                fbc->count += count;
+                *pcount = 0;
                spin_unlock(&fbc->lock);
-                count = 0;
+        } else {
+                *pcount = count;
        }
-        *pcount = count;
        put_cpu();
 }
 EXPORT_SYMBOL(percpu_counter_mod);
+/*
+ * Slower but more accurate alternative to percpu_counter_read_positive():
+ * with fbc->lock held, total fbc->count and every CPU's local count.
+ * A negative total is returned as 0.
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+        long sum;
+        int cpu;
+        spin_lock(&fbc->lock);
+        sum = fbc->count;
+        for_each_cpu(cpu)
+                sum += *per_cpu_ptr(fbc->counters, cpu);
+        spin_unlock(&fbc->lock);
+        if (sum < 0)
+                sum = 0;
+        return sum;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
 #endif
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1838c15ca4fd..7ccf763bb30b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1883,7 +1883,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        if (!(gfp_mask & __GFP_WAIT) ||
                zone->all_unreclaimable ||
-                atomic_read(&zone->reclaim_in_progress) > 0)
+                atomic_read(&zone->reclaim_in_progress) > 0 ||
+                (p->flags & PF_MEMALLOC))
                        return 0;
        node_id = zone->zone_pgdat->node_id;
@@ -1908,7 +1909,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                sc.swap_cluster_max = SWAP_CLUSTER_MAX;
        cond_resched();
-        p->flags |= PF_MEMALLOC;
+        /*
+         * We need to be able to allocate from the reserves for RECLAIM_SWAP
+         * and we also need to be able to write out pages for RECLAIM_WRITE
+         * and RECLAIM_SWAP.
+         */
+        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
@@ -1932,11 +1938,10 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 * a long time.
                 */
                shrink_slab(sc.nr_scanned, gfp_mask, order);
-                sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */
        }
        p->reclaim_state = NULL;
-        current->flags &= ~PF_MEMALLOC;
+        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
        if (sc.nr_reclaimed == 0)
                zone->last_unsuccessful_zone_reclaim = jiffies;