11 files changed, 306 insertions, 121 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67f29516662a..508707704d2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
        BUG_ON(page_count(page));
        INIT_LIST_HEAD(&page->lru);
-        page[1].mapping = NULL;
+        page[1].lru.next = NULL;                        /* reset dtor */
        spin_lock(&hugetlb_lock);
        enqueue_huge_page(page);
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
        }
        spin_unlock(&hugetlb_lock);
        set_page_count(page, 1);
-        page[1].mapping = (void *)free_huge_page;
+        page[1].lru.next = (void *)free_huge_page;      /* set dtor */
        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
                clear_user_highpage(&page[i], addr);
        return page;
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299a..af3d573b0141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
        struct mm_struct * mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
-        int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+        int new_flags = vma->vm_flags;
        switch (behavior) {
+        case MADV_NORMAL:
+                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+                break;
        case MADV_SEQUENTIAL:
-                new_flags |= VM_SEQ_READ;
+                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
-                new_flags |= VM_RAND_READ;
+                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
-        default:
+        case MADV_DONTFORK:
+                new_flags |= VM_DONTCOPY;
+                break;
+        case MADV_DOFORK:
+                new_flags &= ~VM_DONTCOPY;
                break;
        }
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
        long error;
        switch (behavior) {
+        case MADV_DOFORK:
+                if (vma->vm_flags & VM_IO) {
+                        error = -EINVAL;
+                        break;
+                }
+        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8a..9abc6008544b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,21 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
+int randomize_va_space __read_mostly = 1;
+/*
+ * "norandmaps" boot option: clear randomize_va_space.  The argument
+ * string is not examined.  Return 1 so the boot-parameter code treats
+ * the option as handled.
+ */
+static int __init disable_randmaps(char *s)
+{
+        randomize_va_space = 0;
+        return 1;
+}
+__setup("norandmaps", disable_randmaps);
 /*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b75..880831bd3003 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
        struct zonelist *zl;
-        int num, max, nd;
+        int num, max, nd, k;
        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
-        for_each_node_mask(nd, *nodes)
+        /* First put in the highest zones from all nodes, then all the next 
-                zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+           lower zones etc. Avoid empty zones because the memory allocator
+           doesn't like them. If you implement node hot removal you
+           have to fix that. */
+        for (k = policy_zone; k >= 0; k--) { 
+                for_each_node_mask(nd, *nodes) { 
+                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
+                        if (z->present_pages > 0) 
+                                zl->zones[num++] = z;
+                }
+        }
        zl->zones[num] = NULL;
        return zl;
 }
@@ -577,7 +587,7 @@ redo:
                }
                list_add(&page->lru, &newlist);
                nr_pages++;
-                if (nr_pages > MIGRATE_CHUNK_SIZE);
+                if (nr_pages > MIGRATE_CHUNK_SIZE)
                        break;
        }
        err = migrate_pages(pagelist, &newlist, &moved, &failed);
@@ -798,6 +808,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;
+        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+                return -EINVAL;
        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d68232..99d21020ec9d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
 /*
 * Handle all mappings that got truncated by a "truncate()"
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f2a562..8123fad5a485 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
        /*
         * Processes which fork a lot of child processes are likely
-         * a good choice. We add the vmsize of the children if they
+         * a good choice. We add half the vmsize of the children if they
         * have an own mm. This prevents forking servers to flood the
-         * machine with an endless amount of children
+         * machine with an endless amount of children. In case a single
+         * child is eating the vast majority of memory, adding only half
+         * to the parents will make the child our kill candidate of choice.
         */
        list_for_each(tsk, &p->children) {
                struct task_struct *chld;
                chld = list_entry(tsk, struct task_struct, sibling);
                if (chld->mm != p->mm && chld->mm)
-                        points += chld->mm->total_vm;
+                        points += chld->mm->total_vm/2 + 1;
        }
        /*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 }
 /*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+#define CONSTRAINT_NONE 1
+#define CONSTRAINT_MEMORY_POLICY 2
+#define CONSTRAINT_CPUSET 3
+/*
+ * Determine the type of allocation constraint.
+ */
+static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+        struct zone **z;
+        nodemask_t nodes = node_online_map;
+        for (z = zonelist->zones; *z; z++)
+                if (cpuset_zone_allowed(*z, gfp_mask))
+                        node_clear((*z)->zone_pgdat->node_id,
+                                        nodes);
+                else
+                        return CONSTRAINT_CPUSET;
+        if (!nodes_empty(nodes))
+                return CONSTRAINT_MEMORY_POLICY;
+#endif
+        return CONSTRAINT_NONE;
+}
+/*
 * Simple selection loop. We chose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
-static struct task_struct * select_bad_process(void)
+static struct task_struct *select_bad_process(unsigned long *ppoints)
 {
-        unsigned long maxpoints = 0;
        struct task_struct *g, *p;
        struct task_struct *chosen = NULL;
        struct timespec uptime;
+        *ppoints = 0;
        do_posix_clock_monotonic_gettime(&uptime);
        do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
                        return p;
                points = badness(p, uptime.tv_sec);
-                if (points > maxpoints || !chosen) {
+                if (points > *ppoints || !chosen) {
                        chosen = p;
-                        maxpoints = points;
+                        *ppoints = points;
                }
        } while_each_thread(g, p);
        return chosen;
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
 * we select a process with CAP_SYS_RAW_IO set).
 */
-static void __oom_kill_task(task_t *p)
+static void __oom_kill_task(task_t *p, const char *message)
 {
        if (p->pid == 1) {
                WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
                return;
        }
        task_unlock(p);
-        printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
+        printk(KERN_ERR "%s: Killed process %d (%s).\n",
-                                                        p->pid, p->comm);
+                                message, p->pid, p->comm);
        /*
         * We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
        force_sig(SIGKILL, p);
 }
-static struct mm_struct *oom_kill_task(task_t *p)
+static struct mm_struct *oom_kill_task(task_t *p, const char *message)
 {
        struct mm_struct *mm = get_task_mm(p);
        task_t * g, * q;
@@ -224,35 +256,44 @@ static struct mm_struct *oom_kill_task(task_t *p)
                return NULL;
        }
-        __oom_kill_task(p);
+        __oom_kill_task(p, message);
        /*
         * kill all processes that share the ->mm (i.e. all threads),
         * but are in a different thread group
         */
        do_each_thread(g, q)
                if (q->mm == mm && q->tgid != p->tgid)
-                        __oom_kill_task(q);
+                        __oom_kill_task(q, message);
        while_each_thread(g, q);
        return mm;
 }
-static struct mm_struct *oom_kill_process(struct task_struct *p)
+/*
+ * Log the chosen task together with its score, then try to kill one of
+ * its children whose mm differs from that of @p.  Returns the first
+ * non-NULL mm obtained from oom_kill_task() on a child, otherwise the
+ * result of oom_kill_task() on @p itself.  @points is unsigned long.
+ */
+static struct mm_struct *oom_kill_process(struct task_struct *p,
+                                unsigned long points, const char *message)
 {
        struct mm_struct *mm;
        struct task_struct *c;
        struct list_head *tsk;
+        printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %lu and "
+                "children.\n", p->pid, p->comm, points);
        /* Try to kill a child first */
        list_for_each(tsk, &p->children) {
                c = list_entry(tsk, struct task_struct, sibling);
                if (c->mm == p->mm)
                        continue;
-                mm = oom_kill_task(c);
+                mm = oom_kill_task(c, message);
                if (mm)
                        return mm;
        }
-        return oom_kill_task(p);
+        return oom_kill_task(p, message);
 }
 /**
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
-void out_of_memory(gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
        struct mm_struct *mm = NULL;
-        task_t * p;
+        task_t *p;
+        unsigned long points = 0;
        if (printk_ratelimit()) {
                printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -277,25 +313,52 @@ void out_of_memory(gfp_t gfp_mask, int order)
        cpuset_lock();
        read_lock(&tasklist_lock);
+        /*
+         * Check if there were limitations on the allocation (only relevant for
+         * NUMA) that may require different handling.
+         *
+         * The constrained cases kill current without scoring it, so
+         * points is still 0 when it is passed on for the log message.
+         */
+        switch (constrained_alloc(zonelist, gfp_mask)) {
+        case CONSTRAINT_MEMORY_POLICY:
+                mm = oom_kill_process(current, points,
+                                "No available memory (MPOL_BIND)");
+                break;
+        case CONSTRAINT_CPUSET:
+                mm = oom_kill_process(current, points,
+                                "No available memory in cpuset");
+                break;
+        case CONSTRAINT_NONE:
 retry:
-        p = select_bad_process();
+                /*
+                 * Rambo mode: Shoot down a process and hope it solves whatever
+                 * issues we may have.
+                 */
+                p = select_bad_process(&points);
-        if (PTR_ERR(p) == -1UL)
+                if (PTR_ERR(p) == -1UL)
-                goto out;
+                        goto out;
-        /* Found nothing?!?! Either we hang forever, or we panic. */
+                /* Found nothing?!?! Either we hang forever, or we panic. */
-        if (!p) {
+                if (!p) {
-                read_unlock(&tasklist_lock);
+                        read_unlock(&tasklist_lock);
-                cpuset_unlock();
+                        cpuset_unlock();
-                panic("Out of memory and no killable processes...\n");
+                        panic("Out of memory and no killable processes...\n");
-        }
+                }
-        mm = oom_kill_process(p);
+                mm = oom_kill_process(p, points, "Out of memory");
-        if (!mm)
+                if (!mm)
-                goto retry;
+                        goto retry;
+                break;
+        }
- out:
+out:
        read_unlock(&tasklist_lock);
        cpuset_unlock();
        if (mm)
                mmput(mm);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde04ff4be31..791690d7d3fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
 int percpu_pagelist_fraction;
 static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
 /*
 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
 * All pages have PG_compound set.  All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
- * The first tail page's ->mapping, if non-zero, holds the address of the
+ * The first tail page's ->lru.next holds the address of the compound page's
- * compound page's put_page() function.
+ * put_page() function.  Its ->lru.prev holds the order of allocation.
- *
+ * This usage means that zero-order pages may not be compound.
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present.  This usage means that zero-order pages
- * may not be compound.
 */
+static void free_compound_page(struct page *page)
+{
+        __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
 static void prep_compound_page(struct page *page, unsigned long order)
 {
        int i;
        int nr_pages = 1 << order;
-        page[1].mapping = NULL;
+        page[1].lru.next = (void *)free_compound_page;  /* set dtor */
-        page[1].index = order;
+        page[1].lru.prev = (void *)order;
        for (i = 0; i < nr_pages; i++) {
                struct page *p = page + i;
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
        int i;
        int nr_pages = 1 << order;
-        if (unlikely(page[1].index != order))
+        if (unlikely((unsigned long)page[1].lru.prev != order))
                bad_page(page);
        for (i = 0; i < nr_pages; i++) {
@@ -1011,7 +1015,7 @@ rebalance:
                if (page)
                        goto got_pg;
-                out_of_memory(gfp_mask, order);
+                out_of_memory(zonelist, gfp_mask, order);
                goto restart;
        }
@@ -1537,29 +1541,29 @@ static int __initdata node_load[MAX_NUMNODES];
 */
 static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
 {
-        int i, n, val;
+        int n, val;
        int min_val = INT_MAX;
        int best_node = -1;
-        for_each_online_node(i) {
+        /* Use the local node if we haven't already */
-                cpumask_t tmp;
+        if (!node_isset(node, *used_node_mask)) {
+                node_set(node, *used_node_mask);
+                return node;
+        }
-                /* Start from local node */
+        for_each_online_node(n) {
-                n = (node+i) % num_online_nodes();
+                cpumask_t tmp;
                /* Don't want a node to appear more than once */
                if (node_isset(n, *used_node_mask))
                        continue;
-                /* Use the local node if we haven't already */
-                if (!node_isset(node, *used_node_mask)) {
-                        best_node = node;
-                        break;
-                }
                /* Use the distance array to find the distance */
                val = node_distance(node, n);
+                /* Penalize nodes under us ("prefer the next node") */
+                val += (n < node);
                /* Give preference to headless and unused nodes */
                tmp = node_to_cpumask(n);
                if (!cpus_empty(tmp))
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b812f92..7c455fbaff7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
 #include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
+#include <linux/ctype.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
 }
 #ifdef CONFIG_NUMA
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+        char *nodelist = strchr(value, ':');
+        int err = 1;
+        if (nodelist) {
+                /* NUL-terminate policy string */
+                *nodelist++ = '\0';
+                if (nodelist_parse(nodelist, *policy_nodes))
+                        goto out;
+        }
+        if (!strcmp(value, "default")) {
+                *policy = MPOL_DEFAULT;
+                /* Don't allow a nodelist */
+                if (!nodelist)
+                        err = 0;
+        } else if (!strcmp(value, "prefer")) {
+                *policy = MPOL_PREFERRED;
+                /* Insist on a nodelist of one node only */
+                if (nodelist) {
+                        char *rest = nodelist;
+                        while (isdigit(*rest))
+                                rest++;
+                        if (!*rest)
+                                err = 0;
+                }
+        } else if (!strcmp(value, "bind")) {
+                *policy = MPOL_BIND;
+                /* Insist on a nodelist */
+                if (nodelist)
+                        err = 0;
+        } else if (!strcmp(value, "interleave")) {
+                *policy = MPOL_INTERLEAVE;
+                /* Default to nodes online if no nodelist */
+                if (!nodelist)
+                        *policy_nodes = node_online_map;
+                err = 0;
+        }
+out:
+        /* Restore string for error message */
+        if (nodelist)
+                *--nodelist = ':';
+        return err;
+}
 static struct page *shmem_swapin_async(struct shared_policy *p,
                                       swp_entry_t entry, unsigned long idx)
 {
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
        return page;
 }
 #else
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+        return 1;
+}
 static inline struct page *
 shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
 {
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 {
        char *this_char, *value, *rest;
-        while ((this_char = strsep(&options, ",")) != NULL) {
+        while (options != NULL) {
+                this_char = options;
+                for (;;) {
+                        /*
+                         * NUL-terminate this option: unfortunately,
+                         * mount options form a comma-separated list,
+                         * but mpol's nodelist may also contain commas.
+                         */
+                        options = strchr(options, ',');
+                        if (options == NULL)
+                                break;
+                        options++;
+                        if (!isdigit(*options)) {
+                                options[-1] = '\0';
+                                break;
+                        }
+                }
                if (!*this_char)
                        continue;
                if ((value = strchr(this_char,'=')) != NULL) {
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
                        if (*rest)
                                goto bad_val;
                } else if (!strcmp(this_char,"mpol")) {
-                        if (!strcmp(value,"default"))
+                        if (shmem_parse_mpol(value,policy,policy_nodes))
-                                *policy = MPOL_DEFAULT;
-                        else if (!strcmp(value,"preferred"))
-                                *policy = MPOL_PREFERRED;
-                        else if (!strcmp(value,"bind"))
-                                *policy = MPOL_BIND;
-                        else if (!strcmp(value,"interleave"))
-                                *policy = MPOL_INTERLEAVE;
-                        else
                                goto bad_val;
-                } else if (!strcmp(this_char,"mpol_nodelist")) {
-                        nodelist_parse(value, *policy_nodes);
                } else {
                        printk(KERN_ERR "tmpfs: Bad mount option %s\n",
                               this_char);
diff --git a/mm/slab.c b/mm/slab.c
index d66c2b0d9715..add05d808a4a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1717,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                BUG();
        }
+        /*
+         * Prevent CPUs from coming and going.
+         * lock_cpu_hotplug() nests outside cache_chain_mutex
+         */
+        lock_cpu_hotplug();
        mutex_lock(&cache_chain_mutex);
        list_for_each(p, &cache_chain) {
@@ -1918,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        cachep->dtor = dtor;
        cachep->name = name;
-        /* Don't let CPUs to come and go */
-        lock_cpu_hotplug();
        if (g_cpucache_up == FULL) {
                enable_cpucache(cachep);
@@ -1978,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        /* cache setup completed, link it into the list */
        list_add(&cachep->next, &cache_chain);
-        unlock_cpu_hotplug();
      oops:
        if (!cachep && (flags & SLAB_PANIC))
                panic("kmem_cache_create(): failed to create slab `%s'\n",
                      name);
        mutex_unlock(&cache_chain_mutex);
+        unlock_cpu_hotplug();
        return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
diff --git a/mm/swap.c b/mm/swap.c
index 76247424dea1..cce3dda59c59 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page)
        if (put_page_testzero(page)) {
                void (*dtor)(struct page *page);
-                dtor = (void (*)(struct page *))page[1].mapping;
+                dtor = (void (*)(struct page *))page[1].lru.next;
                (*dtor)(page);
        }
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd06..1838c15ca4fd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
                BUG_ON(PageActive(page));
                sc->nr_scanned++;
+                if (!sc->may_swap && page_mapped(page))
+                        goto keep_locked;
                /* Double the slab pressure for mapped and swapcache pages */
                if (page_mapped(page) || PageSwapCache(page))
                        sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
        struct address_space *mapping = page_mapping(page);
        if (page_mapped(page) && mapping)
-                if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+                if (try_to_unmap(page, 1) != SWAP_SUCCESS)
                        goto unlock_retry;
        if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
 * pages are swapped out.
 *
 * The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because "to" has become empty
 * or no retryable pages exist anymore.
 *
 * Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
                        goto unlock_both;
                if (mapping->a_ops->migratepage) {
+                        /*
+                         * Most pages have a mapping and most filesystems
+                         * should provide a migration function. Anonymous
+                         * pages are part of swap space which also has its
+                         * own migration function. This is the most common
+                         * path for page migration.
+                         */
                        rc = mapping->a_ops->migratepage(newpage, page);
                        goto unlock_both;
                }
                /*
-                 * Trigger writeout if page is dirty
+                 * Default handling if a filesystem does not provide
+                 * a migration function. We can only migrate clean
+                 * pages so try to write out any dirty pages first.
                 */
                if (PageDirty(page)) {
                        switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
                                ; /* try to migrate the page below */
                        }
                }
                /*
-                 * If we have no buffer or can release the buffer
+                 * Buffers are managed in a filesystem specific way.
-                 * then do a simple migration.
+                 * We must have no buffers or drop them.
                 */
                if (!page_has_buffers(page) ||
                    try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
                 * swap them out.
                 */
                if (pass > 4) {
+                        /*
+                         * Persistently unable to drop buffers..... As a
+                         * measure of last resort we fall back to
+                         * swap_page().
+                         */
                        unlock_page(newpage);
                        newpage = NULL;
                        rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        struct page *page;
        struct pagevec pvec;
        int reclaim_mapped = 0;
-        long mapped_ratio;
-        long distress;
+        if (unlikely(sc->may_swap)) {
-        long swap_tendency;
+                long mapped_ratio;
+                long distress;
+                long swap_tendency;
+                /*
+                 * `distress' is a measure of how much trouble we're having
+                 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+                 */
+                distress = 100 >> zone->prev_priority;
+                /*
+                 * The point of this algorithm is to decide when to start
+                 * reclaiming mapped memory instead of just pagecache.  Work out
+                 * how much memory
+                 * is mapped.
+                 */
+                mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+                /*
+                 * Now decide how much we really want to unmap some pages.  The
+                 * mapped ratio is downgraded - just because there's a lot of
+                 * mapped memory doesn't necessarily mean that page reclaim
+                 * isn't succeeding.
+                 *
+                 * The distress ratio is important - we don't want to start
+                 * going oom.
+                 *
+                 * A 100% value of vm_swappiness overrides this algorithm
+                 * altogether.
+                 */
+                swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+                /*
+                 * Now use this metric to decide whether to start moving mapped
+                 * memory onto the inactive list.
+                 */
+                if (swap_tendency >= 100)
+                        reclaim_mapped = 1;
+        }
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        zone->nr_active -= pgmoved;
        spin_unlock_irq(&zone->lru_lock);
-        /*
-         * `distress' is a measure of how much trouble we're having reclaiming
-         * pages.  0 -> no problems.  100 -> great trouble.
-         */
-        distress = 100 >> zone->prev_priority;
-        /*
-         * The point of this algorithm is to decide when to start reclaiming
-         * mapped memory instead of just pagecache.  Work out how much memory
-         * is mapped.
-         */
-        mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-        /*
-         * Now decide how much we really want to unmap some pages.  The mapped
-         * ratio is downgraded - just because there's a lot of mapped memory
-         * doesn't necessarily mean that page reclaim isn't succeeding.
-         *
-         * The distress ratio is important - we don't want to start going oom.
-         *
-         * A 100% value of vm_swappiness overrides this algorithm altogether.
-         */
-        swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-        /*
-         * Now use this metric to decide whether to start moving mapped memory
-         * onto the inactive list.
-         */
-        if (swap_tendency >= 100)
-                reclaim_mapped = 1;
        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
@@ -1595,9 +1621,7 @@ scan:
                        sc.nr_reclaimed = 0;
                        sc.priority = priority;
                        sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-                        atomic_inc(&zone->reclaim_in_progress);
                        shrink_zone(zone, &sc);
-                        atomic_dec(&zone->reclaim_in_progress);
                        reclaim_state->reclaimed_slab = 0;
                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
                                                lru_pages);