Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c        |   1
-rw-r--r--  mm/mempolicy.c      |  99
-rw-r--r--  mm/page-writeback.c |   7
-rw-r--r--  mm/page_alloc.c     |  17
-rw-r--r--  mm/rmap.c           |   2
-rw-r--r--  mm/slab.c           |  58
-rw-r--r--  mm/swap.c           |  26
-rw-r--r--  mm/swapfile.c       |  17
-rw-r--r--  mm/vmscan.c         | 146
9 files changed, 246 insertions, 127 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a965b6b35f26..44da3d476994 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  * ->private_lock (try_to_unmap_one)
  * ->tree_lock (try_to_unmap_one)
  * ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
  * ->private_lock (page_remove_rmap->set_page_dirty)
  * ->tree_lock (page_remove_rmap->set_page_dirty)
  * ->inode_lock (page_remove_rmap->set_page_dirty)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3171f884d245..73790188b0eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -185,8 +185,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 }
 
 static void gather_stats(struct page *, void *);
-static void migrate_page_add(struct vm_area_struct *vma,
-	struct page *page, struct list_head *pagelist, unsigned long flags);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+	unsigned long flags);
 
 /* Scan through pages checking if pages follow certain conditions. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		page = vm_normal_page(vma, addr, *pte);
 		if (!page)
 			continue;
+		/*
+		 * The check for PageReserved here is important to avoid
+		 * handling zero pages and other pages that may have been
+		 * marked special by the system.
+		 *
+		 * If the PageReserved would not be checked here then f.e.
+		 * the location of the zero page could have an influence
+		 * on MPOL_MF_STRICT, zero pages would be counted for
+		 * the per node stats, and there would be useless attempts
+		 * to put zero pages on the migration list.
+		 */
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
@@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			spin_unlock(ptl);
-			migrate_page_add(vma, page, private, flags);
-			spin_lock(ptl);
-		}
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+			migrate_page_add(page, private, flags);
 		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
@@ -519,51 +531,15 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
  * page migration
  */
 
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
-		struct address_space *mapping)
-{
-	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
-	int rc = 1;
-
-	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-		if (mm != vma->vm_mm) {
-			rc = 0;
-			goto out;
-		}
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-		if (mm != vma->vm_mm) {
-			rc = 0;
-			goto out;
-		}
-out:
-	spin_unlock(&mapping->i_mmap_lock);
-	return rc;
-}
-
-/*
- * Add a page to be migrated to the pagelist
- */
-static void migrate_page_add(struct vm_area_struct *vma,
-		struct page *page, struct list_head *pagelist, unsigned long flags)
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+		unsigned long flags)
 {
 	/*
-	 * Avoid migrating a page that is shared by others and not writable.
+	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-		mapping_writably_mapped(page->mapping) ||
-		single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (isolate_lru_page(page))
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-		 */
-		WARN_ON(rc == 0);
 	}
 }
 
@@ -1000,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	return nid;
 }
 
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+	switch (policy->policy) {
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(policy);
+
+	case MPOL_BIND:
+		/*
+		 * Follow bind policy behavior and start allocation at the
+		 * first node.
+		 */
+		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+	case MPOL_PREFERRED:
+		if (policy->v.preferred_node >= 0)
+			return policy->v.preferred_node;
+		/* Fall through */
+
+	default:
+		return numa_node_id();
+	}
+}
+
 /* Do static interleaving for a VMA with known offset. */
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
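
The new slab_node() above reduces to a single dispatch on the policy mode: interleave rotates through the allowed nodes, bind starts at the first node of its zonelist, preferred returns the preferred node when one is set, and everything else falls back to the local node. Below is a minimal userspace sketch of that dispatch; the kernel structures (zonelists, the interleave cursor) are replaced with hypothetical stand-in fields, so it illustrates the logic rather than the kernel API.

/*
 * Userspace sketch of the node selection done by slab_node().
 * Stand-in types only; not the kernel's struct mempolicy.
 */
#include <stdio.h>

enum { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE };

struct mempolicy_stub {
	int policy;
	int preferred_node;	/* used by MPOL_PREFERRED */
	int first_bind_node;	/* stands in for v.zonelist->zones[0] */
	int next_interleave;	/* stands in for interleave_nodes() state */
};

static int local_node = 0;	/* stands in for numa_node_id() */

static int pick_slab_node(struct mempolicy_stub *pol)
{
	switch (pol->policy) {
	case MPOL_INTERLEAVE:
		return pol->next_interleave++;	/* round-robin stand-in */
	case MPOL_BIND:
		return pol->first_bind_node;	/* first node of the bind zonelist */
	case MPOL_PREFERRED:
		if (pol->preferred_node >= 0)
			return pol->preferred_node;
		/* fall through */
	default:
		return local_node;
	}
}

int main(void)
{
	struct mempolicy_stub interleave = { MPOL_INTERLEAVE, -1, 0, 2 };
	struct mempolicy_stub bind = { MPOL_BIND, -1, 1, 0 };

	printf("interleave -> node %d\n", pick_slab_node(&interleave));
	printf("bind       -> node %d\n", pick_slab_node(&bind));
	return 0;
}

In the kernel the chosen node feeds ____cache_alloc() in mm/slab.c (see the CONFIG_NUMA hunk further down), which redirects the allocation with __cache_alloc_node() when the policy points away from the local node.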
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5240e426c1f7..945559fb63d2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@
 static long ratelimit_pages = 32;
 
 static long total_pages;	/* The total number of pages in the machine. */
-static int dirty_exceeded;	/* Dirty mem may be over limit */
+static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
 
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping)
 		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
 			break;
 
-		dirty_exceeded = 1;
+		if (!dirty_exceeded)
+			dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		blk_congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
 		dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
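
The two balance_dirty_pages() changes above serve one goal: dirty_exceeded is shared state consulted on the write-out path, so it gets its own cache line and is only stored to when its value actually changes, keeping the steady state a pure read. A minimal sketch of that test-before-store pattern, in plain C with the kernel annotations omitted:

#include <stdbool.h>

/* Shared flag: read frequently, written only on transitions. */
static bool dirty_exceeded;

static void note_over_limit(void)
{
	/* Store only on a transition, so the common case never dirties the line. */
	if (!dirty_exceeded)
		dirty_exceeded = 1;
}

static void note_under_limit(void)
{
	if (dirty_exceeded)
		dirty_exceeded = 0;
}

int main(void)
{
	note_over_limit();
	note_under_limit();
	return dirty_exceeded;	/* 0: back under the limit */
}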
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = (*z)->pages_high;
 			if (!zone_watermark_ok(*z, order, mark,
 				    classzone_idx, alloc_flags))
-				continue;
+				if (!zone_reclaim_mode ||
+					!zone_reclaim(*z, gfp_mask, order))
+					continue;
 		}
 
 		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-			node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/rmap.c b/mm/rmap.c
index dfbb89f99a15..d85a99d28c03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -33,7 +33,7 @@
  * mapping->i_mmap_lock
  * anon_vma->lock
  * mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed)
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  * swap_lock (in swap_duplicate, swap_info_get)
  * mmlist_lock (in mmput, drain_mmlist and others)
  * mapping->private_lock (in __set_page_dirty_buffers)
diff --git a/mm/slab.c b/mm/slab.c
index 9374293a3012..6f8495e2185b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
  * Further notes from the original documentation:
  *
  * 11 April '97. Started multi-threading - markhe
- * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
+ * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
  * The sem is only needed when accessing/extending the cache-chain, which
  * can never happen inside an interrupt (kmem_cache_create(),
  * kmem_cache_shrink() and kmem_cache_reap()).
@@ -103,6 +103,8 @@
 #include <linux/rcupdate.h>
 #include <linux/string.h>
 #include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -631,7 +633,7 @@ static kmem_cache_t cache_cache = {
 };
 
 /* Guard access to the cache-chain. */
-static struct semaphore cache_chain_sem;
+static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
 /*
@@ -772,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 }
 
 #ifdef CONFIG_NUMA
+static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
 	struct array_cache **ac_ptr;
@@ -857,7 +861,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		/* we need to do this right in the beginning since
 		 * alloc_arraycache's are going to use this list.
 		 * kmalloc_node allows us to add the slab to the right
@@ -912,7 +916,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				l3->shared = nc;
 			}
 		}
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		break;
 	case CPU_ONLINE:
 		start_cpu_timer(cpu);
@@ -921,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 	case CPU_DEAD:
 		/* fall thru */
 	case CPU_UP_CANCELED:
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
@@ -973,13 +977,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			spin_unlock_irq(&cachep->spinlock);
 			kfree(nc);
 		}
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		break;
 #endif
 	}
 	return NOTIFY_OK;
 bad:
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	return NOTIFY_BAD;
 }
985 989
@@ -1047,7 +1051,6 @@ void __init kmem_cache_init(void)
 	 */
 
 	/* 1) create the cache_cache */
-	init_MUTEX(&cache_chain_sem);
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
@@ -1168,10 +1171,10 @@ void __init kmem_cache_init(void)
 	/* 6) resize the head arrays to their final sizes */
 	{
 		kmem_cache_t *cachep;
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
 			enable_cpucache(cachep);
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 	}
 
 	/* Done! */
@@ -1590,7 +1593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
 		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
@@ -1856,7 +1859,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 			name);
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2044,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t *cachep)
 	lock_cpu_hotplug();
 
 	/* Find the cache in the chain of caches. */
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
 	list_del(&cachep->next);
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		list_add(&cachep->next, &cache_chain);
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		unlock_cpu_hotplug();
 		return 1;
 	}
@@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 	void *objp;
 	struct array_cache *ac;
 
+#ifdef CONFIG_NUMA
+	if (unlikely(current->mempolicy && !in_interrupt())) {
+		int nid = slab_node(current->mempolicy);
+
+		if (nid != numa_node_id())
+			return __cache_alloc_node(cachep, flags, nid);
+	}
+#endif
+
 	check_irq_off();
 	ac = ac_data(cachep);
 	if (likely(ac->avail)) {
@@ -3314,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
  * - clear the per-cpu caches for this CPU.
  * - return freeable pages to the main free memory pool.
  *
- * If we cannot acquire the cache chain semaphore then just give up - we'll
+ * If we cannot acquire the cache chain mutex then just give up - we'll
  * try again on the next iteration.
  */
 static void cache_reap(void *unused)
@@ -3322,7 +3334,7 @@ static void cache_reap(void *unused)
 	struct list_head *walk;
 	struct kmem_list3 *l3;
 
-	if (down_trylock(&cache_chain_sem)) {
+	if (!mutex_trylock(&cache_chain_mutex)) {
 		/* Give up. Setup the next iteration. */
 		schedule_delayed_work(&__get_cpu_var(reap_work),
 			REAPTIMEOUT_CPUC);
@@ -3393,7 +3405,7 @@ static void cache_reap(void *unused)
 		cond_resched();
 	}
 	check_irq_on();
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	drain_remote_pages();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
@@ -3429,7 +3441,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 	loff_t n = *pos;
 	struct list_head *p;
 
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	if (!n)
 		print_slabinfo_header(m);
 	p = cache_chain.next;
@@ -3451,7 +3463,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 
 static void s_stop(struct seq_file *m, void *p)
 {
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 }
 
 static int s_show(struct seq_file *m, void *p)
@@ -3603,7 +3615,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 		return -EINVAL;
 
 	/* Find the cache in the chain of caches. */
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	res = -EINVAL;
 	list_for_each(p, &cache_chain) {
 		kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
@@ -3620,7 +3632,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 			break;
 		}
 	}
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	if (res >= 0)
 		res = count;
 	return res;
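
Every down()/up() pair on cache_chain_sem above becomes mutex_lock()/mutex_unlock() on cache_chain_mutex; the one place the return value matters is cache_reap(), because down_trylock() returns 0 on success while mutex_trylock() returns nonzero on success, hence the added '!'. A rough userspace analogue of the resulting locking pattern, written with pthreads rather than the kernel API (pthread_mutex_trylock() returns 0 on success, so the polarity differs yet again):

/*
 * Sketch only: walkers take the chain lock, the periodic reaper merely
 * trylocks and backs off if someone else holds it.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;

static void walk_chain(void)
{
	pthread_mutex_lock(&chain_lock);
	/* ... iterate the cache chain ... */
	pthread_mutex_unlock(&chain_lock);
}

static void reap(void)
{
	if (pthread_mutex_trylock(&chain_lock) != 0) {
		/* Contended: give up and try again on the next timer tick. */
		return;
	}
	/* ... drain per-CPU caches, free spare slabs ... */
	pthread_mutex_unlock(&chain_lock);
}

int main(void)
{
	walk_chain();
	reap();
	return 0;
}

The design point is unchanged by the conversion: the reaper must never sleep waiting for the chain lock, it simply gives up and reschedules itself.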
diff --git a/mm/swap.c b/mm/swap.c
index cbb48e721ab9..bc2442a7b0ee 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -174,6 +174,32 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+#ifdef CONFIG_NUMA
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+}
+
+#else
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	lru_add_drain();
+	return 0;
+}
+#endif
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
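
lru_add_drain_all() exists because pages recently added to the LRU may still sit in per-CPU pagevecs, where isolate_lru_page() cannot see them; each pagevec belongs to one CPU, so the NUMA version runs lru_add_drain() on every CPU via schedule_on_each_cpu() rather than reaching into remote per-CPU state. A small userspace sketch of the idea, with stand-in types (PAGEVEC_SIZE and the array are illustrative, not the kernel's definitions):

#include <stdio.h>

#define NR_CPUS 4
#define PAGEVEC_SIZE 14

struct pagevec_stub {
	int nr;
	void *pages[PAGEVEC_SIZE];
};

static struct pagevec_stub lru_add_pvecs[NR_CPUS];	/* one per CPU */

/* What lru_add_drain() does, for the CPU it happens to run on. */
static void drain_local(int cpu)
{
	lru_add_pvecs[cpu].nr = 0;	/* pages moved onto the zone LRU lists */
}

/*
 * In the kernel this cannot be a plain loop: CPU i's pagevec must be
 * drained on CPU i, hence schedule_on_each_cpu(lru_add_drain_per_cpu).
 */
static void drain_all(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		drain_local(cpu);
}

int main(void)
{
	lru_add_pvecs[2].nr = 5;
	drain_all();
	printf("cpu2 pagevec now holds %d pages\n", lru_add_pvecs[2].nr);
	return 0;
}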
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 957fef43fa60..f1e69c30d203 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,7 @@
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
+#include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
 
@@ -46,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-static DECLARE_MUTEX(swapon_sem);
+static DEFINE_MUTEX(swapon_mutex);
 
 /*
  * We need this because the bdev->unplug_fn can sleep and we cannot
  * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a semaphore.
+ * cannot be turned into a mutex.
  */
 static DECLARE_RWSEM(swap_unplug_sem);
 
@@ -1161,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
 
@@ -1180,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	p->swap_map = NULL;
 	p->flags = 0;
 	spin_unlock(&swap_lock);
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
@@ -1209,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 	int i;
 	loff_t l = *pos;
 
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 
 	for (i = 0; i < nr_swapfiles; i++, ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
@@ -1238,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 
 static void swap_stop(struct seq_file *swap, void *v)
 {
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 }
 
 static int swap_show(struct seq_file *swap, void *v)
@@ -1540,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	p->flags = SWP_ACTIVE;
 	nr_swap_pages += nr_good_pages;
@@ -1566,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		swap_info[prev].next = p - swap_info;
 	}
 	spin_unlock(&swap_lock);
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 	error = 0;
 	goto out;
 bad_swap:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf903b2d198f..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -71,6 +71,9 @@ struct scan_control {
 
 	int may_writepage;
 
+	/* Can pages be swapped as part of reclaim? */
+	int may_swap;
+
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
 	 * In this context, it doesn't matter that we scan the
@@ -458,6 +461,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!sc->may_swap)
+				goto keep_locked;
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
 		}
@@ -586,7 +591,7 @@ static inline void move_to_lru(struct page *page)
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
  *
  * returns the number of pages put back.
  */
@@ -760,46 +765,33 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
  *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
  */
 int isolate_lru_page(struct page *page)
 {
-	int rc = 0;
-	struct zone *zone = page_zone(page);
+	int ret = 0;
 
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
 	}
-	return rc;
+
+	return ret;
 }
 #endif
 
@@ -831,18 +823,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
@@ -1177,6 +1171,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 
 	sc.gfp_mask = gfp_mask;
 	sc.may_writepage = 0;
+	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
 
@@ -1279,6 +1274,7 @@ loop_again:
 	total_reclaimed = 0;
 	sc.gfp_mask = GFP_KERNEL;
 	sc.may_writepage = 0;
+	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1576,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = 0,
+		.may_swap = 0,
+		.nr_mapped = read_page_state(nr_mapped),
+		.nr_scanned = 0,
+		.nr_reclaimed = 0,
+		.priority = 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
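
zone_reclaim() is deliberately throttled: it only runs for __GFP_WAIT allocations on the local node, skips zones marked all_unreclaimable or already being reclaimed, and after a failed attempt it leaves the zone alone for ZONE_RECLAIM_INTERVAL (half a second). A compact userspace sketch of that last rule, the time-based back-off, using a monotonic clock in place of jiffies (the struct and field names are stand-ins, not the kernel's):

#include <stdbool.h>
#include <time.h>

#define RECLAIM_INTERVAL_NS (500 * 1000 * 1000LL)	/* roughly HZ/2 */

struct zone_stub {
	long long last_unsuccessful_reclaim_ns;
};

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static bool may_try_reclaim(struct zone_stub *z)
{
	return now_ns() - z->last_unsuccessful_reclaim_ns >= RECLAIM_INTERVAL_NS;
}

static int do_zone_reclaim(struct zone_stub *z)
{
	int reclaimed;

	if (!may_try_reclaim(z))
		return 0;
	reclaimed = 0;		/* ... shrink_zone() would run here ... */
	if (reclaimed == 0)
		z->last_unsuccessful_reclaim_ns = now_ns();
	return reclaimed;
}

int main(void)
{
	struct zone_stub z = { 0 };

	do_zone_reclaim(&z);		/* fails, records the timestamp */
	return do_zone_reclaim(&z);	/* throttled: returns 0 immediately */
}

In the page allocator (see the get_page_from_freelist() hunk above) this keeps a zone that cannot be reclaimed from being scanned over and over before every off-node fallback.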