Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c    |  10
-rw-r--r--  mm/mempolicy.c  |   2
-rw-r--r--  mm/page_alloc.c |  10
-rw-r--r--  mm/slab.c       | 186
-rw-r--r--  mm/slob.c       |   2
-rw-r--r--  mm/swap.c       |  32
-rw-r--r--  mm/vmscan.c     | 106
7 files changed, 220 insertions, 128 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b21d78c941b5..67f29516662a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -107,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	set_page_count(page, 1);
 	page[1].mapping = (void *)free_huge_page;
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
+		clear_user_highpage(&page[i], addr);
 	return page;
 }
 
@@ -391,12 +391,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (!new_page) {
 		page_cache_release(old_page);
-
-		/* Logically this is OOM, not a SIGBUS, but an OOM
-		 * could cause the kernel to go killing other
-		 * processes which won't help the hugepage situation
-		 * at all (?) */
-		return VM_FAULT_SIGBUS;
+		return VM_FAULT_OOM;
 	}
 
 	spin_unlock(&mm->page_table_lock);
@@ -444,6 +439,7 @@ retry:
 		page = alloc_huge_page(vma, address);
 		if (!page) {
 			hugetlb_put_quota(mapping);
+			ret = VM_FAULT_OOM;
 			goto out;
 		}
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27da6d5c77ba..3bd7fb7e4b75 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1159,6 +1159,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 	return interleave_nodes(pol);
 }
 
+#ifdef CONFIG_HUGETLBFS
 /* Return a zonelist suitable for a huge page allocation. */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 {
@@ -1172,6 +1173,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 	}
 	return zonelist_policy(GFP_HIGHUSER, pol);
 }
+#endif
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44b4eb4202d9..dde04ff4be31 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1213,18 +1213,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	int cpu = 0;
 
-	memset(ret, 0, sizeof(*ret));
+	memset(ret, 0, nr * sizeof(unsigned long));
 	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
 	cpu = first_cpu(*cpumask);
 	while (cpu < NR_CPUS) {
 		unsigned long *in, *out, off;
 
+		if (!cpu_isset(cpu, *cpumask))
+			continue;
+
 		in = (unsigned long *)&per_cpu(page_states, cpu);
 
 		cpu = next_cpu(cpu, *cpumask);
 
-		if (cpu < NR_CPUS)
+		if (likely(cpu < NR_CPUS))
 			prefetch(&per_cpu(page_states, cpu));
 
 		out = (unsigned long *)ret;
@@ -1886,8 +1889,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static struct per_cpu_pageset
-	boot_pageset[NR_CPUS];
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
 
 /*
  * Dynamically allocate memory for the
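
The __get_page_state() change above relies on struct page_state being usable as a flat array of unsigned long counters, so only the first nr counters need to be cleared and summed. Below is a stand-alone sketch of that idea in plain user-space C; the struct and function names (counters, cpu_state, get_state) are invented for the illustration and are not kernel APIs.

#include <stdio.h>
#include <string.h>

struct counters { unsigned long pgalloc, pgfree, pgfault, unused; };

#define NCPU 4
static struct counters cpu_state[NCPU] = {
	{ 1, 2, 3, 99 }, { 4, 5, 6, 99 }, { 7, 8, 9, 99 }, { 0, 0, 0, 99 },
};

static void get_state(struct counters *ret, int nr)
{
	unsigned long *out = (unsigned long *)ret;
	int cpu, off;

	/* like the patched code: clear only the fields the caller asked for */
	memset(ret, 0, nr * sizeof(unsigned long));
	for (cpu = 0; cpu < NCPU; cpu++) {
		unsigned long *in = (unsigned long *)&cpu_state[cpu];
		for (off = 0; off < nr; off++)
			out[off] += in[off];
	}
}

int main(void)
{
	struct counters sum;

	get_state(&sum, 3);	/* the 'unused' field is never touched */
	printf("%lu %lu %lu\n", sum.pgalloc, sum.pgfree, sum.pgfault);
	return 0;
}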
diff --git a/mm/slab.c b/mm/slab.c
index 71370256a7eb..add05d808a4a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -294,6 +294,7 @@ struct kmem_list3 {
 	unsigned long next_reap;
 	int free_touched;
 	unsigned int free_limit;
+	unsigned int colour_next;	/* Per-node cache coloring */
 	spinlock_t list_lock;
 	struct array_cache *shared;	/* shared per node */
 	struct array_cache **alien;	/* on other nodes */
@@ -344,6 +345,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 	INIT_LIST_HEAD(&parent->slabs_free);
 	parent->shared = NULL;
 	parent->alien = NULL;
+	parent->colour_next = 0;
 	spin_lock_init(&parent->list_lock);
 	parent->free_objects = 0;
 	parent->free_touched = 0;
@@ -390,7 +392,6 @@ struct kmem_cache {
 
 	size_t colour;			/* cache colouring range */
 	unsigned int colour_off;	/* colour offset */
-	unsigned int colour_next;	/* cache colouring */
 	struct kmem_cache *slabp_cache;
 	unsigned int slab_size;
 	unsigned int dflags;		/* dynamic flags */
@@ -883,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
 	struct array_cache *ac;
 	unsigned long flags;
 
 	for_each_online_node(i) {
-		ac = l3->alien[i];
+		ac = alien[i];
 		if (ac) {
 			spin_lock_irqsave(&ac->lock, flags);
 			__drain_alien_cache(cachep, ac, i);
@@ -899,9 +900,18 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
 	}
 }
 #else
-#define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+	return (struct array_cache **) 0x01020304ul;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
 #endif
 
 static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -935,6 +945,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
 				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
+				/*
+				 * The l3s don't come and go as CPUs come and
+				 * go. cache_chain_mutex is sufficient
+				 * protection here.
+				 */
 				cachep->nodelists[node] = l3;
 			}
 
@@ -949,26 +964,46 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		   & array cache's */
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
+			struct array_cache *shared;
+			struct array_cache **alien;
 
 			nc = alloc_arraycache(node, cachep->limit,
 						cachep->batchcount);
 			if (!nc)
 				goto bad;
+			shared = alloc_arraycache(node,
+					cachep->shared * cachep->batchcount,
+					0xbaadf00d);
+			if (!shared)
+				goto bad;
+
+			alien = alloc_alien_cache(node, cachep->limit);
+			if (!alien)
+				goto bad;
 			cachep->array[cpu] = nc;
 
 			l3 = cachep->nodelists[node];
 			BUG_ON(!l3);
-			if (!l3->shared) {
-				if (!(nc = alloc_arraycache(node,
-					cachep->shared *
-					cachep->batchcount,
-					0xbaadf00d)))
-					goto bad;
 
-				/* we are serialised from CPU_DEAD or
-				  CPU_UP_CANCELLED by the cpucontrol lock */
-				l3->shared = nc;
+			spin_lock_irq(&l3->list_lock);
+			if (!l3->shared) {
+				/*
+				 * We are serialised from CPU_DEAD or
+				 * CPU_UP_CANCELLED by the cpucontrol lock
+				 */
+				l3->shared = shared;
+				shared = NULL;
 			}
+#ifdef CONFIG_NUMA
+			if (!l3->alien) {
+				l3->alien = alien;
+				alien = NULL;
+			}
+#endif
+			spin_unlock_irq(&l3->list_lock);
+
+			kfree(shared);
+			free_alien_cache(alien);
 		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
@@ -977,25 +1012,34 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+		/*
+		 * Even if all the cpus of a node are down, we don't free the
+		 * kmem_list3 of any cache. This to avoid a race between
+		 * cpu_down, and a kmalloc allocation from another cpu for
+		 * memory from the node of the cpu going down. The list3
+		 * structure is usually allocated from kmem_cache_create() and
+		 * gets destroyed at kmem_cache_destroy().
+		 */
 		/* fall thru */
 	case CPU_UP_CANCELED:
 		mutex_lock(&cache_chain_mutex);
 
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
+			struct array_cache *shared;
+			struct array_cache **alien;
 			cpumask_t mask;
 
 			mask = node_to_cpumask(node);
-			spin_lock_irq(&cachep->spinlock);
 			/* cpu is dead; no one can alloc from it. */
 			nc = cachep->array[cpu];
 			cachep->array[cpu] = NULL;
 			l3 = cachep->nodelists[node];
 
 			if (!l3)
-				goto unlock_cache;
+				goto free_array_cache;
 
-			spin_lock(&l3->list_lock);
+			spin_lock_irq(&l3->list_lock);
 
 			/* Free limit for this kmem_list3 */
 			l3->free_limit -= cachep->batchcount;
@@ -1003,34 +1047,44 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				free_block(cachep, nc->entry, nc->avail, node);
 
 			if (!cpus_empty(mask)) {
-				spin_unlock(&l3->list_lock);
-				goto unlock_cache;
+				spin_unlock_irq(&l3->list_lock);
+				goto free_array_cache;
 			}
 
-			if (l3->shared) {
+			shared = l3->shared;
+			if (shared) {
 				free_block(cachep, l3->shared->entry,
 					   l3->shared->avail, node);
-				kfree(l3->shared);
 				l3->shared = NULL;
 			}
-			if (l3->alien) {
-				drain_alien_cache(cachep, l3);
-				free_alien_cache(l3->alien);
-				l3->alien = NULL;
-			}
 
-			/* free slabs belonging to this node */
-			if (__node_shrink(cachep, node)) {
-				cachep->nodelists[node] = NULL;
-				spin_unlock(&l3->list_lock);
-				kfree(l3);
-			} else {
-				spin_unlock(&l3->list_lock);
+			alien = l3->alien;
+			l3->alien = NULL;
+
+			spin_unlock_irq(&l3->list_lock);
+
+			kfree(shared);
+			if (alien) {
+				drain_alien_cache(cachep, alien);
+				free_alien_cache(alien);
 			}
-unlock_cache:
-			spin_unlock_irq(&cachep->spinlock);
+free_array_cache:
 			kfree(nc);
 		}
+		/*
+		 * In the previous loop, all the objects were freed to
+		 * the respective cache's slabs, now we can go ahead and
+		 * shrink each nodelist to its limit.
+		 */
+		list_for_each_entry(cachep, &cache_chain, next) {
+			l3 = cachep->nodelists[node];
+			if (!l3)
+				continue;
+			spin_lock_irq(&l3->list_lock);
+			/* free slabs belonging to this node */
+			__node_shrink(cachep, node);
+			spin_unlock_irq(&l3->list_lock);
+		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
 #endif
@@ -1119,7 +1173,6 @@ void __init kmem_cache_init(void)
 		BUG();
 
 	cache_cache.colour = left_over / cache_cache.colour_off;
-	cache_cache.colour_next = 0;
 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
 				      sizeof(struct slab), cache_line_size());
 
@@ -1664,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
+	/*
+	 * Prevent CPUs from coming and going.
+	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 */
+	lock_cpu_hotplug();
+
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
@@ -1865,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->dtor = dtor;
 	cachep->name = name;
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
 
 	if (g_cpucache_up == FULL) {
 		enable_cpucache(cachep);
@@ -1925,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-	unlock_cpu_hotplug();
 oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
+	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2011,18 +2068,16 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 
 	smp_call_function_all_cpus(do_drain, cachep);
 	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
 		if (l3) {
-			spin_lock(&l3->list_lock);
+			spin_lock_irq(&l3->list_lock);
 			drain_array_locked(cachep, l3->shared, 1, node);
-			spin_unlock(&l3->list_lock);
+			spin_unlock_irq(&l3->list_lock);
 			if (l3->alien)
-				drain_alien_cache(cachep, l3);
+				drain_alien_cache(cachep, l3->alien);
 		}
 	}
-	spin_unlock_irq(&cachep->spinlock);
 }
 
 static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -2324,20 +2379,20 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 */
 	ctor_flags |= SLAB_CTOR_ATOMIC;
 
-	/* About to mess with non-constant members - lock. */
+	/* Take the l3 list lock to change the colour_next on this node */
 	check_irq_off();
-	spin_lock(&cachep->spinlock);
+	l3 = cachep->nodelists[nodeid];
+	spin_lock(&l3->list_lock);
 
 	/* Get colour for the slab, and cal the next value. */
-	offset = cachep->colour_next;
-	cachep->colour_next++;
-	if (cachep->colour_next >= cachep->colour)
-		cachep->colour_next = 0;
-	offset *= cachep->colour_off;
+	offset = l3->colour_next;
+	l3->colour_next++;
+	if (l3->colour_next >= cachep->colour)
+		l3->colour_next = 0;
+	spin_unlock(&l3->list_lock);
 
-	spin_unlock(&cachep->spinlock);
+	offset *= cachep->colour_off;
 
-	check_irq_off();
 	if (local_flags & __GFP_WAIT)
 		local_irq_enable();
 
@@ -2367,7 +2422,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
-	l3 = cachep->nodelists[nodeid];
 	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */
@@ -2725,6 +2779,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 	BUG_ON(!l3);
 
 retry:
+	check_irq_off();
 	spin_lock(&l3->list_lock);
 	entry = l3->slabs_partial.next;
 	if (entry == &l3->slabs_partial) {
@@ -3304,11 +3359,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
 	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
 
 	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
+	spin_lock(&cachep->spinlock);
 	cachep->batchcount = batchcount;
 	cachep->limit = limit;
 	cachep->shared = shared;
-	spin_unlock_irq(&cachep->spinlock);
+	spin_unlock(&cachep->spinlock);
 
 	for_each_online_cpu(i) {
 		struct array_cache *ccold = new.new[i];
@@ -3440,7 +3495,7 @@ static void cache_reap(void *unused)
 
 		l3 = searchp->nodelists[numa_node_id()];
 		if (l3->alien)
-			drain_alien_cache(searchp, l3);
+			drain_alien_cache(searchp, l3->alien);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3564,8 +3619,7 @@ static int s_show(struct seq_file *m, void *p)
 	int node;
 	struct kmem_list3 *l3;
 
-	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
+	spin_lock(&cachep->spinlock);
 	active_objs = 0;
 	num_slabs = 0;
 	for_each_online_node(node) {
@@ -3573,7 +3627,8 @@ static int s_show(struct seq_file *m, void *p)
 		if (!l3)
 			continue;
 
-		spin_lock(&l3->list_lock);
+		check_irq_on();
+		spin_lock_irq(&l3->list_lock);
 
 		list_for_each(q, &l3->slabs_full) {
 			slabp = list_entry(q, struct slab, list);
@@ -3598,9 +3653,10 @@ static int s_show(struct seq_file *m, void *p)
 			num_slabs++;
 		}
 		free_objects += l3->free_objects;
-		shared_avail += l3->shared->avail;
+		if (l3->shared)
+			shared_avail += l3->shared->avail;
 
-		spin_unlock(&l3->list_lock);
+		spin_unlock_irq(&l3->list_lock);
 	}
 	num_slabs += active_slabs;
 	num_objs = num_slabs * cachep->num;
@@ -3644,7 +3700,7 @@ static int s_show(struct seq_file *m, void *p)
 	}
 #endif
 	seq_putc(m, '\n');
-	spin_unlock_irq(&cachep->spinlock);
+	spin_unlock(&cachep->spinlock);
 	return 0;
 }
 
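
The slab.c changes above move the colour cursor from struct kmem_cache into the per-node kmem_list3, so cache_grow() now advances it under l3->list_lock rather than cachep->spinlock and each node cycles its own slab colour offset. Below is a stand-alone sketch of that per-node cycling in user-space C; the types and function name (cache_params, node_state, next_colour_offset) are invented for the example and are not the kernel's.

#include <stdio.h>

struct cache_params {
	unsigned int colour;		/* number of distinct offsets */
	unsigned int colour_off;	/* bytes per colour step */
};

struct node_state {
	unsigned int colour_next;	/* per-node, was per-cache before the patch */
};

static unsigned int next_colour_offset(const struct cache_params *c,
				       struct node_state *n)
{
	unsigned int offset = n->colour_next;

	n->colour_next++;
	if (n->colour_next >= c->colour)
		n->colour_next = 0;		/* wrap around */
	return offset * c->colour_off;
}

int main(void)
{
	struct cache_params c = { 3, 64 };
	struct node_state node0 = { 0 }, node1 = { 0 };
	int i;

	for (i = 0; i < 4; i++)
		printf("node0 slab %d: offset %u\n", i,
		       next_colour_offset(&c, &node0));
	/* node1 starts its own cycle at 0, independent of node0 */
	printf("node1 slab 0: offset %u\n", next_colour_offset(&c, &node1));
	return 0;
}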
diff --git a/mm/slob.c b/mm/slob.c
index 1c240c4b71d9..a1f42bdc0245 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(slab_reclaim_pages);
 
 #ifdef CONFIG_SMP
 
-void *__alloc_percpu(size_t size, size_t align)
+void *__alloc_percpu(size_t size)
 {
 	int i;
 	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
diff --git a/mm/swap.c b/mm/swap.c
index bc2442a7b0ee..76247424dea1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,19 +34,22 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-void put_page(struct page *page)
+static void put_compound_page(struct page *page)
 {
-	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page_private(page);
-		if (put_page_testzero(page)) {
-			void (*dtor)(struct page *page);
+	page = (struct page *)page_private(page);
+	if (put_page_testzero(page)) {
+		void (*dtor)(struct page *page);
 
-			dtor = (void (*)(struct page *))page[1].mapping;
-			(*dtor)(page);
-		}
-		return;
+		dtor = (void (*)(struct page *))page[1].mapping;
+		(*dtor)(page);
 	}
-	if (put_page_testzero(page))
+}
+
+void put_page(struct page *page)
+{
+	if (unlikely(PageCompound(page)))
+		put_compound_page(page);
+	else if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
@@ -244,6 +247,15 @@ void release_pages(struct page **pages, int nr, int cold)
 		struct page *page = pages[i];
 		struct zone *pagezone;
 
+		if (unlikely(PageCompound(page))) {
+			if (zone) {
+				spin_unlock_irq(&zone->lru_lock);
+				zone = NULL;
+			}
+			put_compound_page(page);
+			continue;
+		}
+
 		if (!put_page_testzero(page))
 			continue;
 
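
put_compound_page(), factored out above, finds the head page through page_private(), drops the head's reference count, and runs the destructor stashed in page[1].mapping once it reaches zero. Below is a toy user-space model of that release path; all names (toy_page, toy_put_compound_page, toy_free_huge_page) are invented and the struct bears no relation to the real struct page layout.

#include <stdio.h>

struct toy_page {
	int count;				/* head page refcount */
	struct toy_page *head;			/* set on every subpage */
	void (*dtor)(struct toy_page *head);	/* stored in slot 1 of the compound */
};

static void toy_free_huge_page(struct toy_page *head)
{
	printf("freeing compound page at %p\n", (void *)head);
}

static void toy_put_compound_page(struct toy_page *page)
{
	page = page->head;			/* like page_private(page) */
	if (--page->count == 0)
		(*page[1].dtor)(page);		/* like page[1].mapping */
}

int main(void)
{
	struct toy_page huge[4];
	int i;

	for (i = 0; i < 4; i++)
		huge[i].head = &huge[0];
	huge[0].count = 2;
	huge[1].dtor = toy_free_huge_page;

	toy_put_compound_page(&huge[3]);	/* drops the head refcount to 1 */
	toy_put_compound_page(&huge[0]);	/* last reference: destructor runs */
	return 0;
}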
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd06..1838c15ca4fd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
+
+		if (!sc->may_swap && page_mapped(page))
+			goto keep_locked;
+
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 			goto unlock_retry;
 
 	if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
  * pages are swapped out.
  *
  * The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  *
  * Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
 			goto unlock_both;
 
 		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
 			rc = mapping->a_ops->migratepage(newpage, page);
 			goto unlock_both;
 		}
 
 		/*
-		 * Trigger writeout if page is dirty
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
 		 */
 		if (PageDirty(page)) {
 			switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
 				; /* try to migrate the page below */
 			}
 		}
+
 		/*
-		 * If we have no buffer or can release the buffer
-		 * then do a simple migration.
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
 		 */
 		if (!page_has_buffers(page) ||
 		    try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
 		 * swap them out.
 		 */
 		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers..... As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
 			unlock_page(newpage);
 			newpage = NULL;
 			rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	struct page *page;
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
+
+	if (unlikely(sc->may_swap)) {
+		long mapped_ratio;
+		long distress;
+		long swap_tendency;
+
+		/*
+		 * `distress' is a measure of how much trouble we're having
+		 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+		 */
+		distress = 100 >> zone->prev_priority;
+
+		/*
+		 * The point of this algorithm is to decide when to start
+		 * reclaiming mapped memory instead of just pagecache. Work out
+		 * how much memory
+		 * is mapped.
+		 */
+		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+		/*
+		 * Now decide how much we really want to unmap some pages. The
+		 * mapped ratio is downgraded - just because there's a lot of
+		 * mapped memory doesn't necessarily mean that page reclaim
+		 * isn't succeeding.
+		 *
+		 * The distress ratio is important - we don't want to start
+		 * going oom.
+		 *
+		 * A 100% value of vm_swappiness overrides this algorithm
+		 * altogether.
+		 */
+		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+		/*
+		 * Now use this metric to decide whether to start moving mapped
+		 * memory onto the inactive list.
+		 */
+		if (swap_tendency >= 100)
+			reclaim_mapped = 1;
+	}
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages. 0 -> no problems. 100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache. Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages. The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1595,9 +1621,7 @@ scan:
 			sc.nr_reclaimed = 0;
 			sc.priority = priority;
 			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-			atomic_inc(&zone->reclaim_in_progress);
 			shrink_zone(zone, &sc);
-			atomic_dec(&zone->reclaim_in_progress);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
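
For reference, the swap_tendency heuristic that refill_inactive_zone() now evaluates only under the sc->may_swap check works out as distress + mapped_ratio/2 + vm_swappiness, and mapped pages start being reclaimed once it reaches 100. The stand-alone C example below just walks through that arithmetic; the function name and the numbers are invented for illustration and are not kernel code.

#include <stdio.h>

static int reclaim_mapped(long nr_mapped, long total_memory,
			  int prev_priority, long vm_swappiness)
{
	long distress = 100 >> prev_priority;
	long mapped_ratio = nr_mapped * 100 / total_memory;
	long swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;

	return swap_tendency >= 100;
}

int main(void)
{
	/* 50% of memory mapped, no reclaim trouble (priority 12), swappiness 60:
	 * 25 + 0 + 60 = 85 -> leave mapped pages alone. */
	printf("%d\n", reclaim_mapped(512, 1024, 12, 60));

	/* Same load but reclaim is struggling (prev_priority 0):
	 * 25 + 100 + 60 = 185 -> start unmapping. */
	printf("%d\n", reclaim_mapped(512, 1024, 0, 60));
	return 0;
}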