Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c        | 29
-rw-r--r--  mm/hugetlb.c        |  8
-rw-r--r--  mm/internal.h       |  2
-rw-r--r--  mm/memcontrol.c     | 63
-rw-r--r--  mm/memory_hotplug.c |  2
-rw-r--r--  mm/mempolicy.c      |  2
-rw-r--r--  mm/migrate.c        | 48
-rw-r--r--  mm/mmap.c           | 16
-rw-r--r--  mm/mmzone.c         | 21
-rw-r--r--  mm/mprotect.c       |  2
-rw-r--r--  mm/nommu.c          |  1
-rw-r--r--  mm/page_alloc.c     | 56
-rw-r--r--  mm/vmalloc.c        |  9
-rw-r--r--  mm/vmscan.c         | 23
-rw-r--r--  mm/vmstat.c         | 68
15 files changed, 220 insertions, 130 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 3d4df44e4221..9701a501f769 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -631,7 +631,9 @@ repeat: | |||
631 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); | 631 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); |
632 | if (pagep) { | 632 | if (pagep) { |
633 | page = radix_tree_deref_slot(pagep); | 633 | page = radix_tree_deref_slot(pagep); |
634 | if (unlikely(!page || page == RADIX_TREE_RETRY)) | 634 | if (unlikely(!page)) |
635 | goto out; | ||
636 | if (radix_tree_deref_retry(page)) | ||
635 | goto repeat; | 637 | goto repeat; |
636 | 638 | ||
637 | if (!page_cache_get_speculative(page)) | 639 | if (!page_cache_get_speculative(page)) |
@@ -647,6 +649,7 @@ repeat: | |||
647 | goto repeat; | 649 | goto repeat; |
648 | } | 650 | } |
649 | } | 651 | } |
652 | out: | ||
650 | rcu_read_unlock(); | 653 | rcu_read_unlock(); |
651 | 654 | ||
652 | return page; | 655 | return page; |
@@ -764,12 +767,11 @@ repeat: | |||
764 | page = radix_tree_deref_slot((void **)pages[i]); | 767 | page = radix_tree_deref_slot((void **)pages[i]); |
765 | if (unlikely(!page)) | 768 | if (unlikely(!page)) |
766 | continue; | 769 | continue; |
767 | /* | 770 | if (radix_tree_deref_retry(page)) { |
768 | * this can only trigger if nr_found == 1, making livelock | 771 | if (ret) |
769 | * a non issue. | 772 | start = pages[ret-1]->index; |
770 | */ | ||
771 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
772 | goto restart; | 773 | goto restart; |
774 | } | ||
773 | 775 | ||
774 | if (!page_cache_get_speculative(page)) | 776 | if (!page_cache_get_speculative(page)) |
775 | goto repeat; | 777 | goto repeat; |
@@ -817,11 +819,7 @@ repeat: | |||
817 | page = radix_tree_deref_slot((void **)pages[i]); | 819 | page = radix_tree_deref_slot((void **)pages[i]); |
818 | if (unlikely(!page)) | 820 | if (unlikely(!page)) |
819 | continue; | 821 | continue; |
820 | /* | 822 | if (radix_tree_deref_retry(page)) |
821 | * this can only trigger if nr_found == 1, making livelock | ||
822 | * a non issue. | ||
823 | */ | ||
824 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
825 | goto restart; | 823 | goto restart; |
826 | 824 | ||
827 | if (page->mapping == NULL || page->index != index) | 825 | if (page->mapping == NULL || page->index != index) |
@@ -874,11 +872,7 @@ repeat: | |||
874 | page = radix_tree_deref_slot((void **)pages[i]); | 872 | page = radix_tree_deref_slot((void **)pages[i]); |
875 | if (unlikely(!page)) | 873 | if (unlikely(!page)) |
876 | continue; | 874 | continue; |
877 | /* | 875 | if (radix_tree_deref_retry(page)) |
878 | * this can only trigger if nr_found == 1, making livelock | ||
879 | * a non issue. | ||
880 | */ | ||
881 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
882 | goto restart; | 876 | goto restart; |
883 | 877 | ||
884 | if (!page_cache_get_speculative(page)) | 878 | if (!page_cache_get_speculative(page)) |
@@ -1016,6 +1010,9 @@ find_page: | |||
1016 | goto page_not_up_to_date; | 1010 | goto page_not_up_to_date; |
1017 | if (!trylock_page(page)) | 1011 | if (!trylock_page(page)) |
1018 | goto page_not_up_to_date; | 1012 | goto page_not_up_to_date; |
1013 | /* Did it get truncated before we got the lock? */ | ||
1014 | if (!page->mapping) | ||
1015 | goto page_not_up_to_date_locked; | ||
1019 | if (!mapping->a_ops->is_partially_uptodate(page, | 1016 | if (!mapping->a_ops->is_partially_uptodate(page, |
1020 | desc, offset)) | 1017 | desc, offset)) |
1021 | goto page_not_up_to_date_locked; | 1018 | goto page_not_up_to_date_locked; |
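The filemap.c hunks replace open-coded RADIX_TREE_RETRY comparisons with the radix_tree_deref_retry() helper. Below is a minimal sketch of the RCU lookup pattern the new helper supports, simplified from find_get_page() in this diff; it assumes the kernel-internal radix-tree and page-cache APIs of this era and is not a drop-in copy of the function.

/*
 * Sketch of an RCU-safe page-cache lookup using radix_tree_deref_retry().
 * Simplified from the find_get_page() hunk above.
 */
static struct page *lookup_page_sketch(struct address_space *mapping,
				       pgoff_t offset)
{
	void **slot;
	struct page *page;

	rcu_read_lock();
repeat:
	page = NULL;
	slot = radix_tree_lookup_slot(&mapping->page_tree, offset);
	if (slot) {
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			goto out;
		/* Slot was moved by a concurrent tree restructure: retry. */
		if (radix_tree_deref_retry(page))
			goto repeat;
		if (!page_cache_get_speculative(page))
			goto repeat;
		/* Re-check: the page may have been freed and reused. */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();
	return page;
}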
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c03273807182..2697806746d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -2380,8 +2380,11 @@ retry_avoidcopy: | |||
2380 | * When the original hugepage is shared one, it does not have | 2380 | * When the original hugepage is shared one, it does not have |
2381 | * anon_vma prepared. | 2381 | * anon_vma prepared. |
2382 | */ | 2382 | */ |
2383 | if (unlikely(anon_vma_prepare(vma))) | 2383 | if (unlikely(anon_vma_prepare(vma))) { |
2384 | /* Caller expects lock to be held */ | ||
2385 | spin_lock(&mm->page_table_lock); | ||
2384 | return VM_FAULT_OOM; | 2386 | return VM_FAULT_OOM; |
2387 | } | ||
2385 | 2388 | ||
2386 | copy_huge_page(new_page, old_page, address, vma); | 2389 | copy_huge_page(new_page, old_page, address, vma); |
2387 | __SetPageUptodate(new_page); | 2390 | __SetPageUptodate(new_page); |
@@ -2665,7 +2668,8 @@ out_page_table_lock: | |||
2665 | unlock_page(pagecache_page); | 2668 | unlock_page(pagecache_page); |
2666 | put_page(pagecache_page); | 2669 | put_page(pagecache_page); |
2667 | } | 2670 | } |
2668 | unlock_page(page); | 2671 | if (page != pagecache_page) |
2672 | unlock_page(page); | ||
2669 | 2673 | ||
2670 | out_mutex: | 2674 | out_mutex: |
2671 | mutex_unlock(&hugetlb_instantiation_mutex); | 2675 | mutex_unlock(&hugetlb_instantiation_mutex); |
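The hugetlb.c fix re-takes page_table_lock on the early OOM return because hugetlb_cow() is entered with that lock held and every return path must hold it again. A hedged outline of the convention follows; the signature and surrounding steps are simplified from the era's hugetlb_cow() and are illustrative only.

/*
 * Sketch of the locking convention the fix preserves: entered with
 * mm->page_table_lock held, the lock is dropped around the sleeping
 * allocation/copy and must be held again on every return.
 */
static int hugetlb_cow_sketch(struct mm_struct *mm, struct vm_area_struct *vma)
{
	/* entered with page_table_lock held */
	spin_unlock(&mm->page_table_lock);	/* may sleep below */

	if (unlikely(anon_vma_prepare(vma))) {
		/* Caller expects the lock to be held on return. */
		spin_lock(&mm->page_table_lock);
		return VM_FAULT_OOM;
	}

	/* ... allocate the new huge page and copy the old one ... */

	spin_lock(&mm->page_table_lock);	/* normal path re-takes it too */
	return 0;
}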
diff --git a/mm/internal.h b/mm/internal.h index 6a697bb97fc5..dedb0aff673f 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); | |||
62 | */ | 62 | */ |
63 | static inline unsigned long page_order(struct page *page) | 63 | static inline unsigned long page_order(struct page *page) |
64 | { | 64 | { |
65 | VM_BUG_ON(!PageBuddy(page)); | 65 | /* PageBuddy() must be checked by the caller */ |
66 | return page_private(page); | 66 | return page_private(page); |
67 | } | 67 | } |
68 | 68 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9be3cf8a5da4..a9a534a38ac0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -269,13 +269,14 @@ enum move_type { | |||
269 | 269 | ||
270 | /* "mc" and its members are protected by cgroup_mutex */ | 270 | /* "mc" and its members are protected by cgroup_mutex */ |
271 | static struct move_charge_struct { | 271 | static struct move_charge_struct { |
272 | spinlock_t lock; /* for from, to, moving_task */ | 272 | spinlock_t lock; /* for from, to */ |
273 | struct mem_cgroup *from; | 273 | struct mem_cgroup *from; |
274 | struct mem_cgroup *to; | 274 | struct mem_cgroup *to; |
275 | unsigned long precharge; | 275 | unsigned long precharge; |
276 | unsigned long moved_charge; | 276 | unsigned long moved_charge; |
277 | unsigned long moved_swap; | 277 | unsigned long moved_swap; |
278 | struct task_struct *moving_task; /* a task moving charges */ | 278 | struct task_struct *moving_task; /* a task moving charges */ |
279 | struct mm_struct *mm; | ||
279 | wait_queue_head_t waitq; /* a waitq for other context */ | 280 | wait_queue_head_t waitq; /* a waitq for other context */ |
280 | } mc = { | 281 | } mc = { |
281 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | 282 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), |
@@ -1646,6 +1647,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1646 | if (likely(!ret)) | 1647 | if (likely(!ret)) |
1647 | return CHARGE_OK; | 1648 | return CHARGE_OK; |
1648 | 1649 | ||
1650 | res_counter_uncharge(&mem->res, csize); | ||
1649 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 1651 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); |
1650 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1652 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1651 | } else | 1653 | } else |
@@ -1729,19 +1731,18 @@ again: | |||
1729 | 1731 | ||
1730 | rcu_read_lock(); | 1732 | rcu_read_lock(); |
1731 | p = rcu_dereference(mm->owner); | 1733 | p = rcu_dereference(mm->owner); |
1732 | VM_BUG_ON(!p); | ||
1733 | /* | 1734 | /* |
1734 | * because we don't have task_lock(), "p" can exit while | 1735 | * Because we don't have task_lock(), "p" can exit. |
1735 | * we're here. In that case, "mem" can point to root | 1736 | * In that case, "mem" can point to root or p can be NULL with |
1736 | * cgroup but never be NULL. (and task_struct itself is freed | 1737 | * race with swapoff. Then, we have small risk of mis-accounting. |
1737 | * by RCU, cgroup itself is RCU safe.) Then, we have small | 1738 | * But such kind of mis-account by race always happens because |
1738 | * risk here to get wrong cgroup. But such kind of mis-account | 1739 | * we don't have cgroup_mutex(). It's overkill and we allo that |
1739 | * by race always happens because we don't have cgroup_mutex(). | 1740 | * small race, here. |
1740 | * It's overkill and we allow that small race, here. | 1742 | * (*) swapoff et al. will charge against mm-struct not against |
1742 | * task-struct. So, mm->owner can be NULL. | ||
1741 | */ | 1743 | */ |
1742 | mem = mem_cgroup_from_task(p); | 1744 | mem = mem_cgroup_from_task(p); |
1743 | VM_BUG_ON(!mem); | 1745 | if (!mem || mem_cgroup_is_root(mem)) { |
1744 | if (mem_cgroup_is_root(mem)) { | ||
1745 | rcu_read_unlock(); | 1746 | rcu_read_unlock(); |
1746 | goto done; | 1747 | goto done; |
1747 | } | 1748 | } |
@@ -4445,7 +4446,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4445 | unsigned long precharge; | 4446 | unsigned long precharge; |
4446 | struct vm_area_struct *vma; | 4447 | struct vm_area_struct *vma; |
4447 | 4448 | ||
4448 | down_read(&mm->mmap_sem); | 4449 | /* We've already held the mmap_sem */ |
4449 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4450 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4450 | struct mm_walk mem_cgroup_count_precharge_walk = { | 4451 | struct mm_walk mem_cgroup_count_precharge_walk = { |
4451 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | 4452 | .pmd_entry = mem_cgroup_count_precharge_pte_range, |
@@ -4457,7 +4458,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4457 | walk_page_range(vma->vm_start, vma->vm_end, | 4458 | walk_page_range(vma->vm_start, vma->vm_end, |
4458 | &mem_cgroup_count_precharge_walk); | 4459 | &mem_cgroup_count_precharge_walk); |
4459 | } | 4460 | } |
4460 | up_read(&mm->mmap_sem); | ||
4461 | 4461 | ||
4462 | precharge = mc.precharge; | 4462 | precharge = mc.precharge; |
4463 | mc.precharge = 0; | 4463 | mc.precharge = 0; |
@@ -4508,11 +4508,16 @@ static void mem_cgroup_clear_mc(void) | |||
4508 | 4508 | ||
4509 | mc.moved_swap = 0; | 4509 | mc.moved_swap = 0; |
4510 | } | 4510 | } |
4511 | if (mc.mm) { | ||
4512 | up_read(&mc.mm->mmap_sem); | ||
4513 | mmput(mc.mm); | ||
4514 | } | ||
4511 | spin_lock(&mc.lock); | 4515 | spin_lock(&mc.lock); |
4512 | mc.from = NULL; | 4516 | mc.from = NULL; |
4513 | mc.to = NULL; | 4517 | mc.to = NULL; |
4514 | mc.moving_task = NULL; | ||
4515 | spin_unlock(&mc.lock); | 4518 | spin_unlock(&mc.lock); |
4519 | mc.moving_task = NULL; | ||
4520 | mc.mm = NULL; | ||
4516 | memcg_oom_recover(from); | 4521 | memcg_oom_recover(from); |
4517 | memcg_oom_recover(to); | 4522 | memcg_oom_recover(to); |
4518 | wake_up_all(&mc.waitq); | 4523 | wake_up_all(&mc.waitq); |
@@ -4537,26 +4542,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
4537 | return 0; | 4542 | return 0; |
4538 | /* We move charges only when we move a owner of the mm */ | 4543 | /* We move charges only when we move a owner of the mm */ |
4539 | if (mm->owner == p) { | 4544 | if (mm->owner == p) { |
4545 | /* | ||
4546 | * We do all the move charge works under one mmap_sem to | ||
4547 | * avoid deadlock with down_write(&mmap_sem) | ||
4548 | * -> try_charge() -> if (mc.moving_task) -> sleep. | ||
4549 | */ | ||
4550 | down_read(&mm->mmap_sem); | ||
4551 | |||
4540 | VM_BUG_ON(mc.from); | 4552 | VM_BUG_ON(mc.from); |
4541 | VM_BUG_ON(mc.to); | 4553 | VM_BUG_ON(mc.to); |
4542 | VM_BUG_ON(mc.precharge); | 4554 | VM_BUG_ON(mc.precharge); |
4543 | VM_BUG_ON(mc.moved_charge); | 4555 | VM_BUG_ON(mc.moved_charge); |
4544 | VM_BUG_ON(mc.moved_swap); | 4556 | VM_BUG_ON(mc.moved_swap); |
4545 | VM_BUG_ON(mc.moving_task); | 4557 | VM_BUG_ON(mc.moving_task); |
4558 | VM_BUG_ON(mc.mm); | ||
4559 | |||
4546 | spin_lock(&mc.lock); | 4560 | spin_lock(&mc.lock); |
4547 | mc.from = from; | 4561 | mc.from = from; |
4548 | mc.to = mem; | 4562 | mc.to = mem; |
4549 | mc.precharge = 0; | 4563 | mc.precharge = 0; |
4550 | mc.moved_charge = 0; | 4564 | mc.moved_charge = 0; |
4551 | mc.moved_swap = 0; | 4565 | mc.moved_swap = 0; |
4552 | mc.moving_task = current; | ||
4553 | spin_unlock(&mc.lock); | 4566 | spin_unlock(&mc.lock); |
4567 | mc.moving_task = current; | ||
4568 | mc.mm = mm; | ||
4554 | 4569 | ||
4555 | ret = mem_cgroup_precharge_mc(mm); | 4570 | ret = mem_cgroup_precharge_mc(mm); |
4556 | if (ret) | 4571 | if (ret) |
4557 | mem_cgroup_clear_mc(); | 4572 | mem_cgroup_clear_mc(); |
4558 | } | 4573 | /* We call up_read() and mmput() in clear_mc(). */ |
4559 | mmput(mm); | 4574 | } else |
4575 | mmput(mm); | ||
4560 | } | 4576 | } |
4561 | return ret; | 4577 | return ret; |
4562 | } | 4578 | } |
@@ -4644,7 +4660,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4644 | struct vm_area_struct *vma; | 4660 | struct vm_area_struct *vma; |
4645 | 4661 | ||
4646 | lru_add_drain_all(); | 4662 | lru_add_drain_all(); |
4647 | down_read(&mm->mmap_sem); | 4663 | /* We've already held the mmap_sem */ |
4648 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4664 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4649 | int ret; | 4665 | int ret; |
4650 | struct mm_walk mem_cgroup_move_charge_walk = { | 4666 | struct mm_walk mem_cgroup_move_charge_walk = { |
@@ -4663,7 +4679,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4663 | */ | 4679 | */ |
4664 | break; | 4680 | break; |
4665 | } | 4681 | } |
4666 | up_read(&mm->mmap_sem); | ||
4667 | } | 4682 | } |
4668 | 4683 | ||
4669 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4684 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
@@ -4672,17 +4687,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
4672 | struct task_struct *p, | 4687 | struct task_struct *p, |
4673 | bool threadgroup) | 4688 | bool threadgroup) |
4674 | { | 4689 | { |
4675 | struct mm_struct *mm; | 4690 | if (!mc.mm) |
4676 | |||
4677 | if (!mc.to) | ||
4678 | /* no need to move charge */ | 4691 | /* no need to move charge */ |
4679 | return; | 4692 | return; |
4680 | 4693 | ||
4681 | mm = get_task_mm(p); | 4694 | mem_cgroup_move_charge(mc.mm); |
4682 | if (mm) { | ||
4683 | mem_cgroup_move_charge(mm); | ||
4684 | mmput(mm); | ||
4685 | } | ||
4686 | mem_cgroup_clear_mc(); | 4695 | mem_cgroup_clear_mc(); |
4687 | } | 4696 | } |
4688 | #else /* !CONFIG_MMU */ | 4697 | #else /* !CONFIG_MMU */ |
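The memcontrol.c change takes mmap_sem once in can_attach() and keeps it, via the new mc.mm field, until mem_cgroup_clear_mc(), instead of re-taking it in the precharge and move phases. The following hedged outline restates that lifecycle; the function names and error handling are abbreviated and the mc struct is the one declared earlier in this file.

/*
 * Sketch of the move-charge lifecycle after this change: mmap_sem is
 * read-locked exactly once and released in mem_cgroup_clear_mc(), which
 * avoids the down_write(mmap_sem) -> try_charge() -> "wait for
 * mc.moving_task" deadlock described in the hunk's comment.
 */
static int can_attach_sketch(struct mm_struct *mm, struct mem_cgroup *from,
			     struct mem_cgroup *to)
{
	down_read(&mm->mmap_sem);	/* held across the whole move */
	mc.from = from;
	mc.to = to;
	mc.moving_task = current;
	mc.mm = mm;			/* clear_mc() will up_read() + mmput() */
	return mem_cgroup_precharge_mc(mm);	/* on failure, caller clears mc */
}

static void move_task_sketch(void)
{
	if (!mc.mm)
		return;			/* nothing was precharged */
	mem_cgroup_move_charge(mc.mm);	/* walks VMAs; mmap_sem already held */
	mem_cgroup_clear_mc();		/* up_read(), mmput(), reset mc */
}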
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dd186c1a5d53..6345dfe78d2c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -659,7 +659,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | |||
659 | * Scanning pfn is much easier than scanning lru list. | 659 | * Scanning pfn is much easier than scanning lru list. |
660 | * Scan pfn from start to end and Find LRU page. | 660 | * Scan pfn from start to end and Find LRU page. |
661 | */ | 661 | */ |
662 | int scan_lru_pages(unsigned long start, unsigned long end) | 662 | unsigned long scan_lru_pages(unsigned long start, unsigned long end) |
663 | { | 663 | { |
664 | unsigned long pfn; | 664 | unsigned long pfn; |
665 | struct page *page; | 665 | struct page *page; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f969da5dd8a2..c1002c68d617 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1588,7 +1588,7 @@ unsigned slab_node(struct mempolicy *policy) | |||
1588 | (void)first_zones_zonelist(zonelist, highest_zoneidx, | 1588 | (void)first_zones_zonelist(zonelist, highest_zoneidx, |
1589 | &policy->v.nodes, | 1589 | &policy->v.nodes, |
1590 | &zone); | 1590 | &zone); |
1591 | return zone->node; | 1591 | return zone ? zone->node : numa_node_id(); |
1592 | } | 1592 | } |
1593 | 1593 | ||
1594 | default: | 1594 | default: |
diff --git a/mm/migrate.c b/mm/migrate.c index 38e7cad782f4..2cfa9bf1f0d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -553,7 +553,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
553 | int *result = NULL; | 553 | int *result = NULL; |
554 | struct page *newpage = get_new_page(page, private, &result); | 554 | struct page *newpage = get_new_page(page, private, &result); |
555 | int remap_swapcache = 1; | 555 | int remap_swapcache = 1; |
556 | int rcu_locked = 0; | ||
557 | int charge = 0; | 556 | int charge = 0; |
558 | struct mem_cgroup *mem = NULL; | 557 | struct mem_cgroup *mem = NULL; |
559 | struct anon_vma *anon_vma = NULL; | 558 | struct anon_vma *anon_vma = NULL; |
@@ -605,20 +604,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
605 | /* | 604 | /* |
606 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, | 605 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, |
607 | * we cannot notice that anon_vma is freed while we migrates a page. | 606 | * we cannot notice that anon_vma is freed while we migrates a page. |
608 | * This rcu_read_lock() delays freeing anon_vma pointer until the end | 607 | * This get_anon_vma() delays freeing anon_vma pointer until the end |
609 | * of migration. File cache pages are no problem because of page_lock() | 608 | * of migration. File cache pages are no problem because of page_lock() |
610 | * File Caches may use write_page() or lock_page() in migration, then, | 609 | * File Caches may use write_page() or lock_page() in migration, then, |
611 | * just care Anon page here. | 610 | * just care Anon page here. |
612 | */ | 611 | */ |
613 | if (PageAnon(page)) { | 612 | if (PageAnon(page)) { |
614 | rcu_read_lock(); | 613 | /* |
615 | rcu_locked = 1; | 614 | * Only page_lock_anon_vma() understands the subtleties of |
616 | 615 | * getting a hold on an anon_vma from outside one of its mms. | |
617 | /* Determine how to safely use anon_vma */ | 616 | */ |
618 | if (!page_mapped(page)) { | 617 | anon_vma = page_lock_anon_vma(page); |
619 | if (!PageSwapCache(page)) | 618 | if (anon_vma) { |
620 | goto rcu_unlock; | 619 | /* |
621 | 620 | * Take a reference count on the anon_vma if the | |
621 | * page is mapped so that it is guaranteed to | ||
622 | * exist when the page is remapped later | ||
623 | */ | ||
624 | get_anon_vma(anon_vma); | ||
625 | page_unlock_anon_vma(anon_vma); | ||
626 | } else if (PageSwapCache(page)) { | ||
622 | /* | 627 | /* |
623 | * We cannot be sure that the anon_vma of an unmapped | 628 | * We cannot be sure that the anon_vma of an unmapped |
624 | * swapcache page is safe to use because we don't | 629 | * swapcache page is safe to use because we don't |
@@ -633,13 +638,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
633 | */ | 638 | */ |
634 | remap_swapcache = 0; | 639 | remap_swapcache = 0; |
635 | } else { | 640 | } else { |
636 | /* | 641 | goto uncharge; |
637 | * Take a reference count on the anon_vma if the | ||
638 | * page is mapped so that it is guaranteed to | ||
639 | * exist when the page is remapped later | ||
640 | */ | ||
641 | anon_vma = page_anon_vma(page); | ||
642 | get_anon_vma(anon_vma); | ||
643 | } | 642 | } |
644 | } | 643 | } |
645 | 644 | ||
@@ -656,16 +655,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
656 | * free the metadata, so the page can be freed. | 655 | * free the metadata, so the page can be freed. |
657 | */ | 656 | */ |
658 | if (!page->mapping) { | 657 | if (!page->mapping) { |
659 | if (!PageAnon(page) && page_has_private(page)) { | 658 | VM_BUG_ON(PageAnon(page)); |
660 | /* | 659 | if (page_has_private(page)) { |
661 | * Go direct to try_to_free_buffers() here because | ||
662 | * a) that's what try_to_release_page() would do anyway | ||
663 | * b) we may be under rcu_read_lock() here, so we can't | ||
664 | * use GFP_KERNEL which is what try_to_release_page() | ||
665 | * needs to be effective. | ||
666 | */ | ||
667 | try_to_free_buffers(page); | 660 | try_to_free_buffers(page); |
668 | goto rcu_unlock; | 661 | goto uncharge; |
669 | } | 662 | } |
670 | goto skip_unmap; | 663 | goto skip_unmap; |
671 | } | 664 | } |
@@ -679,14 +672,11 @@ skip_unmap: | |||
679 | 672 | ||
680 | if (rc && remap_swapcache) | 673 | if (rc && remap_swapcache) |
681 | remove_migration_ptes(page, page); | 674 | remove_migration_ptes(page, page); |
682 | rcu_unlock: | ||
683 | 675 | ||
684 | /* Drop an anon_vma reference if we took one */ | 676 | /* Drop an anon_vma reference if we took one */ |
685 | if (anon_vma) | 677 | if (anon_vma) |
686 | drop_anon_vma(anon_vma); | 678 | drop_anon_vma(anon_vma); |
687 | 679 | ||
688 | if (rcu_locked) | ||
689 | rcu_read_unlock(); | ||
690 | uncharge: | 680 | uncharge: |
691 | if (!charge) | 681 | if (!charge) |
692 | mem_cgroup_end_migration(mem, page, newpage); | 682 | mem_cgroup_end_migration(mem, page, newpage); |
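The migrate.c hunks drop the long-lived rcu_read_lock() and instead pin the anon_vma with a reference taken under page_lock_anon_vma(). A hedged sketch of the new pattern for anonymous pages follows; it is simplified from unmap_and_move() and the helper name and return convention are illustrative.

/*
 * Sketch: for an anonymous page, pin the anon_vma with a refcount taken
 * under the anon_vma lock instead of holding rcu_read_lock() across the
 * whole migration.  Returns 0 on success, -EAGAIN if the caller should
 * abort (unmapped, non-swapcache anon page whose anon_vma may be gone).
 */
static int pin_anon_vma_sketch(struct page *page, struct anon_vma **avp,
			       int *remap_swapcache)
{
	struct anon_vma *anon_vma;

	*avp = NULL;
	if (!PageAnon(page))
		return 0;

	/* page_lock_anon_vma() handles the RCU/refcount subtleties. */
	anon_vma = page_lock_anon_vma(page);
	if (anon_vma) {
		get_anon_vma(anon_vma);		/* keep it alive until remap */
		page_unlock_anon_vma(anon_vma);
		*avp = anon_vma;		/* drop_anon_vma() when done */
	} else if (PageSwapCache(page)) {
		/* unmapped swapcache: migrate, but do not remap ptes later */
		*remap_swapcache = 0;
	} else {
		return -EAGAIN;			/* "goto uncharge" in the hunk above */
	}
	return 0;
}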
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -2460,6 +2460,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2460 | unsigned long addr, unsigned long len, | 2460 | unsigned long addr, unsigned long len, |
2461 | unsigned long vm_flags, struct page **pages) | 2461 | unsigned long vm_flags, struct page **pages) |
2462 | { | 2462 | { |
2463 | int ret; | ||
2463 | struct vm_area_struct *vma; | 2464 | struct vm_area_struct *vma; |
2464 | 2465 | ||
2465 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2466 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
@@ -2477,16 +2478,23 @@ int install_special_mapping(struct mm_struct *mm, | |||
2477 | vma->vm_ops = &special_mapping_vmops; | 2478 | vma->vm_ops = &special_mapping_vmops; |
2478 | vma->vm_private_data = pages; | 2479 | vma->vm_private_data = pages; |
2479 | 2480 | ||
2480 | if (unlikely(insert_vm_struct(mm, vma))) { | 2481 | ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); |
2481 | kmem_cache_free(vm_area_cachep, vma); | 2482 | if (ret) |
2482 | return -ENOMEM; | 2483 | goto out; |
2483 | } | 2484 | |
2485 | ret = insert_vm_struct(mm, vma); | ||
2486 | if (ret) | ||
2487 | goto out; | ||
2484 | 2488 | ||
2485 | mm->total_vm += len >> PAGE_SHIFT; | 2489 | mm->total_vm += len >> PAGE_SHIFT; |
2486 | 2490 | ||
2487 | perf_event_mmap(vma); | 2491 | perf_event_mmap(vma); |
2488 | 2492 | ||
2489 | return 0; | 2493 | return 0; |
2494 | |||
2495 | out: | ||
2496 | kmem_cache_free(vm_area_cachep, vma); | ||
2497 | return ret; | ||
2490 | } | 2498 | } |
2491 | 2499 | ||
2492 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2500 | static DEFINE_MUTEX(mm_all_locks_mutex); |
diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, | |||
87 | return 1; | 87 | return 1; |
88 | } | 88 | } |
89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
90 | |||
91 | #ifdef CONFIG_SMP | ||
92 | /* Called when a more accurate view of NR_FREE_PAGES is needed */ | ||
93 | unsigned long zone_nr_free_pages(struct zone *zone) | ||
94 | { | ||
95 | unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
96 | |||
97 | /* | ||
98 | * While kswapd is awake, it is considered the zone is under some | ||
99 | * memory pressure. Under pressure, there is a risk that | ||
100 | * per-cpu-counter-drift will allow the min watermark to be breached | ||
101 | * potentially causing a live-lock. While kswapd is awake and | ||
102 | * free pages are low, get a better estimate for free pages | ||
103 | */ | ||
104 | if (nr_free_pages < zone->percpu_drift_mark && | ||
105 | !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) | ||
106 | return zone_page_state_snapshot(zone, NR_FREE_PAGES); | ||
107 | |||
108 | return nr_free_pages; | ||
109 | } | ||
110 | #endif /* CONFIG_SMP */ | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 2d1bf7cf8851..4c5133873097 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -211,6 +211,7 @@ success: | |||
211 | mmu_notifier_invalidate_range_end(mm, start, end); | 211 | mmu_notifier_invalidate_range_end(mm, start, end); |
212 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 212 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
213 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 213 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
214 | perf_event_mmap(vma); | ||
214 | return 0; | 215 | return 0; |
215 | 216 | ||
216 | fail: | 217 | fail: |
@@ -299,7 +300,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
299 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); | 300 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); |
300 | if (error) | 301 | if (error) |
301 | goto out; | 302 | goto out; |
302 | perf_event_mmap(vma); | ||
303 | nstart = tmp; | 303 | nstart = tmp; |
304 | 304 | ||
305 | if (nstart < prev->vm_end) | 305 | if (nstart < prev->vm_end) |
diff --git a/mm/nommu.c b/mm/nommu.c index 88ff091eb07a..acb3bd3c1cb9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1668,6 +1668,7 @@ void exit_mmap(struct mm_struct *mm) | |||
1668 | mm->mmap = vma->vm_next; | 1668 | mm->mmap = vma->vm_next; |
1669 | delete_vma_from_mm(vma); | 1669 | delete_vma_from_mm(vma); |
1670 | delete_vma(mm, vma); | 1670 | delete_vma(mm, vma); |
1671 | cond_resched(); | ||
1671 | } | 1672 | } |
1672 | 1673 | ||
1673 | kleave(""); | 1674 | kleave(""); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f12ad1836abe..985e072a3dd9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -103,19 +103,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | |||
103 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | 103 | * only be modified with pm_mutex held, unless the suspend/hibernate code is |
104 | * guaranteed not to run in parallel with that modification). | 104 | * guaranteed not to run in parallel with that modification). |
105 | */ | 105 | */ |
106 | void set_gfp_allowed_mask(gfp_t mask) | 106 | |
107 | static gfp_t saved_gfp_mask; | ||
108 | |||
109 | void pm_restore_gfp_mask(void) | ||
107 | { | 110 | { |
108 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 111 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
109 | gfp_allowed_mask = mask; | 112 | if (saved_gfp_mask) { |
113 | gfp_allowed_mask = saved_gfp_mask; | ||
114 | saved_gfp_mask = 0; | ||
115 | } | ||
110 | } | 116 | } |
111 | 117 | ||
112 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | 118 | void pm_restrict_gfp_mask(void) |
113 | { | 119 | { |
114 | gfp_t ret = gfp_allowed_mask; | ||
115 | |||
116 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 120 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
117 | gfp_allowed_mask &= ~mask; | 121 | WARN_ON(saved_gfp_mask); |
118 | return ret; | 122 | saved_gfp_mask = gfp_allowed_mask; |
123 | gfp_allowed_mask &= ~GFP_IOFS; | ||
119 | } | 124 | } |
120 | #endif /* CONFIG_PM_SLEEP */ | 125 | #endif /* CONFIG_PM_SLEEP */ |
121 | 126 | ||
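The set_gfp_allowed_mask()/clear_gfp_allowed_mask() pair becomes pm_restrict_gfp_mask()/pm_restore_gfp_mask(), which save and restore the mask internally. A hedged sketch of how a suspend path would use the new pair; the call site and do_the_suspend() are hypothetical, the real callers live in the suspend/hibernation core, which holds pm_mutex as the WARN_ONs require.

/*
 * Sketch of the intended calling pattern: block GFP_IOFS allocations
 * while devices are quiesced, then restore the saved mask exactly once.
 */
static int suspend_sketch(void)
{
	int error;

	pm_restrict_gfp_mask();		/* saves gfp_allowed_mask, masks out GFP_IOFS */
	error = do_the_suspend();	/* hypothetical: devices suspended here */
	pm_restore_gfp_mask();		/* puts the saved mask back */
	return error;
}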
@@ -530,7 +535,7 @@ static inline void __free_one_page(struct page *page, | |||
530 | * so it's less likely to be used soon and more likely to be merged | 535 | * so it's less likely to be used soon and more likely to be merged |
531 | * as a higher order page | 536 | * as a higher order page |
532 | */ | 537 | */ |
533 | if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { | 538 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
534 | struct page *higher_page, *higher_buddy; | 539 | struct page *higher_page, *higher_buddy; |
535 | combined_idx = __find_combined_index(page_idx, order); | 540 | combined_idx = __find_combined_index(page_idx, order); |
536 | higher_page = page + combined_idx - page_idx; | 541 | higher_page = page + combined_idx - page_idx; |
@@ -1454,24 +1459,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1454 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1459 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1455 | 1460 | ||
1456 | /* | 1461 | /* |
1457 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1462 | * Return true if free pages are above 'mark'. This takes into account the order |
1458 | * of the allocation. | 1463 | * of the allocation. |
1459 | */ | 1464 | */ |
1460 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1465 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1461 | int classzone_idx, int alloc_flags) | 1466 | int classzone_idx, int alloc_flags, long free_pages) |
1462 | { | 1467 | { |
1463 | /* free_pages may go negative - that's OK */ | 1468 | /* free_pages may go negative - that's OK */ |
1464 | long min = mark; | 1469 | long min = mark; |
1465 | long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; | ||
1466 | int o; | 1470 | int o; |
1467 | 1471 | ||
1472 | free_pages -= (1 << order) + 1; | ||
1468 | if (alloc_flags & ALLOC_HIGH) | 1473 | if (alloc_flags & ALLOC_HIGH) |
1469 | min -= min / 2; | 1474 | min -= min / 2; |
1470 | if (alloc_flags & ALLOC_HARDER) | 1475 | if (alloc_flags & ALLOC_HARDER) |
1471 | min -= min / 4; | 1476 | min -= min / 4; |
1472 | 1477 | ||
1473 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1478 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1474 | return 0; | 1479 | return false; |
1475 | for (o = 0; o < order; o++) { | 1480 | for (o = 0; o < order; o++) { |
1476 | /* At the next order, this order's pages become unavailable */ | 1481 | /* At the next order, this order's pages become unavailable */ |
1477 | free_pages -= z->free_area[o].nr_free << o; | 1482 | free_pages -= z->free_area[o].nr_free << o; |
@@ -1480,9 +1485,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1480 | min >>= 1; | 1485 | min >>= 1; |
1481 | 1486 | ||
1482 | if (free_pages <= min) | 1487 | if (free_pages <= min) |
1483 | return 0; | 1488 | return false; |
1484 | } | 1489 | } |
1485 | return 1; | 1490 | return true; |
1491 | } | ||
1492 | |||
1493 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | ||
1494 | int classzone_idx, int alloc_flags) | ||
1495 | { | ||
1496 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1497 | zone_page_state(z, NR_FREE_PAGES)); | ||
1498 | } | ||
1499 | |||
1500 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | ||
1501 | int classzone_idx, int alloc_flags) | ||
1502 | { | ||
1503 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | ||
1504 | |||
1505 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | ||
1506 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | ||
1507 | |||
1508 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1509 | free_pages); | ||
1486 | } | 1510 | } |
1487 | 1511 | ||
1488 | #ifdef CONFIG_NUMA | 1512 | #ifdef CONFIG_NUMA |
@@ -2436,7 +2460,7 @@ void show_free_areas(void) | |||
2436 | " all_unreclaimable? %s" | 2460 | " all_unreclaimable? %s" |
2437 | "\n", | 2461 | "\n", |
2438 | zone->name, | 2462 | zone->name, |
2439 | K(zone_nr_free_pages(zone)), | 2463 | K(zone_page_state(zone, NR_FREE_PAGES)), |
2440 | K(min_wmark_pages(zone)), | 2464 | K(min_wmark_pages(zone)), |
2441 | K(low_wmark_pages(zone)), | 2465 | K(low_wmark_pages(zone)), |
2442 | K(high_wmark_pages(zone)), | 2466 | K(high_wmark_pages(zone)), |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b8889da69a6..d8087f0db507 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | |||
517 | static void purge_fragmented_blocks_allcpus(void); | 517 | static void purge_fragmented_blocks_allcpus(void); |
518 | 518 | ||
519 | /* | 519 | /* |
520 | * called before a call to iounmap() if the caller wants vm_area_struct's | ||
521 | * immediately freed. | ||
522 | */ | ||
523 | void set_iounmap_nonlazy(void) | ||
524 | { | ||
525 | atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); | ||
526 | } | ||
527 | |||
528 | /* | ||
520 | * Purges all lazily-freed vmap areas. | 529 | * Purges all lazily-freed vmap areas. |
521 | * | 530 | * |
522 | * If sync is 0 then don't purge if there is already a purge in progress. | 531 | * If sync is 0 then don't purge if there is already a purge in progress. |
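vmalloc.c gains set_iounmap_nonlazy() so a caller can force the next iounmap() to purge lazily-freed vmap areas immediately. A hedged usage sketch follows; the intended consumer is a crash-dump copy path, but the function name and exact flow here are illustrative, not taken from this diff.

/*
 * Sketch: read a page of old-kernel memory through a temporary ioremap
 * mapping and make sure the vmap area is really torn down before the
 * next mapping is created.
 */
static ssize_t copy_oldmem_page_sketch(unsigned long pfn, char *buf,
				       size_t csize, unsigned long offset)
{
	void *vaddr;

	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
	if (!vaddr)
		return -ENOMEM;

	memcpy(buf, vaddr + offset, csize);

	set_iounmap_nonlazy();	/* next iounmap() purges immediately */
	iounmap(vaddr);
	return csize;
}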
diff --git a/mm/vmscan.c b/mm/vmscan.c index c5dfabf25f11..3e71cb1ee28c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2082 | if (zone->all_unreclaimable) | 2082 | if (zone->all_unreclaimable) |
2083 | continue; | 2083 | continue; |
2084 | 2084 | ||
2085 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2085 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2086 | 0, 0)) | 2086 | 0, 0)) |
2087 | return 1; | 2087 | return 1; |
2088 | } | 2088 | } |
@@ -2169,7 +2169,7 @@ loop_again: | |||
2169 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2169 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2170 | &sc, priority, 0); | 2170 | &sc, priority, 0); |
2171 | 2171 | ||
2172 | if (!zone_watermark_ok(zone, order, | 2172 | if (!zone_watermark_ok_safe(zone, order, |
2173 | high_wmark_pages(zone), 0, 0)) { | 2173 | high_wmark_pages(zone), 0, 0)) { |
2174 | end_zone = i; | 2174 | end_zone = i; |
2175 | break; | 2175 | break; |
@@ -2215,7 +2215,7 @@ loop_again: | |||
2215 | * We put equal pressure on every zone, unless one | 2215 | * We put equal pressure on every zone, unless one |
2216 | * zone has way too many pages free already. | 2216 | * zone has way too many pages free already. |
2217 | */ | 2217 | */ |
2218 | if (!zone_watermark_ok(zone, order, | 2218 | if (!zone_watermark_ok_safe(zone, order, |
2219 | 8*high_wmark_pages(zone), end_zone, 0)) | 2219 | 8*high_wmark_pages(zone), end_zone, 0)) |
2220 | shrink_zone(priority, zone, &sc); | 2220 | shrink_zone(priority, zone, &sc); |
2221 | reclaim_state->reclaimed_slab = 0; | 2221 | reclaim_state->reclaimed_slab = 0; |
@@ -2236,7 +2236,7 @@ loop_again: | |||
2236 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2236 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2237 | sc.may_writepage = 1; | 2237 | sc.may_writepage = 1; |
2238 | 2238 | ||
2239 | if (!zone_watermark_ok(zone, order, | 2239 | if (!zone_watermark_ok_safe(zone, order, |
2240 | high_wmark_pages(zone), end_zone, 0)) { | 2240 | high_wmark_pages(zone), end_zone, 0)) { |
2241 | all_zones_ok = 0; | 2241 | all_zones_ok = 0; |
2242 | /* | 2242 | /* |
@@ -2244,7 +2244,7 @@ loop_again: | |||
2244 | * means that we have a GFP_ATOMIC allocation | 2244 | * means that we have a GFP_ATOMIC allocation |
2245 | * failure risk. Hurry up! | 2245 | * failure risk. Hurry up! |
2246 | */ | 2246 | */ |
2247 | if (!zone_watermark_ok(zone, order, | 2247 | if (!zone_watermark_ok_safe(zone, order, |
2248 | min_wmark_pages(zone), end_zone, 0)) | 2248 | min_wmark_pages(zone), end_zone, 0)) |
2249 | has_under_min_watermark_zone = 1; | 2249 | has_under_min_watermark_zone = 1; |
2250 | } | 2250 | } |
@@ -2378,7 +2378,9 @@ static int kswapd(void *p) | |||
2378 | */ | 2378 | */ |
2379 | if (!sleeping_prematurely(pgdat, order, remaining)) { | 2379 | if (!sleeping_prematurely(pgdat, order, remaining)) { |
2380 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2380 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2381 | restore_pgdat_percpu_threshold(pgdat); | ||
2381 | schedule(); | 2382 | schedule(); |
2383 | reduce_pgdat_percpu_threshold(pgdat); | ||
2382 | } else { | 2384 | } else { |
2383 | if (remaining) | 2385 | if (remaining) |
2384 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | 2386 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); |
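The kswapd sleep path now widens the per-cpu stat thresholds only for the duration of the sleep, so counter updates stay cheap while kswapd is idle but the NR_FREE_PAGES estimate is tight while it is reclaiming. A hedged sketch of that window; the helper name is invented and the surrounding kswapd loop is heavily simplified.

/*
 * Sketch of the sleep window: thresholds are restored (raised) just
 * before kswapd sleeps and reduced again as soon as it wakes, so the
 * zone_watermark_ok_safe() checks run against accurate counters.
 */
static void kswapd_try_to_sleep_sketch(pg_data_t *pgdat, int order)
{
	if (!sleeping_prematurely(pgdat, order, 0)) {
		restore_pgdat_percpu_threshold(pgdat);	/* cheap, drifty counters */
		schedule();
		reduce_pgdat_percpu_threshold(pgdat);	/* tight counters for reclaim */
	}
}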
@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
2417 | if (!populated_zone(zone)) | 2419 | if (!populated_zone(zone)) |
2418 | return; | 2420 | return; |
2419 | 2421 | ||
2420 | pgdat = zone->zone_pgdat; | 2422 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2421 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2422 | return; | 2423 | return; |
2424 | pgdat = zone->zone_pgdat; | ||
2423 | if (pgdat->kswapd_max_order < order) | 2425 | if (pgdat->kswapd_max_order < order) |
2424 | pgdat->kswapd_max_order = order; | 2426 | pgdat->kswapd_max_order = order; |
2425 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2426 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
2427 | return; | ||
2428 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2427 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2429 | return; | 2428 | return; |
2429 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2430 | return; | ||
2431 | |||
2432 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2430 | wake_up_interruptible(&pgdat->kswapd_wait); | 2433 | wake_up_interruptible(&pgdat->kswapd_wait); |
2431 | } | 2434 | } |
2432 | 2435 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 355a9e669aaa..4d7faebb9b70 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat); | |||
81 | 81 | ||
82 | #ifdef CONFIG_SMP | 82 | #ifdef CONFIG_SMP |
83 | 83 | ||
84 | static int calculate_pressure_threshold(struct zone *zone) | ||
85 | { | ||
86 | int threshold; | ||
87 | int watermark_distance; | ||
88 | |||
89 | /* | ||
90 | * As vmstats are not up to date, there is drift between the estimated | ||
91 | * and real values. For high thresholds and a high number of CPUs, it | ||
92 | * is possible for the min watermark to be breached while the estimated | ||
93 | * value looks fine. The pressure threshold is a reduced value such | ||
94 | * that even the maximum amount of drift will not accidentally breach | ||
95 | * the min watermark | ||
96 | */ | ||
97 | watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); | ||
98 | threshold = max(1, (int)(watermark_distance / num_online_cpus())); | ||
99 | |||
100 | /* | ||
101 | * Maximum threshold is 125 | ||
102 | */ | ||
103 | threshold = min(125, threshold); | ||
104 | |||
105 | return threshold; | ||
106 | } | ||
107 | |||
84 | static int calculate_threshold(struct zone *zone) | 108 | static int calculate_threshold(struct zone *zone) |
85 | { | 109 | { |
86 | int threshold; | 110 | int threshold; |
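calculate_pressure_threshold() bounds total per-cpu counter drift by the gap between the low and min watermarks divided across CPUs. The standalone snippet below illustrates the formula with made-up watermark values; it is plain userspace C, not kernel code.

#include <stdio.h>

/* threshold = clamp((low_wmark - min_wmark) / nr_cpus, 1, 125) */
static int pressure_threshold(long low_wmark, long min_wmark, int nr_cpus)
{
	int threshold = (int)((low_wmark - min_wmark) / nr_cpus);

	if (threshold < 1)
		threshold = 1;
	if (threshold > 125)
		threshold = 125;
	return threshold;
}

int main(void)
{
	/* hypothetical zone: low watermark 1536 pages, min watermark 1024 pages */
	printf("4 cpus:  %d\n", pressure_threshold(1536, 1024, 4));	/* 512/4 = 128, capped to 125 */
	printf("64 cpus: %d\n", pressure_threshold(1536, 1024, 64));	/* 512/64 = 8 */
	return 0;
}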
@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void) | |||
159 | } | 183 | } |
160 | } | 184 | } |
161 | 185 | ||
186 | void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) | ||
187 | { | ||
188 | struct zone *zone; | ||
189 | int cpu; | ||
190 | int threshold; | ||
191 | int i; | ||
192 | |||
193 | get_online_cpus(); | ||
194 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
195 | zone = &pgdat->node_zones[i]; | ||
196 | if (!zone->percpu_drift_mark) | ||
197 | continue; | ||
198 | |||
199 | threshold = calculate_pressure_threshold(zone); | ||
200 | for_each_online_cpu(cpu) | ||
201 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | ||
202 | = threshold; | ||
203 | } | ||
204 | put_online_cpus(); | ||
205 | } | ||
206 | |||
207 | void restore_pgdat_percpu_threshold(pg_data_t *pgdat) | ||
208 | { | ||
209 | struct zone *zone; | ||
210 | int cpu; | ||
211 | int threshold; | ||
212 | int i; | ||
213 | |||
214 | get_online_cpus(); | ||
215 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
216 | zone = &pgdat->node_zones[i]; | ||
217 | if (!zone->percpu_drift_mark) | ||
218 | continue; | ||
219 | |||
220 | threshold = calculate_threshold(zone); | ||
221 | for_each_online_cpu(cpu) | ||
222 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | ||
223 | = threshold; | ||
224 | } | ||
225 | put_online_cpus(); | ||
226 | } | ||
227 | |||
162 | /* | 228 | /* |
163 | * For use when we know that interrupts are disabled. | 229 | * For use when we know that interrupts are disabled. |
164 | */ | 230 | */ |
@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
826 | "\n scanned %lu" | 892 | "\n scanned %lu" |
827 | "\n spanned %lu" | 893 | "\n spanned %lu" |
828 | "\n present %lu", | 894 | "\n present %lu", |
829 | zone_nr_free_pages(zone), | 895 | zone_page_state(zone, NR_FREE_PAGES), |
830 | min_wmark_pages(zone), | 896 | min_wmark_pages(zone), |
831 | low_wmark_pages(zone), | 897 | low_wmark_pages(zone), |
832 | high_wmark_pages(zone), | 898 | high_wmark_pages(zone), |