author     Linus Torvalds <torvalds@linux-foundation.org>    2013-07-03 20:12:13 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-07-03 20:12:13 -0400
commit     7f0ef0267e20d62d45d527911a993b1e998f4968
tree       de51abc7da5903f59d83e23937f22420164c9477
parent     862f0012549110d6f2586bf54b52ed4540cbff3a
parent     9307c29524502c21f0e8a6d96d850b2f5bc0bd9a
Merge branch 'akpm' (updates from Andrew Morton)
Merge first patch-bomb from Andrew Morton:

 - various misc bits
 - I've been patchmonkeying ocfs2 for a while, as Joel and Mark have been
   distracted. There has been quite a bit of activity.
 - About half the MM queue
 - Some backlight bits
 - Various lib/ updates
 - checkpatch updates
 - zillions more little rtc patches
 - ptrace
 - signals
 - exec
 - procfs
 - rapidio
 - nbd
 - aoe
 - pps
 - memstick
 - tools/testing/selftests updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (445 commits)
  tools/testing/selftests: don't assume the x bit is set on scripts
  selftests: add .gitignore for kcmp
  selftests: fix clean target in kcmp Makefile
  selftests: add .gitignore for vm
  selftests: add hugetlbfstest
  self-test: fix make clean
  selftests: exit 1 on failure
  kernel/resource.c: remove the unneeded assignment in function __find_resource
  aio: fix wrong comment in aio_complete()
  drivers/w1/slaves/w1_ds2408.c: add magic sequence to disable P0 test mode
  drivers/memstick/host/r592.c: convert to module_pci_driver
  drivers/memstick/host/jmb38x_ms: convert to module_pci_driver
  pps-gpio: add device-tree binding and support
  drivers/pps/clients/pps-gpio.c: convert to module_platform_driver
  drivers/pps/clients/pps-gpio.c: convert to devm_* helpers
  drivers/parport/share.c: use kzalloc
  Documentation/accounting/getdelays.c: avoid strncpy in accounting tool
  aoe: update internal version number to v83
  aoe: update copyright date
  aoe: perform I/O completions in parallel
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  12
-rw-r--r--  mm/backing-dev.c     |   5
-rw-r--r--  mm/bootmem.c         |  39
-rw-r--r--  mm/huge_memory.c     |   2
-rw-r--r--  mm/hugetlb.c         |   4
-rw-r--r--  mm/memcontrol.c      |  97
-rw-r--r--  mm/memory-failure.c  |  22
-rw-r--r--  mm/memory.c          |  13
-rw-r--r--  mm/memory_hotplug.c  |  48
-rw-r--r--  mm/mm_init.c         |  47
-rw-r--r--  mm/mmap.c            |   2
-rw-r--r--  mm/mremap.c          |   2
-rw-r--r--  mm/nobootmem.c       |  35
-rw-r--r--  mm/nommu.c           |   6
-rw-r--r--  mm/page_alloc.c      | 294
-rw-r--r--  mm/page_io.c         |  50
-rw-r--r--  mm/rmap.c            |   7
-rw-r--r--  mm/sparse.c          |   3
-rw-r--r--  mm/swap.c            | 106
-rw-r--r--  mm/swapfile.c        |  55
-rw-r--r--  mm/vmalloc.c         | 103
-rw-r--r--  mm/vmscan.c          | 585
22 files changed, 1021 insertions(+), 516 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index f5e698e30d4a..7e28ecfa8aa4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -477,3 +477,15 @@ config FRONTSWAP
477 and swap data is stored as normal on the matching swap device. 477 and swap data is stored as normal on the matching swap device.
478 478
479 If unsure, say Y to enable frontswap. 479 If unsure, say Y to enable frontswap.
480
481config MEM_SOFT_DIRTY
482 bool "Track memory changes"
483 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
484 select PROC_PAGE_MONITOR
485 help
486	  This option enables memory changes tracking by introducing a
487	  soft-dirty bit on pte-s. This bit is set when someone writes
488	  into a page just as the regular dirty bit is, but unlike the
489	  latter it can be cleared by hand.
490
491 See Documentation/vm/soft-dirty.txt for more details.
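
The MEM_SOFT_DIRTY help text above is terse, so here is a hedged userspace sketch of how the feature is normally consumed, assuming the interface described in Documentation/vm/soft-dirty.txt (writing "4" to /proc/PID/clear_refs clears the bits; bit 55 of each /proc/PID/pagemap entry reports them). It is illustrative only and not part of this patch.

/* Hedged sketch: watch one page for writes via the soft-dirty interface. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int soft_dirty(const void *addr)
{
	uint64_t ent = 0;
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd >= 0) {
		/* one 64-bit entry per virtual page; bit 55 = soft-dirty */
		if (pread(fd, &ent, sizeof(ent),
			  ((uintptr_t)addr / psz) * sizeof(ent)) != sizeof(ent))
			ent = 0;
		close(fd);
	}
	return (ent >> 55) & 1;
}

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *buf = aligned_alloc(psz, psz);
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	buf[0] = 0;			/* fault the page in first */
	write(fd, "4", 1);		/* "4" clears all soft-dirty bits */
	close(fd);

	printf("before write: %d\n", soft_dirty(buf));
	buf[0] = 1;			/* dirty the page again */
	printf("after  write: %d\n", soft_dirty(buf));
	return 0;
}
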
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy);
515int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, 515int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
516 unsigned int cap) 516 unsigned int cap)
517{ 517{
518 char tmp[32];
519 int err; 518 int err;
520 519
521 bdi->name = name; 520 bdi->name = name;
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
524 if (err) 523 if (err)
525 return err; 524 return err;
526 525
527 sprintf(tmp, "%.28s%s", name, "-%d"); 526 err = bdi_register(bdi, NULL, "%.28s-%ld", name,
528 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); 527 atomic_long_inc_return(&bdi_seq));
529 if (err) { 528 if (err) {
530 bdi_destroy(bdi); 529 bdi_destroy(bdi);
531 return err; 530 return err;
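
The backing-dev.c hunk drops the on-stack tmp buffer and instead hands the format string and its arguments straight to bdi_register(), which renders the name once at its final destination. Below is a hedged userspace sketch of the same forwarding pattern; register_named() and register_with_fmt() are hypothetical stand-ins, not kernel APIs.

#include <stdarg.h>
#include <stdio.h>

/* Render the name exactly once, at the point that owns the buffer. */
static void register_with_fmt(const char *fmt, va_list args)
{
	char name[32];

	vsnprintf(name, sizeof(name), fmt, args);
	printf("registered: %s\n", name);
}

static void register_named(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	register_with_fmt(fmt, args);	/* forward fmt + args, no temp copy */
	va_end(args);
}

int main(void)
{
	/* mirrors bdi_register(bdi, NULL, "%.28s-%ld", name, seq) */
	register_named("%.28s-%ld", "example", 42L);
	return 0;
}
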
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
241 return count; 241 return count;
242} 242}
243 243
244static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) 244static int reset_managed_pages_done __initdata;
245
246static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
245{ 247{
246 struct zone *z; 248 struct zone *z;
247 249
248 /* 250 if (reset_managed_pages_done)
249 * In free_area_init_core(), highmem zone's managed_pages is set to 251 return;
250 * present_pages, and bootmem allocator doesn't allocate from highmem 252
251 * zones. So there's no need to recalculate managed_pages because all
252 * highmem pages will be managed by the buddy system. Here highmem
253 * zone also includes highmem movable zone.
254 */
255 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 253 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
256 if (!is_highmem(z)) 254 z->managed_pages = 0;
257 z->managed_pages = 0;
258} 255}
259 256
260/** 257void __init reset_all_zones_managed_pages(void)
261 * free_all_bootmem_node - release a node's free pages to the buddy allocator
262 * @pgdat: node to be released
263 *
264 * Returns the number of pages actually released.
265 */
266unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
267{ 258{
268 register_page_bootmem_info_node(pgdat); 259 struct pglist_data *pgdat;
269 reset_node_lowmem_managed_pages(pgdat); 260
270 return free_all_bootmem_core(pgdat->bdata); 261 for_each_online_pgdat(pgdat)
262 reset_node_managed_pages(pgdat);
263 reset_managed_pages_done = 1;
271} 264}
272 265
273/** 266/**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
279{ 272{
280 unsigned long total_pages = 0; 273 unsigned long total_pages = 0;
281 bootmem_data_t *bdata; 274 bootmem_data_t *bdata;
282 struct pglist_data *pgdat;
283 275
284 for_each_online_pgdat(pgdat) 276 reset_all_zones_managed_pages();
285 reset_node_lowmem_managed_pages(pgdat);
286 277
287 list_for_each_entry(bdata, &bdata_list, list) 278 list_for_each_entry(bdata, &bdata_list, list)
288 total_pages += free_all_bootmem_core(bdata); 279 total_pages += free_all_bootmem_core(bdata);
289 280
281 totalram_pages += total_pages;
282
290 return total_pages; 283 return total_pages;
291} 284}
292 285
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..d8b3b850150c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1429 if (ret == 1) { 1429 if (ret == 1) {
1430 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1430 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1431 VM_BUG_ON(!pmd_none(*new_pmd)); 1431 VM_BUG_ON(!pmd_none(*new_pmd));
1432 set_pmd_at(mm, new_addr, new_pmd, pmd); 1432 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1433 spin_unlock(&mm->page_table_lock); 1433 spin_unlock(&mm->page_table_lock);
1434 } 1434 }
1435out: 1435out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aed085ad11a8..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
319 319
320 hstate = hstate_vma(vma); 320 hstate = hstate_vma(vma);
321 321
322 return 1UL << (hstate->order + PAGE_SHIFT); 322 return 1UL << huge_page_shift(hstate);
323} 323}
324EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 324EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
325 325
@@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
1263 * side-effects, like CommitLimit going negative. 1263 * side-effects, like CommitLimit going negative.
1264 */ 1264 */
1265 if (h->order > (MAX_ORDER - 1)) 1265 if (h->order > (MAX_ORDER - 1))
1266 totalram_pages += 1 << h->order; 1266 adjust_managed_page_count(page, 1 << h->order);
1267 } 1267 }
1268} 1268}
1269 1269
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..2e851f453814 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,6 +1148,58 @@ skip_node:
1148 return NULL; 1148 return NULL;
1149} 1149}
1150 1150
1151static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1152{
1153 /*
1154 * When a group in the hierarchy below root is destroyed, the
1155 * hierarchy iterator can no longer be trusted since it might
1156 * have pointed to the destroyed group. Invalidate it.
1157 */
1158 atomic_inc(&root->dead_count);
1159}
1160
1161static struct mem_cgroup *
1162mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1163 struct mem_cgroup *root,
1164 int *sequence)
1165{
1166 struct mem_cgroup *position = NULL;
1167 /*
1168 * A cgroup destruction happens in two stages: offlining and
1169 * release. They are separated by a RCU grace period.
1170 *
1171 * If the iterator is valid, we may still race with an
1172 * offlining. The RCU lock ensures the object won't be
1173 * released, tryget will fail if we lost the race.
1174 */
1175 *sequence = atomic_read(&root->dead_count);
1176 if (iter->last_dead_count == *sequence) {
1177 smp_rmb();
1178 position = iter->last_visited;
1179 if (position && !css_tryget(&position->css))
1180 position = NULL;
1181 }
1182 return position;
1183}
1184
1185static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1186 struct mem_cgroup *last_visited,
1187 struct mem_cgroup *new_position,
1188 int sequence)
1189{
1190 if (last_visited)
1191 css_put(&last_visited->css);
1192 /*
1193 * We store the sequence count from the time @last_visited was
1194 * loaded successfully instead of rereading it here so that we
1195 * don't lose destruction events in between. We could have
1196 * raced with the destruction of @new_position after all.
1197 */
1198 iter->last_visited = new_position;
1199 smp_wmb();
1200 iter->last_dead_count = sequence;
1201}
1202
1151/** 1203/**
1152 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1204 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1153 * @root: hierarchy root 1205 * @root: hierarchy root
@@ -1171,7 +1223,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1171{ 1223{
1172 struct mem_cgroup *memcg = NULL; 1224 struct mem_cgroup *memcg = NULL;
1173 struct mem_cgroup *last_visited = NULL; 1225 struct mem_cgroup *last_visited = NULL;
1174 unsigned long uninitialized_var(dead_count);
1175 1226
1176 if (mem_cgroup_disabled()) 1227 if (mem_cgroup_disabled())
1177 return NULL; 1228 return NULL;
@@ -1191,6 +1242,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1191 rcu_read_lock(); 1242 rcu_read_lock();
1192 while (!memcg) { 1243 while (!memcg) {
1193 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1244 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1245 int uninitialized_var(seq);
1194 1246
1195 if (reclaim) { 1247 if (reclaim) {
1196 int nid = zone_to_nid(reclaim->zone); 1248 int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1256,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1204 goto out_unlock; 1256 goto out_unlock;
1205 } 1257 }
1206 1258
1207 /* 1259 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1208 * If the dead_count mismatches, a destruction
1209 * has happened or is happening concurrently.
1210 * If the dead_count matches, a destruction
1211 * might still happen concurrently, but since
1212 * we checked under RCU, that destruction
1213 * won't free the object until we release the
1214 * RCU reader lock. Thus, the dead_count
1215 * check verifies the pointer is still valid,
1216 * css_tryget() verifies the cgroup pointed to
1217 * is alive.
1218 */
1219 dead_count = atomic_read(&root->dead_count);
1220 if (dead_count == iter->last_dead_count) {
1221 smp_rmb();
1222 last_visited = iter->last_visited;
1223 if (last_visited &&
1224 !css_tryget(&last_visited->css))
1225 last_visited = NULL;
1226 }
1227 } 1260 }
1228 1261
1229 memcg = __mem_cgroup_iter_next(root, last_visited); 1262 memcg = __mem_cgroup_iter_next(root, last_visited);
1230 1263
1231 if (reclaim) { 1264 if (reclaim) {
1232 if (last_visited) 1265 mem_cgroup_iter_update(iter, last_visited, memcg, seq);
1233 css_put(&last_visited->css);
1234
1235 iter->last_visited = memcg;
1236 smp_wmb();
1237 iter->last_dead_count = dead_count;
1238 1266
1239 if (!memcg) 1267 if (!memcg)
1240 iter->generation++; 1268 iter->generation++;
@@ -1448,11 +1476,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1448 return ret; 1476 return ret;
1449} 1477}
1450 1478
1451int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1479bool task_in_mem_cgroup(struct task_struct *task,
1480 const struct mem_cgroup *memcg)
1452{ 1481{
1453 int ret;
1454 struct mem_cgroup *curr = NULL; 1482 struct mem_cgroup *curr = NULL;
1455 struct task_struct *p; 1483 struct task_struct *p;
1484 bool ret;
1456 1485
1457 p = find_lock_task_mm(task); 1486 p = find_lock_task_mm(task);
1458 if (p) { 1487 if (p) {
@@ -1464,14 +1493,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1464 * killer still needs to detect if they have already been oom 1493 * killer still needs to detect if they have already been oom
1465 * killed to prevent needlessly killing additional tasks. 1494 * killed to prevent needlessly killing additional tasks.
1466 */ 1495 */
1467 task_lock(task); 1496 rcu_read_lock();
1468 curr = mem_cgroup_from_task(task); 1497 curr = mem_cgroup_from_task(task);
1469 if (curr) 1498 if (curr)
1470 css_get(&curr->css); 1499 css_get(&curr->css);
1471 task_unlock(task); 1500 rcu_read_unlock();
1472 } 1501 }
1473 if (!curr) 1502 if (!curr)
1474 return 0; 1503 return false;
1475 /* 1504 /*
1476 * We should check use_hierarchy of "memcg" not "curr". Because checking 1505 * We should check use_hierarchy of "memcg" not "curr". Because checking
1477 * use_hierarchy of "curr" here make this function true if hierarchy is 1506 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -6317,14 +6346,14 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6317 struct mem_cgroup *parent = memcg; 6346 struct mem_cgroup *parent = memcg;
6318 6347
6319 while ((parent = parent_mem_cgroup(parent))) 6348 while ((parent = parent_mem_cgroup(parent)))
6320 atomic_inc(&parent->dead_count); 6349 mem_cgroup_iter_invalidate(parent);
6321 6350
6322 /* 6351 /*
6323 * if the root memcg is not hierarchical we have to check it 6352 * if the root memcg is not hierarchical we have to check it
6324 * explicitely. 6353 * explicitely.
6325 */ 6354 */
6326 if (!root_mem_cgroup->use_hierarchy) 6355 if (!root_mem_cgroup->use_hierarchy)
6327 atomic_inc(&root_mem_cgroup->dead_count); 6356 mem_cgroup_iter_invalidate(root_mem_cgroup);
6328} 6357}
6329 6358
6330static void mem_cgroup_css_offline(struct cgroup *cont) 6359static void mem_cgroup_css_offline(struct cgroup *cont)
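
The mem_cgroup_iter_load()/mem_cgroup_iter_update() helpers factored out above cache a pointer together with the dead_count it was stored under: the updater publishes the pointer before the count (smp_wmb), and a reader only trusts the cached pointer if the current count still matches (smp_rmb before the dereference). Below is a hedged userspace analogue of that publish-and-validate pattern, using C11 atomics; all names are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>

struct cache {
	void *last_visited;		/* cached position */
	atomic_int last_dead_count;	/* generation it was stored under */
};

static atomic_int dead_count;		/* bumped on every "destruction" */

static void *cache_load(struct cache *c, int *seq)
{
	*seq = atomic_load(&dead_count);
	/* acquire pairs with the release in cache_update() */
	if (atomic_load_explicit(&c->last_dead_count,
				 memory_order_acquire) == *seq)
		return c->last_visited;	/* still trusted */
	return NULL;			/* stale: restart from the root */
}

static void cache_update(struct cache *c, void *new_pos, int seq)
{
	c->last_visited = new_pos;
	/* release: the pointer is visible before the count validating it */
	atomic_store_explicit(&c->last_dead_count, seq, memory_order_release);
}

int main(void)
{
	struct cache c = { 0 };
	int seq, dummy;

	cache_update(&c, &dummy, atomic_load(&dead_count));
	printf("valid: %p\n", cache_load(&c, &seq));

	atomic_fetch_add(&dead_count, 1);	/* like dead_count++ on destroy */
	printf("stale: %p\n", cache_load(&c, &seq));
	return 0;
}
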
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..2c13aa7a0164 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1410 1410
1411 /* 1411 /*
1412 * Isolate the page, so that it doesn't get reallocated if it 1412 * Isolate the page, so that it doesn't get reallocated if it
1413 * was free. 1413 * was free. This flag should be kept set until the source page
1414 * is freed and PG_hwpoison on it is set.
1414 */ 1415 */
1415 set_migratetype_isolate(p, true); 1416 set_migratetype_isolate(p, true);
1416 /* 1417 /*
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1433 /* Not a free page */ 1434 /* Not a free page */
1434 ret = 1; 1435 ret = 1;
1435 } 1436 }
1436 unset_migratetype_isolate(p, MIGRATE_MOVABLE);
1437 unlock_memory_hotplug(); 1437 unlock_memory_hotplug();
1438 return ret; 1438 return ret;
1439} 1439}
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1494 atomic_long_add(1 << compound_trans_order(hpage), 1494 atomic_long_add(1 << compound_trans_order(hpage),
1495 &num_poisoned_pages); 1495 &num_poisoned_pages);
1496 } 1496 }
1497 /* keep elevated page count for bad page */
1498 return ret; 1497 return ret;
1499} 1498}
1500 1499
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags)
1559 atomic_long_inc(&num_poisoned_pages); 1558 atomic_long_inc(&num_poisoned_pages);
1560 } 1559 }
1561 } 1560 }
1562 /* keep elevated page count for bad page */ 1561 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1563 return ret; 1562 return ret;
1564} 1563}
1565 1564
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags)
1625 if (ret > 0) 1624 if (ret > 0)
1626 ret = -EIO; 1625 ret = -EIO;
1627 } else { 1626 } else {
1627 /*
1628 * After page migration succeeds, the source page can
1629 * be trapped in pagevec and actual freeing is delayed.
1630 * Freeing code works differently based on PG_hwpoison,
1631 * so there's a race. We need to make sure that the
1632 * source page should be freed back to buddy before
1633 * setting PG_hwpoison.
1634 */
1635 if (!is_free_buddy_page(page))
1636 lru_add_drain_all();
1637 if (!is_free_buddy_page(page))
1638 drain_all_pages();
1628 SetPageHWPoison(page); 1639 SetPageHWPoison(page);
1640 if (!is_free_buddy_page(page))
1641 pr_info("soft offline: %#lx: page leaked\n",
1642 pfn);
1629 atomic_long_inc(&num_poisoned_pages); 1643 atomic_long_inc(&num_poisoned_pages);
1630 } 1644 }
1631 } else { 1645 } else {
diff --git a/mm/memory.c b/mm/memory.c
index 95d0cce63583..b68812d682b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
82EXPORT_SYMBOL(mem_map); 82EXPORT_SYMBOL(mem_map);
83#endif 83#endif
84 84
85unsigned long num_physpages;
86/* 85/*
87 * A number of key systems in x86 including ioremap() rely on the assumption 86 * A number of key systems in x86 including ioremap() rely on the assumption
88 * that high_memory defines the upper bound on direct map memory, then end 87 * that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
92 */ 91 */
93void * high_memory; 92void * high_memory;
94 93
95EXPORT_SYMBOL(num_physpages);
96EXPORT_SYMBOL(high_memory); 94EXPORT_SYMBOL(high_memory);
97 95
98/* 96/*
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1101 spinlock_t *ptl; 1099 spinlock_t *ptl;
1102 pte_t *start_pte; 1100 pte_t *start_pte;
1103 pte_t *pte; 1101 pte_t *pte;
1102 unsigned long range_start = addr;
1104 1103
1105again: 1104again:
1106 init_rss_vec(rss); 1105 init_rss_vec(rss);
@@ -1206,12 +1205,14 @@ again:
1206 force_flush = 0; 1205 force_flush = 0;
1207 1206
1208#ifdef HAVE_GENERIC_MMU_GATHER 1207#ifdef HAVE_GENERIC_MMU_GATHER
1209 tlb->start = addr; 1208 tlb->start = range_start;
1210 tlb->end = end; 1209 tlb->end = addr;
1211#endif 1210#endif
1212 tlb_flush_mmu(tlb); 1211 tlb_flush_mmu(tlb);
1213 if (addr != end) 1212 if (addr != end) {
1213 range_start = addr;
1214 goto again; 1214 goto again;
1215 }
1215 } 1216 }
1216 1217
1217 return addr; 1218 return addr;
@@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
2904 details->first_index, details->last_index) { 2905 details->first_index, details->last_index) {
2905 2906
2906 vba = vma->vm_pgoff; 2907 vba = vma->vm_pgoff;
2907 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2908 vea = vba + vma_pages(vma) - 1;
2908 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2909 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2909 zba = details->first_index; 2910 zba = details->first_index;
2910 if (zba < vba) 2911 if (zba < vba)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 081b4d654ed6..f5ba127b2051 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
75 res->end = start + size - 1; 75 res->end = start + size - 1;
76 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 76 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
77 if (request_resource(&iomem_resource, res) < 0) { 77 if (request_resource(&iomem_resource, res) < 0) {
78 printk("System RAM resource %pR cannot be added\n", res); 78 pr_debug("System RAM resource %pR cannot be added\n", res);
79 kfree(res); 79 kfree(res);
80 res = NULL; 80 res = NULL;
81 } 81 }
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page,
101 atomic_inc(&page->_count); 101 atomic_inc(&page->_count);
102} 102}
103 103
104/* reference to __meminit __free_pages_bootmem is valid 104void put_page_bootmem(struct page *page)
105 * so use __ref to tell modpost not to generate a warning */
106void __ref put_page_bootmem(struct page *page)
107{ 105{
108 unsigned long type; 106 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
110 107
111 type = (unsigned long) page->lru.next; 108 type = (unsigned long) page->lru.next;
112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 109 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page)
116 ClearPagePrivate(page); 113 ClearPagePrivate(page);
117 set_page_private(page, 0); 114 set_page_private(page, 0);
118 INIT_LIST_HEAD(&page->lru); 115 INIT_LIST_HEAD(&page->lru);
119 116 free_reserved_page(page);
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
127 totalram_pages++;
128 } 117 }
129
130} 118}
131 119
132#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 120#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
309 /* can't move pfns which are higher than @z2 */ 297 /* can't move pfns which are higher than @z2 */
310 if (end_pfn > zone_end_pfn(z2)) 298 if (end_pfn > zone_end_pfn(z2))
311 goto out_fail; 299 goto out_fail;
312 /* the move out part mast at the left most of @z2 */ 300 /* the move out part must be at the left most of @z2 */
313 if (start_pfn > z2->zone_start_pfn) 301 if (start_pfn > z2->zone_start_pfn)
314 goto out_fail; 302 goto out_fail;
315 /* must included/overlap */ 303 /* must included/overlap */
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
775 763
776void __online_page_set_limits(struct page *page) 764void __online_page_set_limits(struct page *page)
777{ 765{
778 unsigned long pfn = page_to_pfn(page);
779
780 if (pfn >= num_physpages)
781 num_physpages = pfn + 1;
782} 766}
783EXPORT_SYMBOL_GPL(__online_page_set_limits); 767EXPORT_SYMBOL_GPL(__online_page_set_limits);
784 768
785void __online_page_increment_counters(struct page *page) 769void __online_page_increment_counters(struct page *page)
786{ 770{
787 totalram_pages++; 771 adjust_managed_page_count(page, 1);
788
789#ifdef CONFIG_HIGHMEM
790 if (PageHighMem(page))
791 totalhigh_pages++;
792#endif
793} 772}
794EXPORT_SYMBOL_GPL(__online_page_increment_counters); 773EXPORT_SYMBOL_GPL(__online_page_increment_counters);
795 774
796void __online_page_free(struct page *page) 775void __online_page_free(struct page *page)
797{ 776{
798 ClearPageReserved(page); 777 __free_reserved_page(page);
799 init_page_count(page);
800 __free_page(page);
801} 778}
802EXPORT_SYMBOL_GPL(__online_page_free); 779EXPORT_SYMBOL_GPL(__online_page_free);
803 780
@@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
918 895
919int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 896int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
920{ 897{
898 unsigned long flags;
921 unsigned long onlined_pages = 0; 899 unsigned long onlined_pages = 0;
922 struct zone *zone; 900 struct zone *zone;
923 int need_zonelists_rebuild = 0; 901 int need_zonelists_rebuild = 0;
@@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
994 return ret; 972 return ret;
995 } 973 }
996 974
997 zone->managed_pages += onlined_pages;
998 zone->present_pages += onlined_pages; 975 zone->present_pages += onlined_pages;
976
977 pgdat_resize_lock(zone->zone_pgdat, &flags);
999 zone->zone_pgdat->node_present_pages += onlined_pages; 978 zone->zone_pgdat->node_present_pages += onlined_pages;
979 pgdat_resize_unlock(zone->zone_pgdat, &flags);
980
1000 if (onlined_pages) { 981 if (onlined_pages) {
1001 node_states_set_node(zone_to_nid(zone), &arg); 982 node_states_set_node(zone_to_nid(zone), &arg);
1002 if (need_zonelists_rebuild) 983 if (need_zonelists_rebuild)
@@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1487 unsigned long pfn, nr_pages, expire; 1468 unsigned long pfn, nr_pages, expire;
1488 long offlined_pages; 1469 long offlined_pages;
1489 int ret, drain, retry_max, node; 1470 int ret, drain, retry_max, node;
1471 unsigned long flags;
1490 struct zone *zone; 1472 struct zone *zone;
1491 struct memory_notify arg; 1473 struct memory_notify arg;
1492 1474
@@ -1578,10 +1560,12 @@ repeat:
1578 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1560 /* reset pagetype flags and makes migrate type to be MOVABLE */
1579 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1561 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1580 /* removal success */ 1562 /* removal success */
1581 zone->managed_pages -= offlined_pages; 1563 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1582 zone->present_pages -= offlined_pages; 1564 zone->present_pages -= offlined_pages;
1565
1566 pgdat_resize_lock(zone->zone_pgdat, &flags);
1583 zone->zone_pgdat->node_present_pages -= offlined_pages; 1567 zone->zone_pgdat->node_present_pages -= offlined_pages;
1584 totalram_pages -= offlined_pages; 1568 pgdat_resize_unlock(zone->zone_pgdat, &flags);
1585 1569
1586 init_per_zone_wmark_min(); 1570 init_per_zone_wmark_min();
1587 1571
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/memory.h>
13#include <linux/notifier.h>
12#include "internal.h" 14#include "internal.h"
13 15
14#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
147struct kobject *mm_kobj; 149struct kobject *mm_kobj;
148EXPORT_SYMBOL_GPL(mm_kobj); 150EXPORT_SYMBOL_GPL(mm_kobj);
149 151
152#ifdef CONFIG_SMP
153s32 vm_committed_as_batch = 32;
154
155static void __meminit mm_compute_batch(void)
156{
157 u64 memsized_batch;
158 s32 nr = num_present_cpus();
159 s32 batch = max_t(s32, nr*2, 32);
160
161 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
162 memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
163
164 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
165}
166
167static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
168 unsigned long action, void *arg)
169{
170 switch (action) {
171 case MEM_ONLINE:
172 case MEM_OFFLINE:
173 mm_compute_batch();
174 default:
175 break;
176 }
177 return NOTIFY_OK;
178}
179
180static struct notifier_block compute_batch_nb __meminitdata = {
181 .notifier_call = mm_compute_batch_notifier,
182 .priority = IPC_CALLBACK_PRI, /* use lowest priority */
183};
184
185static int __init mm_compute_batch_init(void)
186{
187 mm_compute_batch();
188 register_hotmemory_notifier(&compute_batch_nb);
189
190 return 0;
191}
192
193__initcall(mm_compute_batch_init);
194
195#endif
196
150static int __init mm_sysfs_init(void) 197static int __init mm_sysfs_init(void)
151{ 198{
152 mm_kobj = kobject_create_and_add("mm", kernel_kobj); 199 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
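
mm_compute_batch() above sizes vm_committed_as_batch at roughly 0.4% of the per-CPU share of memory (one 256th), floored at max(2 * num_present_cpus(), 32) and capped at INT32_MAX. A hedged standalone re-computation with made-up machine sizes, just to make the arithmetic concrete:

#include <stdint.h>
#include <stdio.h>

static int32_t compute_batch(uint64_t totalram_pages, int32_t nr_cpus)
{
	int32_t floor = nr_cpus * 2 > 32 ? nr_cpus * 2 : 32;
	uint64_t memsized = (totalram_pages / nr_cpus) / 256;

	if (memsized > INT32_MAX)
		memsized = INT32_MAX;
	return memsized > (uint64_t)floor ? (int32_t)memsized : floor;
}

int main(void)
{
	/* 16 GiB of 4 KiB pages = 4194304 pages, 8 CPUs -> batch 2048 */
	printf("%d\n", (int)compute_batch(4194304ULL, 8));
	/* 32 MiB box (8192 pages), 2 CPUs -> the floor of 32 wins */
	printf("%d\n", (int)compute_batch(8192ULL, 2));
	return 0;
}
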
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e1842fad..8468ffd05bae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
955 if (is_mergeable_vma(vma, file, vm_flags) && 955 if (is_mergeable_vma(vma, file, vm_flags) &&
956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
957 pgoff_t vm_pglen; 957 pgoff_t vm_pglen;
958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 958 vm_pglen = vma_pages(vma);
959 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 959 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
960 return 1; 960 return 1;
961 } 961 }
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..3708655378e9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
126 continue; 126 continue;
127 pte = ptep_get_and_clear(mm, old_addr, old_pte); 127 pte = ptep_get_and_clear(mm, old_addr, old_pte);
128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
129 set_pte_at(mm, new_addr, new_pte, pte); 129 set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
130 } 130 }
131 131
132 arch_leave_lazy_mmu_mode(); 132 arch_leave_lazy_mmu_mode();
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2fc73b..61107cf55bb3 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) 140static int reset_managed_pages_done __initdata;
141
142static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
141{ 143{
142 struct zone *z; 144 struct zone *z;
143 145
144 /* 146 if (reset_managed_pages_done)
145 * In free_area_init_core(), highmem zone's managed_pages is set to 147 return;
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 148 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z)) 149 z->managed_pages = 0;
153 z->managed_pages = 0; 150}
151
152void __init reset_all_zones_managed_pages(void)
153{
154 struct pglist_data *pgdat;
155
156 for_each_online_pgdat(pgdat)
157 reset_node_managed_pages(pgdat);
158 reset_managed_pages_done = 1;
154} 159}
155 160
156/** 161/**
@@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
160 */ 165 */
161unsigned long __init free_all_bootmem(void) 166unsigned long __init free_all_bootmem(void)
162{ 167{
163 struct pglist_data *pgdat; 168 unsigned long pages;
164 169
165 for_each_online_pgdat(pgdat) 170 reset_all_zones_managed_pages();
166 reset_node_lowmem_managed_pages(pgdat);
167 171
168 /* 172 /*
169 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 173 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
170 * because in some case like Node0 doesn't have RAM installed 174 * because in some case like Node0 doesn't have RAM installed
171 * low ram will be on Node1 175 * low ram will be on Node1
172 */ 176 */
173 return free_low_memory_core_early(); 177 pages = free_low_memory_core_early();
178 totalram_pages += pages;
179
180 return pages;
174} 181}
175 182
176/** 183/**
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884dcd6e7..e44e6e0a125c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -56,7 +56,6 @@
56void *high_memory; 56void *high_memory;
57struct page *mem_map; 57struct page *mem_map;
58unsigned long max_mapnr; 58unsigned long max_mapnr;
59unsigned long num_physpages;
60unsigned long highest_memmap_pfn; 59unsigned long highest_memmap_pfn;
61struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
62int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void)
85EXPORT_SYMBOL_GPL(vm_memory_committed); 84EXPORT_SYMBOL_GPL(vm_memory_committed);
86 85
87EXPORT_SYMBOL(mem_map); 86EXPORT_SYMBOL(mem_map);
88EXPORT_SYMBOL(num_physpages);
89 87
90/* list of mapped, potentially shareable regions */ 88/* list of mapped, potentially shareable regions */
91static struct kmem_cache *vm_region_jar; 89static struct kmem_cache *vm_region_jar;
@@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
282 280
283long vread(char *buf, char *addr, unsigned long count) 281long vread(char *buf, char *addr, unsigned long count)
284{ 282{
283 /* Don't allow overflow */
284 if ((unsigned long) buf + count < count)
285 count = -(unsigned long) buf;
286
285 memcpy(buf, addr, count); 287 memcpy(buf, addr, count);
286 return count; 288 return count;
287} 289}
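
The vread() hunk clamps count so that buf + count cannot wrap past the top of the address space; in unsigned arithmetic, -(unsigned long)buf is exactly the number of bytes left before the pointer would wrap. A hedged standalone sketch of that clamp:

#include <stdio.h>

static unsigned long clamp_count(unsigned long buf, unsigned long count)
{
	if (buf + count < count)	/* the addition wrapped around */
		count = -buf;		/* bytes left to the top: 2^N - buf */
	return count;
}

int main(void)
{
	unsigned long near_top = ~0UL - 100;	/* 100 bytes below the top */

	printf("%lu\n", clamp_count(4096UL, 128UL));	/* untouched: 128 */
	printf("%lu\n", clamp_count(near_top, 4096UL));	/* clamped to 101 */
	return 0;
}
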
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..327516b7aee9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,10 +61,14 @@
61#include <linux/hugetlb.h> 61#include <linux/hugetlb.h>
62#include <linux/sched/rt.h> 62#include <linux/sched/rt.h>
63 63
64#include <asm/sections.h>
64#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
65#include <asm/div64.h> 66#include <asm/div64.h>
66#include "internal.h" 67#include "internal.h"
67 68
69/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
70static DEFINE_MUTEX(pcp_batch_high_lock);
71
68#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 72#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
69DEFINE_PER_CPU(int, numa_node); 73DEFINE_PER_CPU(int, numa_node);
70EXPORT_PER_CPU_SYMBOL(numa_node); 74EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
100}; 104};
101EXPORT_SYMBOL(node_states); 105EXPORT_SYMBOL(node_states);
102 106
107/* Protect totalram_pages and zone->managed_pages */
108static DEFINE_SPINLOCK(managed_page_count_lock);
109
103unsigned long totalram_pages __read_mostly; 110unsigned long totalram_pages __read_mostly;
104unsigned long totalreserve_pages __read_mostly; 111unsigned long totalreserve_pages __read_mostly;
105/* 112/*
@@ -739,14 +746,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
739 local_irq_restore(flags); 746 local_irq_restore(flags);
740} 747}
741 748
742/* 749void __init __free_pages_bootmem(struct page *page, unsigned int order)
743 * Read access to zone->managed_pages is safe because it's unsigned long,
744 * but we still need to serialize writers. Currently all callers of
745 * __free_pages_bootmem() except put_page_bootmem() should only be used
746 * at boot time. So for shorter boot time, we shift the burden to
747 * put_page_bootmem() to serialize writers.
748 */
749void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
750{ 750{
751 unsigned int nr_pages = 1 << order; 751 unsigned int nr_pages = 1 << order;
752 unsigned int loop; 752 unsigned int loop;
@@ -781,11 +781,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
781 set_page_refcounted(page); 781 set_page_refcounted(page);
782 set_pageblock_migratetype(page, MIGRATE_CMA); 782 set_pageblock_migratetype(page, MIGRATE_CMA);
783 __free_pages(page, pageblock_order); 783 __free_pages(page, pageblock_order);
784 totalram_pages += pageblock_nr_pages; 784 adjust_managed_page_count(page, pageblock_nr_pages);
785#ifdef CONFIG_HIGHMEM
786 if (PageHighMem(page))
787 totalhigh_pages += pageblock_nr_pages;
788#endif
789} 785}
790#endif 786#endif
791 787
@@ -1179,10 +1175,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1179{ 1175{
1180 unsigned long flags; 1176 unsigned long flags;
1181 int to_drain; 1177 int to_drain;
1178 unsigned long batch;
1182 1179
1183 local_irq_save(flags); 1180 local_irq_save(flags);
1184 if (pcp->count >= pcp->batch) 1181 batch = ACCESS_ONCE(pcp->batch);
1185 to_drain = pcp->batch; 1182 if (pcp->count >= batch)
1183 to_drain = batch;
1186 else 1184 else
1187 to_drain = pcp->count; 1185 to_drain = pcp->count;
1188 if (to_drain > 0) { 1186 if (to_drain > 0) {
@@ -1350,8 +1348,9 @@ void free_hot_cold_page(struct page *page, int cold)
1350 list_add(&page->lru, &pcp->lists[migratetype]); 1348 list_add(&page->lru, &pcp->lists[migratetype]);
1351 pcp->count++; 1349 pcp->count++;
1352 if (pcp->count >= pcp->high) { 1350 if (pcp->count >= pcp->high) {
1353 free_pcppages_bulk(zone, pcp->batch, pcp); 1351 unsigned long batch = ACCESS_ONCE(pcp->batch);
1354 pcp->count -= pcp->batch; 1352 free_pcppages_bulk(zone, batch, pcp);
1353 pcp->count -= batch;
1355 } 1354 }
1356 1355
1357out: 1356out:
@@ -2839,7 +2838,7 @@ EXPORT_SYMBOL(free_pages_exact);
2839 * nr_free_zone_pages() counts the number of pages which are beyond the 2838 * nr_free_zone_pages() counts the number of pages which are beyond the
2840 * high watermark within all zones at or below a given zone index. For each 2839 * high watermark within all zones at or below a given zone index. For each
2841 * zone, the number of pages is calculated as: 2840 * zone, the number of pages is calculated as:
2842 * present_pages - high_pages 2841 * managed_pages - high_pages
2843 */ 2842 */
2844static unsigned long nr_free_zone_pages(int offset) 2843static unsigned long nr_free_zone_pages(int offset)
2845{ 2844{
@@ -2906,9 +2905,13 @@ EXPORT_SYMBOL(si_meminfo);
2906#ifdef CONFIG_NUMA 2905#ifdef CONFIG_NUMA
2907void si_meminfo_node(struct sysinfo *val, int nid) 2906void si_meminfo_node(struct sysinfo *val, int nid)
2908{ 2907{
2908 int zone_type; /* needs to be signed */
2909 unsigned long managed_pages = 0;
2909 pg_data_t *pgdat = NODE_DATA(nid); 2910 pg_data_t *pgdat = NODE_DATA(nid);
2910 2911
2911 val->totalram = pgdat->node_present_pages; 2912 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
2913 managed_pages += pgdat->node_zones[zone_type].managed_pages;
2914 val->totalram = managed_pages;
2912 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2915 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2913#ifdef CONFIG_HIGHMEM 2916#ifdef CONFIG_HIGHMEM
2914 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 2917 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3250,18 +3253,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3250 static DEFINE_MUTEX(zl_order_mutex); 3253 static DEFINE_MUTEX(zl_order_mutex);
3251 3254
3252 mutex_lock(&zl_order_mutex); 3255 mutex_lock(&zl_order_mutex);
3253 if (write) 3256 if (write) {
3254 strcpy(saved_string, (char*)table->data); 3257 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3258 ret = -EINVAL;
3259 goto out;
3260 }
3261 strcpy(saved_string, (char *)table->data);
3262 }
3255 ret = proc_dostring(table, write, buffer, length, ppos); 3263 ret = proc_dostring(table, write, buffer, length, ppos);
3256 if (ret) 3264 if (ret)
3257 goto out; 3265 goto out;
3258 if (write) { 3266 if (write) {
3259 int oldval = user_zonelist_order; 3267 int oldval = user_zonelist_order;
3260 if (__parse_numa_zonelist_order((char*)table->data)) { 3268
3269 ret = __parse_numa_zonelist_order((char *)table->data);
3270 if (ret) {
3261 /* 3271 /*
3262 * bogus value. restore saved string 3272 * bogus value. restore saved string
3263 */ 3273 */
3264 strncpy((char*)table->data, saved_string, 3274 strncpy((char *)table->data, saved_string,
3265 NUMA_ZONELIST_ORDER_LEN); 3275 NUMA_ZONELIST_ORDER_LEN);
3266 user_zonelist_order = oldval; 3276 user_zonelist_order = oldval;
3267 } else if (oldval != user_zonelist_order) { 3277 } else if (oldval != user_zonelist_order) {
@@ -3425,8 +3435,8 @@ static int default_zonelist_order(void)
3425 z = &NODE_DATA(nid)->node_zones[zone_type]; 3435 z = &NODE_DATA(nid)->node_zones[zone_type];
3426 if (populated_zone(z)) { 3436 if (populated_zone(z)) {
3427 if (zone_type < ZONE_NORMAL) 3437 if (zone_type < ZONE_NORMAL)
3428 low_kmem_size += z->present_pages; 3438 low_kmem_size += z->managed_pages;
3429 total_size += z->present_pages; 3439 total_size += z->managed_pages;
3430 } else if (zone_type == ZONE_NORMAL) { 3440 } else if (zone_type == ZONE_NORMAL) {
3431 /* 3441 /*
3432 * If any node has only lowmem, then node order 3442 * If any node has only lowmem, then node order
@@ -3705,12 +3715,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3705 mminit_verify_zonelist(); 3715 mminit_verify_zonelist();
3706 cpuset_init_current_mems_allowed(); 3716 cpuset_init_current_mems_allowed();
3707 } else { 3717 } else {
3708 /* we have to stop all cpus to guarantee there is no user
3709 of zonelist */
3710#ifdef CONFIG_MEMORY_HOTPLUG 3718#ifdef CONFIG_MEMORY_HOTPLUG
3711 if (zone) 3719 if (zone)
3712 setup_zone_pageset(zone); 3720 setup_zone_pageset(zone);
3713#endif 3721#endif
3722 /* we have to stop all cpus to guarantee there is no user
3723 of zonelist */
3714 stop_machine(__build_all_zonelists, pgdat, NULL); 3724 stop_machine(__build_all_zonelists, pgdat, NULL);
3715 /* cpuset refresh routine should be here */ 3725 /* cpuset refresh routine should be here */
3716 } 3726 }
@@ -4032,7 +4042,40 @@ static int __meminit zone_batchsize(struct zone *zone)
4032#endif 4042#endif
4033} 4043}
4034 4044
4035static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4045/*
4046 * pcp->high and pcp->batch values are related and dependent on one another:
4047 * ->batch must never be higher than ->high.
4048 * The following function updates them in a safe manner without read side
4049 * locking.
4050 *
4051 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4052 * those fields changing asynchronously (according to the above rule).
4053 *
4054 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4055 * outside of boot time (or some other assurance that no concurrent updaters
4056 * exist).
4057 */
4058static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4059 unsigned long batch)
4060{
4061 /* start with a fail safe value for batch */
4062 pcp->batch = 1;
4063 smp_wmb();
4064
4065 /* Update high, then batch, in order */
4066 pcp->high = high;
4067 smp_wmb();
4068
4069 pcp->batch = batch;
4070}
4071
4072/* a companion to pageset_set_high() */
4073static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4074{
4075 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4076}
4077
4078static void pageset_init(struct per_cpu_pageset *p)
4036{ 4079{
4037 struct per_cpu_pages *pcp; 4080 struct per_cpu_pages *pcp;
4038 int migratetype; 4081 int migratetype;
@@ -4041,45 +4084,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4041 4084
4042 pcp = &p->pcp; 4085 pcp = &p->pcp;
4043 pcp->count = 0; 4086 pcp->count = 0;
4044 pcp->high = 6 * batch;
4045 pcp->batch = max(1UL, 1 * batch);
4046 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4087 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4047 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4088 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4048} 4089}
4049 4090
4091static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4092{
4093 pageset_init(p);
4094 pageset_set_batch(p, batch);
4095}
4096
4050/* 4097/*
4051 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 4098 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4052 * to the value high for the pageset p. 4099 * to the value high for the pageset p.
4053 */ 4100 */
4054 4101static void pageset_set_high(struct per_cpu_pageset *p,
4055static void setup_pagelist_highmark(struct per_cpu_pageset *p,
4056 unsigned long high) 4102 unsigned long high)
4057{ 4103{
4058 struct per_cpu_pages *pcp; 4104 unsigned long batch = max(1UL, high / 4);
4105 if ((high / 4) > (PAGE_SHIFT * 8))
4106 batch = PAGE_SHIFT * 8;
4059 4107
4060 pcp = &p->pcp; 4108 pageset_update(&p->pcp, high, batch);
4061 pcp->high = high;
4062 pcp->batch = max(1UL, high/4);
4063 if ((high/4) > (PAGE_SHIFT * 8))
4064 pcp->batch = PAGE_SHIFT * 8;
4065} 4109}
4066 4110
4067static void __meminit setup_zone_pageset(struct zone *zone) 4111static void __meminit pageset_set_high_and_batch(struct zone *zone,
4112 struct per_cpu_pageset *pcp)
4068{ 4113{
4069 int cpu; 4114 if (percpu_pagelist_fraction)
4070 4115 pageset_set_high(pcp,
4071 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4116 (zone->managed_pages /
4117 percpu_pagelist_fraction));
4118 else
4119 pageset_set_batch(pcp, zone_batchsize(zone));
4120}
4072 4121
4073 for_each_possible_cpu(cpu) { 4122static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4074 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4123{
4124 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4075 4125
4076 setup_pageset(pcp, zone_batchsize(zone)); 4126 pageset_init(pcp);
4127 pageset_set_high_and_batch(zone, pcp);
4128}
4077 4129
4078 if (percpu_pagelist_fraction) 4130static void __meminit setup_zone_pageset(struct zone *zone)
4079 setup_pagelist_highmark(pcp, 4131{
4080 (zone->managed_pages / 4132 int cpu;
4081 percpu_pagelist_fraction)); 4133 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4082 } 4134 for_each_possible_cpu(cpu)
4135 zone_pageset_init(zone, cpu);
4083} 4136}
4084 4137
4085/* 4138/*
@@ -5150,35 +5203,101 @@ early_param("movablecore", cmdline_parse_movablecore);
5150 5203
5151#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5204#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5152 5205
5153unsigned long free_reserved_area(unsigned long start, unsigned long end, 5206void adjust_managed_page_count(struct page *page, long count)
5154 int poison, char *s) 5207{
5208 spin_lock(&managed_page_count_lock);
5209 page_zone(page)->managed_pages += count;
5210 totalram_pages += count;
5211#ifdef CONFIG_HIGHMEM
5212 if (PageHighMem(page))
5213 totalhigh_pages += count;
5214#endif
5215 spin_unlock(&managed_page_count_lock);
5216}
5217EXPORT_SYMBOL(adjust_managed_page_count);
5218
5219unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5155{ 5220{
5156 unsigned long pages, pos; 5221 void *pos;
5222 unsigned long pages = 0;
5157 5223
5158 pos = start = PAGE_ALIGN(start); 5224 start = (void *)PAGE_ALIGN((unsigned long)start);
5159 end &= PAGE_MASK; 5225 end = (void *)((unsigned long)end & PAGE_MASK);
5160 for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { 5226 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5161 if (poison) 5227 if ((unsigned int)poison <= 0xFF)
5162 memset((void *)pos, poison, PAGE_SIZE); 5228 memset(pos, poison, PAGE_SIZE);
5163 free_reserved_page(virt_to_page((void *)pos)); 5229 free_reserved_page(virt_to_page(pos));
5164 } 5230 }
5165 5231
5166 if (pages && s) 5232 if (pages && s)
5167 pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", 5233 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5168 s, pages << (PAGE_SHIFT - 10), start, end); 5234 s, pages << (PAGE_SHIFT - 10), start, end);
5169 5235
5170 return pages; 5236 return pages;
5171} 5237}
5238EXPORT_SYMBOL(free_reserved_area);
5172 5239
5173#ifdef CONFIG_HIGHMEM 5240#ifdef CONFIG_HIGHMEM
5174void free_highmem_page(struct page *page) 5241void free_highmem_page(struct page *page)
5175{ 5242{
5176 __free_reserved_page(page); 5243 __free_reserved_page(page);
5177 totalram_pages++; 5244 totalram_pages++;
5245 page_zone(page)->managed_pages++;
5178 totalhigh_pages++; 5246 totalhigh_pages++;
5179} 5247}
5180#endif 5248#endif
5181 5249
5250
5251void __init mem_init_print_info(const char *str)
5252{
5253 unsigned long physpages, codesize, datasize, rosize, bss_size;
5254 unsigned long init_code_size, init_data_size;
5255
5256 physpages = get_num_physpages();
5257 codesize = _etext - _stext;
5258 datasize = _edata - _sdata;
5259 rosize = __end_rodata - __start_rodata;
5260 bss_size = __bss_stop - __bss_start;
5261 init_data_size = __init_end - __init_begin;
5262 init_code_size = _einittext - _sinittext;
5263
5264 /*
5265 * Detect special cases and adjust section sizes accordingly:
5266 * 1) .init.* may be embedded into .data sections
5267 * 2) .init.text.* may be out of [__init_begin, __init_end],
5268 * please refer to arch/tile/kernel/vmlinux.lds.S.
5269 * 3) .rodata.* may be embedded into .text or .data sections.
5270 */
5271#define adj_init_size(start, end, size, pos, adj) \
5272 if (start <= pos && pos < end && size > adj) \
5273 size -= adj;
5274
5275 adj_init_size(__init_begin, __init_end, init_data_size,
5276 _sinittext, init_code_size);
5277 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5278 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5279 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5280 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5281
5282#undef adj_init_size
5283
5284 printk("Memory: %luK/%luK available "
5285 "(%luK kernel code, %luK rwdata, %luK rodata, "
5286 "%luK init, %luK bss, %luK reserved"
5287#ifdef CONFIG_HIGHMEM
5288 ", %luK highmem"
5289#endif
5290 "%s%s)\n",
5291 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5292 codesize >> 10, datasize >> 10, rosize >> 10,
5293 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5294 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5295#ifdef CONFIG_HIGHMEM
5296 totalhigh_pages << (PAGE_SHIFT-10),
5297#endif
5298 str ? ", " : "", str ? str : "");
5299}
5300
5182/** 5301/**
5183 * set_dma_reserve - set the specified number of pages reserved in the first zone 5302 * set_dma_reserve - set the specified number of pages reserved in the first zone
5184 * @new_dma_reserve: The number of pages to mark reserved 5303 * @new_dma_reserve: The number of pages to mark reserved
@@ -5540,7 +5659,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5540 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5659 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
5541 * can have before it gets flushed back to buddy allocator. 5660 * can have before it gets flushed back to buddy allocator.
5542 */ 5661 */
5543
5544int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5662int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5545 void __user *buffer, size_t *length, loff_t *ppos) 5663 void __user *buffer, size_t *length, loff_t *ppos)
5546{ 5664{
@@ -5551,14 +5669,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5551 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5669 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5552 if (!write || (ret < 0)) 5670 if (!write || (ret < 0))
5553 return ret; 5671 return ret;
5672
5673 mutex_lock(&pcp_batch_high_lock);
5554 for_each_populated_zone(zone) { 5674 for_each_populated_zone(zone) {
5555 for_each_possible_cpu(cpu) { 5675 unsigned long high;
5556 unsigned long high; 5676 high = zone->managed_pages / percpu_pagelist_fraction;
5557 high = zone->managed_pages / percpu_pagelist_fraction; 5677 for_each_possible_cpu(cpu)
5558 setup_pagelist_highmark( 5678 pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
5559 per_cpu_ptr(zone->pageset, cpu), high); 5679 high);
5560 }
5561 } 5680 }
5681 mutex_unlock(&pcp_batch_high_lock);
5562 return 0; 5682 return 0;
5563} 5683}
5564 5684
@@ -6047,32 +6167,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
6047#endif 6167#endif
6048 6168
6049#ifdef CONFIG_MEMORY_HOTPLUG 6169#ifdef CONFIG_MEMORY_HOTPLUG
6050static int __meminit __zone_pcp_update(void *data) 6170/*
6051{ 6171 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6052 struct zone *zone = data; 6172 * page high values need to be recalculated.
6053 int cpu; 6173 */
6054 unsigned long batch = zone_batchsize(zone), flags;
6055
6056 for_each_possible_cpu(cpu) {
6057 struct per_cpu_pageset *pset;
6058 struct per_cpu_pages *pcp;
6059
6060 pset = per_cpu_ptr(zone->pageset, cpu);
6061 pcp = &pset->pcp;
6062
6063 local_irq_save(flags);
6064 if (pcp->count > 0)
6065 free_pcppages_bulk(zone, pcp->count, pcp);
6066 drain_zonestat(zone, pset);
6067 setup_pageset(pset, batch);
6068 local_irq_restore(flags);
6069 }
6070 return 0;
6071}
6072
6073void __meminit zone_pcp_update(struct zone *zone) 6174void __meminit zone_pcp_update(struct zone *zone)
6074{ 6175{
6075 stop_machine(__zone_pcp_update, zone, NULL); 6176 unsigned cpu;
6177 mutex_lock(&pcp_batch_high_lock);
6178 for_each_possible_cpu(cpu)
6179 pageset_set_high_and_batch(zone,
6180 per_cpu_ptr(zone->pageset, cpu));
6181 mutex_unlock(&pcp_batch_high_lock);
6076} 6182}
6077#endif 6183#endif
6078 6184
@@ -6142,6 +6248,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6142 list_del(&page->lru); 6248 list_del(&page->lru);
6143 rmv_page_order(page); 6249 rmv_page_order(page);
6144 zone->free_area[order].nr_free--; 6250 zone->free_area[order].nr_free--;
6251#ifdef CONFIG_HIGHMEM
6252 if (PageHighMem(page))
6253 totalhigh_pages -= 1 << order;
6254#endif
6145 for (i = 0; i < (1 << order); i++) 6255 for (i = 0; i < (1 << order); i++)
6146 SetPageReserved((page+i)); 6256 SetPageReserved((page+i));
6147 pfn += (1 << order); 6257 pfn += (1 << order);
diff --git a/mm/page_io.c b/mm/page_io.c
index a8a3ef45fed7..ba05b64e5d8d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h> 22#include <linux/frontswap.h>
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include <linux/blkdev.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25 26
26static struct bio *get_swap_bio(gfp_t gfp_flags, 27static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err)
80 imajor(bio->bi_bdev->bd_inode), 81 imajor(bio->bi_bdev->bd_inode),
81 iminor(bio->bi_bdev->bd_inode), 82 iminor(bio->bi_bdev->bd_inode),
82 (unsigned long long)bio->bi_sector); 83 (unsigned long long)bio->bi_sector);
83 } else { 84 goto out;
84 SetPageUptodate(page);
85 } 85 }
86
87 SetPageUptodate(page);
88
89 /*
90 * There is no guarantee that the page is in swap cache - the software
91 * suspend code (at least) uses end_swap_bio_read() against a non-
92 * swapcache page. So we must check PG_swapcache before proceeding with
93 * this optimization.
94 */
95 if (likely(PageSwapCache(page))) {
96 struct swap_info_struct *sis;
97
98 sis = page_swap_info(page);
99 if (sis->flags & SWP_BLKDEV) {
100 /*
101 * The swap subsystem performs lazy swap slot freeing,
102 * expecting that the page will be swapped out again.
103 * So we can avoid an unnecessary write if the page
104 * isn't redirtied.
105 * This is good for real swap storage because we can
106 * reduce unnecessary I/O and enhance wear-leveling
107 * if an SSD is used as the swap device.
108 * But if an in-memory swap device (e.g. zram) is used,
109 * this causes a duplicated copy between uncompressed
110 * data in VM-owned memory and compressed data in
111 * zram-owned memory. So let's free zram-owned memory
112 * and make the VM-owned decompressed page *dirty*,
113 * so the page should be swapped out somewhere again if
114 * we again wish to reclaim it.
115 */
116 struct gendisk *disk = sis->bdev->bd_disk;
117 if (disk->fops->swap_slot_free_notify) {
118 swp_entry_t entry;
119 unsigned long offset;
120
121 entry.val = page_private(page);
122 offset = swp_offset(entry);
123
124 SetPageDirty(page);
125 disk->fops->swap_slot_free_notify(sis->bdev,
126 offset);
127 }
128 }
129 }
130
131out:
86 unlock_page(page); 132 unlock_page(page);
87 bio_put(bio); 133 bio_put(bio);
88} 134}
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d6..e22ceeb6e5ec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page,
1093 else 1093 else
1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1095 __page_set_anon_rmap(page, vma, address, 1); 1095 __page_set_anon_rmap(page, vma, address, 1);
1096 if (!mlocked_vma_newpage(vma, page)) 1096 if (!mlocked_vma_newpage(vma, page)) {
1097 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1097 SetPageActive(page);
1098 else 1098 lru_cache_add(page);
1099 } else
1099 add_page_to_unevictable_list(page); 1100 add_page_to_unevictable_list(page);
1100} 1101}
1101 1102
diff --git a/mm/sparse.c b/mm/sparse.c
index 1c91f0d3f6ab..3194ec414728 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -481,6 +481,9 @@ void __init sparse_init(void)
481 struct page **map_map; 481 struct page **map_map;
482#endif 482#endif
483 483
484 /* see include/linux/mmzone.h 'struct mem_section' definition */
485 BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
486
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ 487 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order(); 488 set_pageblock_order();
486 489
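The sparse.c hunk adds a compile-time guard: BUILD_BUG_ON() breaks the build when its condition is true, and is_power_of_2() from <linux/log2.h> folds to a constant for a constant argument, so a struct mem_section whose size drifts away from a power of two (which the sparse-index arithmetic referenced in mmzone.h relies on) is caught at build time. A tiny sketch of the same idiom applied to an invented structure:

#include <linux/bug.h>
#include <linux/log2.h>

struct example {                        /* hypothetical: 4 longs, a power-of-two size */
        unsigned long a, b, c, d;
};

static inline void example_sanity_check(void)
{
        /* Fails to compile if sizeof(struct example) is not a power of two */
        BUILD_BUG_ON(!is_power_of_2(sizeof(struct example)));
}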
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71d6841..4a1d0d2c52fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,10 +34,13 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37#define CREATE_TRACE_POINTS
38#include <trace/events/pagemap.h>
39
37/* How many pages do we try to swap or page in/out together? */ 40/* How many pages do we try to swap or page in/out together? */
38int page_cluster; 41int page_cluster;
39 42
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
43 46
@@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
384 SetPageActive(page); 387 SetPageActive(page);
385 lru += LRU_ACTIVE; 388 lru += LRU_ACTIVE;
386 add_page_to_lru_list(page, lruvec, lru); 389 add_page_to_lru_list(page, lruvec, lru);
390 trace_mm_lru_activate(page, page_to_pfn(page));
387 391
388 __count_vm_event(PGACTIVATE); 392 __count_vm_event(PGACTIVATE);
389 update_page_reclaim_stat(lruvec, file, 1); 393 update_page_reclaim_stat(lruvec, file, 1);
@@ -428,6 +432,33 @@ void activate_page(struct page *page)
428} 432}
429#endif 433#endif
430 434
435static void __lru_cache_activate_page(struct page *page)
436{
437 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
438 int i;
439
440 /*
441 * Search backwards on the optimistic assumption that the page being
442 * activated has just been added to this pagevec. Note that only
443 * the local pagevec is examined as a !PageLRU page could be in the
444 * process of being released, reclaimed, migrated or on a remote
445 * pagevec that is currently being drained. Furthermore, marking
446 * a remote pagevec's page PageActive potentially hits a race where
447 * a page is marked PageActive just after it is added to the inactive
448 * list causing accounting errors and BUG_ON checks to trigger.
449 */
450 for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
451 struct page *pagevec_page = pvec->pages[i];
452
453 if (pagevec_page == page) {
454 SetPageActive(page);
455 break;
456 }
457 }
458
459 put_cpu_var(lru_add_pvec);
460}
461
431/* 462/*
432 * Mark a page as having seen activity. 463 * Mark a page as having seen activity.
433 * 464 *
@@ -438,8 +469,18 @@ void activate_page(struct page *page)
438void mark_page_accessed(struct page *page) 469void mark_page_accessed(struct page *page)
439{ 470{
440 if (!PageActive(page) && !PageUnevictable(page) && 471 if (!PageActive(page) && !PageUnevictable(page) &&
441 PageReferenced(page) && PageLRU(page)) { 472 PageReferenced(page)) {
442 activate_page(page); 473
474 /*
475 * If the page is on the LRU, queue it for activation via
476 * activate_page_pvecs. Otherwise, assume the page is on a
477 * pagevec, mark it active and it'll be moved to the active
478 * LRU on the next drain.
479 */
480 if (PageLRU(page))
481 activate_page(page);
482 else
483 __lru_cache_activate_page(page);
443 ClearPageReferenced(page); 484 ClearPageReferenced(page);
444 } else if (!PageReferenced(page)) { 485 } else if (!PageReferenced(page)) {
445 SetPageReferenced(page); 486 SetPageReferenced(page);
@@ -448,42 +489,37 @@ void mark_page_accessed(struct page *page)
448EXPORT_SYMBOL(mark_page_accessed); 489EXPORT_SYMBOL(mark_page_accessed);
449 490
450/* 491/*
451 * Order of operations is important: flush the pagevec when it's already 492 * Queue the page for addition to the LRU via pagevec. The decision on whether
452 * full, not when adding the last page, to make sure that last page is 493 * to add the page to the [in]active [file|anon] list is deferred until the
453 * not added to the LRU directly when passed to this function. Because 494 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
454 * mark_page_accessed() (called after this when writing) only activates 495 * to have the page added to the active list using mark_page_accessed().
455 * pages that are on the LRU, linear writes in subpage chunks would see
456 * every PAGEVEC_SIZE page activated, which is unexpected.
457 */ 496 */
458void __lru_cache_add(struct page *page, enum lru_list lru) 497void __lru_cache_add(struct page *page)
459{ 498{
460 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 499 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
461 500
462 page_cache_get(page); 501 page_cache_get(page);
463 if (!pagevec_space(pvec)) 502 if (!pagevec_space(pvec))
464 __pagevec_lru_add(pvec, lru); 503 __pagevec_lru_add(pvec);
465 pagevec_add(pvec, page); 504 pagevec_add(pvec, page);
466 put_cpu_var(lru_add_pvecs); 505 put_cpu_var(lru_add_pvec);
467} 506}
468EXPORT_SYMBOL(__lru_cache_add); 507EXPORT_SYMBOL(__lru_cache_add);
469 508
470/** 509/**
471 * lru_cache_add_lru - add a page to a page list 510 * lru_cache_add - add a page to a page list
472 * @page: the page to be added to the LRU. 511 * @page: the page to be added to the LRU.
473 * @lru: the LRU list to which the page is added.
474 */ 512 */
475void lru_cache_add_lru(struct page *page, enum lru_list lru) 513void lru_cache_add(struct page *page)
476{ 514{
477 if (PageActive(page)) { 515 if (PageActive(page)) {
478 VM_BUG_ON(PageUnevictable(page)); 516 VM_BUG_ON(PageUnevictable(page));
479 ClearPageActive(page);
480 } else if (PageUnevictable(page)) { 517 } else if (PageUnevictable(page)) {
481 VM_BUG_ON(PageActive(page)); 518 VM_BUG_ON(PageActive(page));
482 ClearPageUnevictable(page);
483 } 519 }
484 520
485 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); 521 VM_BUG_ON(PageLRU(page));
486 __lru_cache_add(page, lru); 522 __lru_cache_add(page);
487} 523}
488 524
489/** 525/**
@@ -583,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
583 */ 619 */
584void lru_add_drain_cpu(int cpu) 620void lru_add_drain_cpu(int cpu)
585{ 621{
586 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 622 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
587 struct pagevec *pvec;
588 int lru;
589 623
590 for_each_lru(lru) { 624 if (pagevec_count(pvec))
591 pvec = &pvecs[lru - LRU_BASE]; 625 __pagevec_lru_add(pvec);
592 if (pagevec_count(pvec))
593 __pagevec_lru_add(pvec, lru);
594 }
595 626
596 pvec = &per_cpu(lru_rotate_pvecs, cpu); 627 pvec = &per_cpu(lru_rotate_pvecs, cpu);
597 if (pagevec_count(pvec)) { 628 if (pagevec_count(pvec)) {
@@ -708,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold)
708 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 739 del_page_from_lru_list(page, lruvec, page_off_lru(page));
709 } 740 }
710 741
742 /* Clear Active bit in case of parallel mark_page_accessed */
743 ClearPageActive(page);
744
711 list_add(&page->lru, &pages_to_free); 745 list_add(&page->lru, &pages_to_free);
712 } 746 }
713 if (zone) 747 if (zone)
@@ -795,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
795static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, 829static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
796 void *arg) 830 void *arg)
797{ 831{
798 enum lru_list lru = (enum lru_list)arg; 832 int file = page_is_file_cache(page);
799 int file = is_file_lru(lru); 833 int active = PageActive(page);
800 int active = is_active_lru(lru); 834 enum lru_list lru = page_lru(page);
801 835
802 VM_BUG_ON(PageActive(page));
803 VM_BUG_ON(PageUnevictable(page)); 836 VM_BUG_ON(PageUnevictable(page));
804 VM_BUG_ON(PageLRU(page)); 837 VM_BUG_ON(PageLRU(page));
805 838
806 SetPageLRU(page); 839 SetPageLRU(page);
807 if (active)
808 SetPageActive(page);
809 add_page_to_lru_list(page, lruvec, lru); 840 add_page_to_lru_list(page, lruvec, lru);
810 update_page_reclaim_stat(lruvec, file, active); 841 update_page_reclaim_stat(lruvec, file, active);
842 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
811} 843}
812 844
813/* 845/*
814 * Add the passed pages to the LRU, then drop the caller's refcount 846 * Add the passed pages to the LRU, then drop the caller's refcount
815 * on them. Reinitialises the caller's pagevec. 847 * on them. Reinitialises the caller's pagevec.
816 */ 848 */
817void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 849void __pagevec_lru_add(struct pagevec *pvec)
818{ 850{
819 VM_BUG_ON(is_unevictable_lru(lru)); 851 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
820
821 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
822} 852}
823EXPORT_SYMBOL(__pagevec_lru_add); 853EXPORT_SYMBOL(__pagevec_lru_add);
824 854
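Taken together with the mm/rmap.c hunk earlier in this diff, the swap.c changes collapse the per-LRU lru_add_pvecs array into a single per-CPU pagevec and drop the lru argument from the add path: the destination list is derived from the page's own flags via page_lru() when the pagevec drains. A hedged sketch of the caller-side pattern this implies (the function name is invented):

#include <linux/mm.h>
#include <linux/swap.h>

/* Sketch only: queueing a freshly faulted anonymous page under the new API */
static void example_add_new_anon_page(struct page *page)
{
        SetPageActive(page);    /* activeness is now decided up front...       */
        lru_cache_add(page);    /* ...and the list is picked at drain time     */
}

Previously the same caller would have written lru_cache_add_lru(page, LRU_ACTIVE_ANON) and relied on __pagevec_lru_add_fn() to set PageActive on its behalf.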
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746af55b8455..36af6eeaa67e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 212 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 213 goto checks;
214 } 214 }
215 if (si->flags & SWP_DISCARDABLE) { 215 if (si->flags & SWP_PAGE_DISCARD) {
216 /* 216 /*
217 * Start range check on racing allocations, in case 217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on 218 * they overlap the cluster we eventually decide on
@@ -322,7 +322,7 @@ checks:
322 322
323 if (si->lowest_alloc) { 323 if (si->lowest_alloc) {
324 /* 324 /*
325 * Only set when SWP_DISCARDABLE, and there's a scan 325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed. 326 * for a free cluster in progress or just completed.
327 */ 327 */
328 if (found_free_cluster) { 328 if (found_free_cluster) {
@@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2016 return nr_extents; 2016 return nr_extents;
2017} 2017}
2018 2018
2019/*
2020 * Helper to sys_swapon determining if a given swap
2021 * backing device queue supports DISCARD operations.
2022 */
2023static bool swap_discardable(struct swap_info_struct *si)
2024{
2025 struct request_queue *q = bdev_get_queue(si->bdev);
2026
2027 if (!q || !blk_queue_discard(q))
2028 return false;
2029
2030 return true;
2031}
2032
2019SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 2033SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2020{ 2034{
2021 struct swap_info_struct *p; 2035 struct swap_info_struct *p;
@@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2123 p->flags |= SWP_SOLIDSTATE; 2137 p->flags |= SWP_SOLIDSTATE;
2124 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2138 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2125 } 2139 }
2126 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) 2140
2127 p->flags |= SWP_DISCARDABLE; 2141 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2142 /*
2143 * When discard is enabled for swap with no particular
2144 * policy flagged, we set all swap discard flags here in
2145 * order to sustain backward compatibility with older
2146 * swapon(8) releases.
2147 */
2148 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2149 SWP_PAGE_DISCARD);
2150
2151 /*
2152 * By flagging sys_swapon, a sysadmin can tell us to
2153 * either do single-time area discards only, or to just
2154 * perform discards for released swap page-clusters.
2155 * Now it's time to adjust the p->flags accordingly.
2156 */
2157 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2158 p->flags &= ~SWP_PAGE_DISCARD;
2159 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2160 p->flags &= ~SWP_AREA_DISCARD;
2161
2162 /* issue a swapon-time discard if it's still required */
2163 if (p->flags & SWP_AREA_DISCARD) {
2164 int err = discard_swap(p);
2165 if (unlikely(err))
2166 printk(KERN_ERR
2167 "swapon: discard_swap(%p): %d\n",
2168 p, err);
2169 }
2170 }
2128 } 2171 }
2129 2172
2130 mutex_lock(&swapon_mutex); 2173 mutex_lock(&swapon_mutex);
@@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2135 enable_swap_info(p, prio, swap_map, frontswap_map); 2178 enable_swap_info(p, prio, swap_map, frontswap_map);
2136 2179
2137 printk(KERN_INFO "Adding %uk swap on %s. " 2180 printk(KERN_INFO "Adding %uk swap on %s. "
2138 "Priority:%d extents:%d across:%lluk %s%s%s\n", 2181 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2139 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2182 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2140 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2183 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2141 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2184 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2142 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2185 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2186 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2187 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2143 (frontswap_map) ? "FS" : ""); 2188 (frontswap_map) ? "FS" : "");
2144 2189
2145 mutex_unlock(&swapon_mutex); 2190 mutex_unlock(&swapon_mutex);
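From user space, the effect of the swapfile.c changes above is that swapon(2) now understands two modifier bits on top of SWAP_FLAG_DISCARD. A hedged sketch of a caller asking for page-cluster discards only; the flag values are assumed to match include/uapi/linux/swap.h from this series (older libc headers may not define them), and the device path is a placeholder:

#include <stdio.h>
#include <sys/swap.h>

#ifndef SWAP_FLAG_DISCARD
#define SWAP_FLAG_DISCARD       0x10000 /* enable discard for swap */
#endif
#ifndef SWAP_FLAG_DISCARD_ONCE
#define SWAP_FLAG_DISCARD_ONCE  0x20000 /* discard the whole area at swapon time */
#endif
#ifndef SWAP_FLAG_DISCARD_PAGES
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard freed page-clusters at run time */
#endif

int main(void)
{
        /* Discard released page-clusters, skip the one-off area discard */
        if (swapon("/dev/example-swap", SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES))
                perror("swapon");
        return 0;
}

With neither modifier set, sys_swapon() keeps the old behaviour by enabling both SWP_AREA_DISCARD and SWP_PAGE_DISCARD, which is what the backward-compatibility comment in the hunk refers to.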
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d365724feb05..91a10472a39a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
292 va = rb_entry(n, struct vmap_area, rb_node); 292 va = rb_entry(n, struct vmap_area, rb_node);
293 if (addr < va->va_start) 293 if (addr < va->va_start)
294 n = n->rb_left; 294 n = n->rb_left;
295 else if (addr > va->va_start) 295 else if (addr >= va->va_end)
296 n = n->rb_right; 296 n = n->rb_right;
297 else 297 else
298 return va; 298 return va;
@@ -1322,13 +1322,6 @@ static void clear_vm_unlist(struct vm_struct *vm)
1322 vm->flags &= ~VM_UNLIST; 1322 vm->flags &= ~VM_UNLIST;
1323} 1323}
1324 1324
1325static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1326 unsigned long flags, const void *caller)
1327{
1328 setup_vmalloc_vm(vm, va, flags, caller);
1329 clear_vm_unlist(vm);
1330}
1331
1332static struct vm_struct *__get_vm_area_node(unsigned long size, 1325static struct vm_struct *__get_vm_area_node(unsigned long size,
1333 unsigned long align, unsigned long flags, unsigned long start, 1326 unsigned long align, unsigned long flags, unsigned long start,
1334 unsigned long end, int node, gfp_t gfp_mask, const void *caller) 1327 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
@@ -1337,16 +1330,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1337 struct vm_struct *area; 1330 struct vm_struct *area;
1338 1331
1339 BUG_ON(in_interrupt()); 1332 BUG_ON(in_interrupt());
1340 if (flags & VM_IOREMAP) { 1333 if (flags & VM_IOREMAP)
1341 int bit = fls(size); 1334 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
1342
1343 if (bit > IOREMAP_MAX_ORDER)
1344 bit = IOREMAP_MAX_ORDER;
1345 else if (bit < PAGE_SHIFT)
1346 bit = PAGE_SHIFT;
1347
1348 align = 1ul << bit;
1349 }
1350 1335
1351 size = PAGE_ALIGN(size); 1336 size = PAGE_ALIGN(size);
1352 if (unlikely(!size)) 1337 if (unlikely(!size))
@@ -1367,16 +1352,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1367 return NULL; 1352 return NULL;
1368 } 1353 }
1369 1354
1370 /* 1355 setup_vmalloc_vm(area, va, flags, caller);
1371 * When this function is called from __vmalloc_node_range,
1372 * we add VM_UNLIST flag to avoid accessing uninitialized
1373 * members of vm_struct such as pages and nr_pages fields.
1374 * They will be set later.
1375 */
1376 if (flags & VM_UNLIST)
1377 setup_vmalloc_vm(area, va, flags, caller);
1378 else
1379 insert_vmalloc_vm(area, va, flags, caller);
1380 1356
1381 return area; 1357 return area;
1382} 1358}
@@ -1476,10 +1452,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1476 if (!addr) 1452 if (!addr)
1477 return; 1453 return;
1478 1454
1479 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1455 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
1480 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1456 addr))
1481 return; 1457 return;
1482 }
1483 1458
1484 area = remove_vm_area(addr); 1459 area = remove_vm_area(addr);
1485 if (unlikely(!area)) { 1460 if (unlikely(!area)) {
@@ -2148,42 +2123,43 @@ finished:
2148} 2123}
2149 2124
2150/** 2125/**
2151 * remap_vmalloc_range - map vmalloc pages to userspace 2126 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2152 * @vma: vma to cover (map full range of vma) 2127 * @vma: vma to cover
2153 * @addr: vmalloc memory 2128 * @uaddr: target user address to start at
2154 * @pgoff: number of pages into addr before first page to map 2129 * @kaddr: virtual address of vmalloc kernel memory
2130 * @size: size of map area
2155 * 2131 *
2156 * Returns: 0 for success, -Exxx on failure 2132 * Returns: 0 for success, -Exxx on failure
2157 * 2133 *
2158 * This function checks that addr is a valid vmalloc'ed area, and 2134 * This function checks that @kaddr is a valid vmalloc'ed area,
2159 * that it is big enough to cover the vma. Will return failure if 2135 * and that it is big enough to cover the range starting at
2160 * that criteria isn't met. 2136 * @uaddr in @vma. Will return failure if that criteria isn't
2137 * met.
2161 * 2138 *
2162 * Similar to remap_pfn_range() (see mm/memory.c) 2139 * Similar to remap_pfn_range() (see mm/memory.c)
2163 */ 2140 */
2164int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2141int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2165 unsigned long pgoff) 2142 void *kaddr, unsigned long size)
2166{ 2143{
2167 struct vm_struct *area; 2144 struct vm_struct *area;
2168 unsigned long uaddr = vma->vm_start;
2169 unsigned long usize = vma->vm_end - vma->vm_start;
2170 2145
2171 if ((PAGE_SIZE-1) & (unsigned long)addr) 2146 size = PAGE_ALIGN(size);
2147
2148 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2172 return -EINVAL; 2149 return -EINVAL;
2173 2150
2174 area = find_vm_area(addr); 2151 area = find_vm_area(kaddr);
2175 if (!area) 2152 if (!area)
2176 return -EINVAL; 2153 return -EINVAL;
2177 2154
2178 if (!(area->flags & VM_USERMAP)) 2155 if (!(area->flags & VM_USERMAP))
2179 return -EINVAL; 2156 return -EINVAL;
2180 2157
2181 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 2158 if (kaddr + size > area->addr + area->size)
2182 return -EINVAL; 2159 return -EINVAL;
2183 2160
2184 addr += pgoff << PAGE_SHIFT;
2185 do { 2161 do {
2186 struct page *page = vmalloc_to_page(addr); 2162 struct page *page = vmalloc_to_page(kaddr);
2187 int ret; 2163 int ret;
2188 2164
2189 ret = vm_insert_page(vma, uaddr, page); 2165 ret = vm_insert_page(vma, uaddr, page);
@@ -2191,14 +2167,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2191 return ret; 2167 return ret;
2192 2168
2193 uaddr += PAGE_SIZE; 2169 uaddr += PAGE_SIZE;
2194 addr += PAGE_SIZE; 2170 kaddr += PAGE_SIZE;
2195 usize -= PAGE_SIZE; 2171 size -= PAGE_SIZE;
2196 } while (usize > 0); 2172 } while (size > 0);
2197 2173
2198 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 2174 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2199 2175
2200 return 0; 2176 return 0;
2201} 2177}
2178EXPORT_SYMBOL(remap_vmalloc_range_partial);
2179
2180/**
2181 * remap_vmalloc_range - map vmalloc pages to userspace
2182 * @vma: vma to cover (map full range of vma)
2183 * @addr: vmalloc memory
2184 * @pgoff: number of pages into addr before first page to map
2185 *
2186 * Returns: 0 for success, -Exxx on failure
2187 *
2188 * This function checks that addr is a valid vmalloc'ed area, and
2189 * that it is big enough to cover the vma. Will return failure if
2190 * that criteria isn't met.
2191 *
2192 * Similar to remap_pfn_range() (see mm/memory.c)
2193 */
2194int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2195 unsigned long pgoff)
2196{
2197 return remap_vmalloc_range_partial(vma, vma->vm_start,
2198 addr + (pgoff << PAGE_SHIFT),
2199 vma->vm_end - vma->vm_start);
2200}
2202EXPORT_SYMBOL(remap_vmalloc_range); 2201EXPORT_SYMBOL(remap_vmalloc_range);
2203 2202
2204/* 2203/*
@@ -2512,8 +2511,8 @@ found:
2512 2511
2513 /* insert all vm's */ 2512 /* insert all vm's */
2514 for (area = 0; area < nr_vms; area++) 2513 for (area = 0; area < nr_vms; area++)
2515 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2514 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2516 pcpu_get_vm_areas); 2515 pcpu_get_vm_areas);
2517 2516
2518 kfree(vas); 2517 kfree(vas);
2519 return vms; 2518 return vms;
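The vmalloc.c hunk splits the user-mapping logic out into remap_vmalloc_range_partial(), which maps an arbitrary page-aligned window of a vmalloc area rather than assuming the whole vma. A hedged sketch of a driver .mmap handler using it (the device structure and names are invented; the buffer is assumed to come from vmalloc_user() so VM_USERMAP is set on the area):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

struct exdev {                          /* hypothetical device state */
        void *buf;                      /* from vmalloc_user() */
        size_t buf_size;
};

static int exdev_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct exdev *dev = file->private_data;
        unsigned long len = vma->vm_end - vma->vm_start;
        unsigned long off = vma->vm_pgoff << PAGE_SHIFT;

        if (off >= dev->buf_size || len > dev->buf_size - off)
                return -EINVAL;

        /* Map just the requested window of the vmalloc area into the vma */
        return remap_vmalloc_range_partial(vma, vma->vm_start,
                                           dev->buf + off, len);
}

remap_vmalloc_range() itself becomes a thin wrapper over the partial variant, as the tail of the hunk shows.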
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..99b3ac7771ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
546void putback_lru_page(struct page *page) 546void putback_lru_page(struct page *page)
547{ 547{
548 int lru; 548 int lru;
549 int active = !!TestClearPageActive(page);
550 int was_unevictable = PageUnevictable(page); 549 int was_unevictable = PageUnevictable(page);
551 550
552 VM_BUG_ON(PageLRU(page)); 551 VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
561 * unevictable page on [in]active list. 560 * unevictable page on [in]active list.
562 * We know how to handle that. 561 * We know how to handle that.
563 */ 562 */
564 lru = active + page_lru_base_type(page); 563 lru = page_lru_base_type(page);
565 lru_cache_add_lru(page, lru); 564 lru_cache_add(page);
566 } else { 565 } else {
567 /* 566 /*
568 * Put unevictable pages directly on zone's unevictable 567 * Put unevictable pages directly on zone's unevictable
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page,
669 return PAGEREF_RECLAIM; 668 return PAGEREF_RECLAIM;
670} 669}
671 670
671/* Check if a page is dirty or under writeback */
672static void page_check_dirty_writeback(struct page *page,
673 bool *dirty, bool *writeback)
674{
675 struct address_space *mapping;
676
677 /*
678 * Anonymous pages are not handled by flushers and must be written
679 * from reclaim context. Do not stall reclaim based on them
680 */
681 if (!page_is_file_cache(page)) {
682 *dirty = false;
683 *writeback = false;
684 return;
685 }
686
687 /* By default assume that the page flags are accurate */
688 *dirty = PageDirty(page);
689 *writeback = PageWriteback(page);
690
691 /* Verify dirty/writeback state if the filesystem supports it */
692 if (!page_has_private(page))
693 return;
694
695 mapping = page_mapping(page);
696 if (mapping && mapping->a_ops->is_dirty_writeback)
697 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
698}
699
672/* 700/*
673 * shrink_page_list() returns the number of reclaimed pages 701 * shrink_page_list() returns the number of reclaimed pages
674 */ 702 */
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list,
677 struct scan_control *sc, 705 struct scan_control *sc,
678 enum ttu_flags ttu_flags, 706 enum ttu_flags ttu_flags,
679 unsigned long *ret_nr_dirty, 707 unsigned long *ret_nr_dirty,
708 unsigned long *ret_nr_unqueued_dirty,
709 unsigned long *ret_nr_congested,
680 unsigned long *ret_nr_writeback, 710 unsigned long *ret_nr_writeback,
711 unsigned long *ret_nr_immediate,
681 bool force_reclaim) 712 bool force_reclaim)
682{ 713{
683 LIST_HEAD(ret_pages); 714 LIST_HEAD(ret_pages);
684 LIST_HEAD(free_pages); 715 LIST_HEAD(free_pages);
685 int pgactivate = 0; 716 int pgactivate = 0;
717 unsigned long nr_unqueued_dirty = 0;
686 unsigned long nr_dirty = 0; 718 unsigned long nr_dirty = 0;
687 unsigned long nr_congested = 0; 719 unsigned long nr_congested = 0;
688 unsigned long nr_reclaimed = 0; 720 unsigned long nr_reclaimed = 0;
689 unsigned long nr_writeback = 0; 721 unsigned long nr_writeback = 0;
722 unsigned long nr_immediate = 0;
690 723
691 cond_resched(); 724 cond_resched();
692 725
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
696 struct page *page; 729 struct page *page;
697 int may_enter_fs; 730 int may_enter_fs;
698 enum page_references references = PAGEREF_RECLAIM_CLEAN; 731 enum page_references references = PAGEREF_RECLAIM_CLEAN;
732 bool dirty, writeback;
699 733
700 cond_resched(); 734 cond_resched();
701 735
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list,
723 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 757 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
724 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 758 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
725 759
760 /*
761 * The number of dirty pages determines if a zone is marked
762 * reclaim_congested which affects wait_iff_congested. kswapd
763 * will stall and start writing pages if the tail of the LRU
764 * is all dirty unqueued pages.
765 */
766 page_check_dirty_writeback(page, &dirty, &writeback);
767 if (dirty || writeback)
768 nr_dirty++;
769
770 if (dirty && !writeback)
771 nr_unqueued_dirty++;
772
773 /*
774 * Treat this page as congested if the underlying BDI is or if
775 * pages are cycling through the LRU so quickly that the
776 * pages marked for immediate reclaim are making it to the
777 * end of the LRU a second time.
778 */
779 mapping = page_mapping(page);
780 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
781 (writeback && PageReclaim(page)))
782 nr_congested++;
783
784 /*
785 * If a page at the tail of the LRU is under writeback, there
786 * are three cases to consider.
787 *
788 * 1) If reclaim is encountering an excessive number of pages
789 * under writeback and this page is both under writeback and
790 * PageReclaim then it indicates that pages are being queued
791 * for IO but are being recycled through the LRU before the
792 * IO can complete. Waiting on the page itself risks an
793 * indefinite stall if it is impossible to writeback the
794 * page due to IO error or disconnected storage so instead
795 * note that the LRU is being scanned too quickly and the
796 * caller can stall after page list has been processed.
797 *
798 * 2) Global reclaim encounters a page, memcg encounters a
799 * page that is not marked for immediate reclaim or
800 * the caller does not have __GFP_IO. In this case mark
801 * the page for immediate reclaim and continue scanning.
802 *
803 * __GFP_IO is checked because a loop driver thread might
804 * enter reclaim, and deadlock if it waits on a page for
805 * which it is needed to do the write (loop masks off
806 * __GFP_IO|__GFP_FS for this reason); but more thought
807 * would probably show more reasons.
808 *
809 * Don't require __GFP_FS, since we're not going into the
810 * FS, just waiting on its writeback completion. Worryingly,
811 * ext4 gfs2 and xfs allocate pages with
812 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
813 * may_enter_fs here is liable to OOM on them.
814 *
815 * 3) memcg encounters a page that is not already marked
816 * PageReclaim. memcg does not have any dirty pages
817 * throttling so we could easily OOM just because too many
818 * pages are in writeback and there is nothing else to
819 * reclaim. Wait for the writeback to complete.
820 */
726 if (PageWriteback(page)) { 821 if (PageWriteback(page)) {
727 /* 822 /* Case 1 above */
728 * memcg doesn't have any dirty pages throttling so we 823 if (current_is_kswapd() &&
729 * could easily OOM just because too many pages are in 824 PageReclaim(page) &&
730 * writeback and there is nothing else to reclaim. 825 zone_is_reclaim_writeback(zone)) {
731 * 826 nr_immediate++;
732 * Check __GFP_IO, certainly because a loop driver 827 goto keep_locked;
733 * thread might enter reclaim, and deadlock if it waits 828
734 * on a page for which it is needed to do the write 829 /* Case 2 above */
735 * (loop masks off __GFP_IO|__GFP_FS for this reason); 830 } else if (global_reclaim(sc) ||
736 * but more thought would probably show more reasons.
737 *
738 * Don't require __GFP_FS, since we're not going into
739 * the FS, just waiting on its writeback completion.
740 * Worryingly, ext4 gfs2 and xfs allocate pages with
741 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
742 * testing may_enter_fs here is liable to OOM on them.
743 */
744 if (global_reclaim(sc) ||
745 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { 831 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
746 /* 832 /*
747 * This is slightly racy - end_page_writeback() 833 * This is slightly racy - end_page_writeback()
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
756 */ 842 */
757 SetPageReclaim(page); 843 SetPageReclaim(page);
758 nr_writeback++; 844 nr_writeback++;
845
759 goto keep_locked; 846 goto keep_locked;
847
848 /* Case 3 above */
849 } else {
850 wait_on_page_writeback(page);
760 } 851 }
761 wait_on_page_writeback(page);
762 } 852 }
763 853
764 if (!force_reclaim) 854 if (!force_reclaim)
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
784 if (!add_to_swap(page, page_list)) 874 if (!add_to_swap(page, page_list))
785 goto activate_locked; 875 goto activate_locked;
786 may_enter_fs = 1; 876 may_enter_fs = 1;
787 }
788 877
789 mapping = page_mapping(page); 878 /* Adding to swap updated mapping */
879 mapping = page_mapping(page);
880 }
790 881
791 /* 882 /*
792 * The page is mapped into the page tables of one or more 883 * The page is mapped into the page tables of one or more
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 } 897 }
807 898
808 if (PageDirty(page)) { 899 if (PageDirty(page)) {
809 nr_dirty++;
810
811 /* 900 /*
812 * Only kswapd can writeback filesystem pages to 901 * Only kswapd can writeback filesystem pages to
813 * avoid risk of stack overflow but do not writeback 902 * avoid risk of stack overflow but only writeback
814 * unless under significant pressure. 903 * if many dirty pages have been encountered.
815 */ 904 */
816 if (page_is_file_cache(page) && 905 if (page_is_file_cache(page) &&
817 (!current_is_kswapd() || 906 (!current_is_kswapd() ||
818 sc->priority >= DEF_PRIORITY - 2)) { 907 !zone_is_reclaim_dirty(zone))) {
819 /* 908 /*
820 * Immediately reclaim when written back. 909 * Immediately reclaim when written back.
821 * Similar in principle to deactivate_page() 910 * Similar in principle to deactivate_page()
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
838 /* Page is dirty, try to write it out here */ 927 /* Page is dirty, try to write it out here */
839 switch (pageout(page, mapping, sc)) { 928 switch (pageout(page, mapping, sc)) {
840 case PAGE_KEEP: 929 case PAGE_KEEP:
841 nr_congested++;
842 goto keep_locked; 930 goto keep_locked;
843 case PAGE_ACTIVATE: 931 case PAGE_ACTIVATE:
844 goto activate_locked; 932 goto activate_locked;
@@ -946,22 +1034,16 @@ keep:
946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1034 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
947 } 1035 }
948 1036
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
958 free_hot_cold_page_list(&free_pages, 1); 1037 free_hot_cold_page_list(&free_pages, 1);
959 1038
960 list_splice(&ret_pages, page_list); 1039 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1040 count_vm_events(PGACTIVATE, pgactivate);
962 mem_cgroup_uncharge_end(); 1041 mem_cgroup_uncharge_end();
963 *ret_nr_dirty += nr_dirty; 1042 *ret_nr_dirty += nr_dirty;
1043 *ret_nr_congested += nr_congested;
1044 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
964 *ret_nr_writeback += nr_writeback; 1045 *ret_nr_writeback += nr_writeback;
1046 *ret_nr_immediate += nr_immediate;
965 return nr_reclaimed; 1047 return nr_reclaimed;
966} 1048}
967 1049
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
973 .priority = DEF_PRIORITY, 1055 .priority = DEF_PRIORITY,
974 .may_unmap = 1, 1056 .may_unmap = 1,
975 }; 1057 };
976 unsigned long ret, dummy1, dummy2; 1058 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
977 struct page *page, *next; 1059 struct page *page, *next;
978 LIST_HEAD(clean_pages); 1060 LIST_HEAD(clean_pages);
979 1061
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
985 } 1067 }
986 1068
987 ret = shrink_page_list(&clean_pages, zone, &sc, 1069 ret = shrink_page_list(&clean_pages, zone, &sc,
988 TTU_UNMAP|TTU_IGNORE_ACCESS, 1070 TTU_UNMAP|TTU_IGNORE_ACCESS,
989 &dummy1, &dummy2, true); 1071 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
990 list_splice(&clean_pages, page_list); 1072 list_splice(&clean_pages, page_list);
991 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1073 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
992 return ret; 1074 return ret;
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1281 unsigned long nr_reclaimed = 0; 1363 unsigned long nr_reclaimed = 0;
1282 unsigned long nr_taken; 1364 unsigned long nr_taken;
1283 unsigned long nr_dirty = 0; 1365 unsigned long nr_dirty = 0;
1366 unsigned long nr_congested = 0;
1367 unsigned long nr_unqueued_dirty = 0;
1284 unsigned long nr_writeback = 0; 1368 unsigned long nr_writeback = 0;
1369 unsigned long nr_immediate = 0;
1285 isolate_mode_t isolate_mode = 0; 1370 isolate_mode_t isolate_mode = 0;
1286 int file = is_file_lru(lru); 1371 int file = is_file_lru(lru);
1287 struct zone *zone = lruvec_zone(lruvec); 1372 struct zone *zone = lruvec_zone(lruvec);
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1323 return 0; 1408 return 0;
1324 1409
1325 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1410 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1326 &nr_dirty, &nr_writeback, false); 1411 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1412 &nr_writeback, &nr_immediate,
1413 false);
1327 1414
1328 spin_lock_irq(&zone->lru_lock); 1415 spin_lock_irq(&zone->lru_lock);
1329 1416
@@ -1357,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1357 * same way balance_dirty_pages() manages. 1444 * same way balance_dirty_pages() manages.
1358 * 1445 *
1359 * This scales the number of dirty pages that must be under writeback 1446 * This scales the number of dirty pages that must be under writeback
1360 * before throttling depending on priority. It is a simple backoff 1447 * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff
1361 * function that has the most effect in the range DEF_PRIORITY to 1448 * function that has the most effect in the range DEF_PRIORITY to
1362 * DEF_PRIORITY-2 which is the priority reclaim is considered to be 1449 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
1363 * in trouble. 1450 * in trouble.
@@ -1368,9 +1455,53 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1368 * ... 1455 * ...
1369 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any 1456 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1370 * isolated page is PageWriteback 1457 * isolated page is PageWriteback
1458 *
1459 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1460 * of pages under writeback that are flagged for immediate reclaim and stall if any
1461 * are encountered in the nr_immediate check below.
1371 */ 1462 */
1372 if (nr_writeback && nr_writeback >= 1463 if (nr_writeback && nr_writeback >=
1373 (nr_taken >> (DEF_PRIORITY - sc->priority))) 1464 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1465 zone_set_flag(zone, ZONE_WRITEBACK);
1466
1467 /*
1468 * memcg will stall in page writeback so only consider forcibly
1469 * stalling for global reclaim
1470 */
1471 if (global_reclaim(sc)) {
1472 /*
1473 * Tag a zone as congested if all the dirty pages scanned were
1474 * backed by a congested BDI and wait_iff_congested will stall.
1475 */
1476 if (nr_dirty && nr_dirty == nr_congested)
1477 zone_set_flag(zone, ZONE_CONGESTED);
1478
1479 /*
1480 * If dirty pages are scanned that are not queued for IO, it
1481 * implies that flushers are not keeping up. In this case, flag
1482 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1483 * pages from reclaim context. It will forcibly stall in the
1484 * next check.
1485 */
1486 if (nr_unqueued_dirty == nr_taken)
1487 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1488
1489 /*
1490 * In addition, if kswapd scans pages marked for
1491 * immediate reclaim and under writeback (nr_immediate), it
1492 * implies that pages are cycling through the LRU faster than
1493 * they are written so also forcibly stall.
1494 */
1495 if (nr_unqueued_dirty == nr_taken || nr_immediate)
1496 congestion_wait(BLK_RW_ASYNC, HZ/10);
1497 }
1498
1499 /*
1500 * Stall direct reclaim for IO completions if underlying BDIs or zone
1501 * is congested. Allow kswapd to continue until it starts encountering
1502 * unqueued dirty pages or cycling through the LRU too quickly.
1503 */
1504 if (!sc->hibernation_mode && !current_is_kswapd())
1374 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1505 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1375 1506
1376 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1507 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
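To make the ZONE_WRITEBACK backoff above concrete, here is the threshold from the nr_writeback check worked through for a SWAP_CLUSTER_MAX batch (DEF_PRIORITY is 12; the helper name is invented purely for illustration):

/* Pages of a 32-page isolated batch that must be under writeback to flag the zone */
static unsigned long writeback_threshold(unsigned long nr_taken, int priority)
{
        return nr_taken >> (12 - priority);     /* DEF_PRIORITY - sc->priority */
}

/*
 * writeback_threshold(32, 12) == 32 -> every isolated page must be under writeback
 * writeback_threshold(32, 10) ==  8 -> 25% of the batch
 * writeback_threshold(32,  6) ==  0 -> any page under writeback is enough
 */

which matches the DEF_PRIORITY-6 note in the comment block just above the check.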
@@ -1822,17 +1953,25 @@ out:
1822static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 1953static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1823{ 1954{
1824 unsigned long nr[NR_LRU_LISTS]; 1955 unsigned long nr[NR_LRU_LISTS];
1956 unsigned long targets[NR_LRU_LISTS];
1825 unsigned long nr_to_scan; 1957 unsigned long nr_to_scan;
1826 enum lru_list lru; 1958 enum lru_list lru;
1827 unsigned long nr_reclaimed = 0; 1959 unsigned long nr_reclaimed = 0;
1828 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1960 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1829 struct blk_plug plug; 1961 struct blk_plug plug;
1962 bool scan_adjusted = false;
1830 1963
1831 get_scan_count(lruvec, sc, nr); 1964 get_scan_count(lruvec, sc, nr);
1832 1965
1966 /* Record the original scan target for proportional adjustments later */
1967 memcpy(targets, nr, sizeof(nr));
1968
1833 blk_start_plug(&plug); 1969 blk_start_plug(&plug);
1834 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1970 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1835 nr[LRU_INACTIVE_FILE]) { 1971 nr[LRU_INACTIVE_FILE]) {
1972 unsigned long nr_anon, nr_file, percentage;
1973 unsigned long nr_scanned;
1974
1836 for_each_evictable_lru(lru) { 1975 for_each_evictable_lru(lru) {
1837 if (nr[lru]) { 1976 if (nr[lru]) {
1838 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 1977 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
@@ -1842,17 +1981,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1842 lruvec, sc); 1981 lruvec, sc);
1843 } 1982 }
1844 } 1983 }
1984
1985 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
1986 continue;
1987
1845 /* 1988 /*
1846 * On large memory systems, scan >> priority can become 1989 * For global direct reclaim, reclaim only the number of pages
1847 * really large. This is fine for the starting priority; 1990 * requested. Less care is taken to scan proportionally as it
1848 * we want to put equal scanning pressure on each zone. 1991 * is more important to minimise direct reclaim stall latency
1849 * However, if the VM has a harder time of freeing pages, 1992 * than it is to properly age the LRU lists.
1850 * with multiple processes reclaiming pages, the total
1851 * freeing target can get unreasonably large.
1852 */ 1993 */
1853 if (nr_reclaimed >= nr_to_reclaim && 1994 if (global_reclaim(sc) && !current_is_kswapd())
1854 sc->priority < DEF_PRIORITY)
1855 break; 1995 break;
1996
1997 /*
1998 * For kswapd and memcg, reclaim at least the number of pages
1999 * requested. Ensure that the anon and file LRUs shrink
2000 * proportionally what was requested by get_scan_count(). We
2001 * stop reclaiming one LRU and reduce the amount scanning
2002 * proportional to the original scan target.
2003 */
2004 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2005 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2006
2007 if (nr_file > nr_anon) {
2008 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2009 targets[LRU_ACTIVE_ANON] + 1;
2010 lru = LRU_BASE;
2011 percentage = nr_anon * 100 / scan_target;
2012 } else {
2013 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2014 targets[LRU_ACTIVE_FILE] + 1;
2015 lru = LRU_FILE;
2016 percentage = nr_file * 100 / scan_target;
2017 }
2018
2019 /* Stop scanning the smaller of the LRU */
2020 nr[lru] = 0;
2021 nr[lru + LRU_ACTIVE] = 0;
2022
2023 /*
2024 * Recalculate the other LRU scan count based on its original
2025 * scan target and the percentage scanning already complete
2026 */
2027 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2028 nr_scanned = targets[lru] - nr[lru];
2029 nr[lru] = targets[lru] * (100 - percentage) / 100;
2030 nr[lru] -= min(nr[lru], nr_scanned);
2031
2032 lru += LRU_ACTIVE;
2033 nr_scanned = targets[lru] - nr[lru];
2034 nr[lru] = targets[lru] * (100 - percentage) / 100;
2035 nr[lru] -= min(nr[lru], nr_scanned);
2036
2037 scan_adjusted = true;
1856 } 2038 }
1857 blk_finish_plug(&plug); 2039 blk_finish_plug(&plug);
1858 sc->nr_reclaimed += nr_reclaimed; 2040 sc->nr_reclaimed += nr_reclaimed;
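The proportional adjustment in shrink_lruvec() is easiest to follow with numbers. A simplified worked example (one anon and one file target, figures invented, active and inactive lists lumped together):

/*
 *   targets from get_scan_count(): anon = 100, file = 400
 *   remaining when nr_to_reclaim is met: anon = 40, file = 340
 *
 *   anon is the smaller LRU, so:
 *     percentage = 40 * 100 / (100 + 1)   = 39   (~39% of the anon target left)
 *     nr[anon]   = 0                             (stop scanning anon)
 *     nr_scanned = 400 - 340              = 60   (file pages already scanned)
 *     nr[file]   = 400 * (100 - 39) / 100 = 244  (file work matching anon's ~61% progress)
 *     nr[file]  -= min(244, 60), i.e.     = 184  (minus what file already did)
 *
 * File scanning continues for 184 more pages, so roughly 61% of both the anon
 * and file targets end up scanned, preserving the ratio get_scan_count()
 * asked for instead of stopping both lists dead once enough was reclaimed.
 */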
@@ -2222,17 +2404,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2222 WB_REASON_TRY_TO_FREE_PAGES); 2404 WB_REASON_TRY_TO_FREE_PAGES);
2223 sc->may_writepage = 1; 2405 sc->may_writepage = 1;
2224 } 2406 }
2225
2226 /* Take a nap, wait for some writeback to complete */
2227 if (!sc->hibernation_mode && sc->nr_scanned &&
2228 sc->priority < DEF_PRIORITY - 2) {
2229 struct zone *preferred_zone;
2230
2231 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2232 &cpuset_current_mems_allowed,
2233 &preferred_zone);
2234 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2235 }
2236 } while (--sc->priority >= 0); 2407 } while (--sc->priority >= 0);
2237 2408
2238out: 2409out:
@@ -2601,6 +2772,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2601} 2772}
2602 2773
2603/* 2774/*
2775 * kswapd shrinks the zone by the number of pages required to reach
2776 * the high watermark.
2777 *
2778 * Returns true if kswapd scanned at least the requested number of pages to
2779 * reclaim or if the lack of progress was due to pages under writeback.
2780 * This is used to determine if the scanning priority needs to be raised.
2781 */
2782static bool kswapd_shrink_zone(struct zone *zone,
2783 int classzone_idx,
2784 struct scan_control *sc,
2785 unsigned long lru_pages,
2786 unsigned long *nr_attempted)
2787{
2788 unsigned long nr_slab;
2789 int testorder = sc->order;
2790 unsigned long balance_gap;
2791 struct reclaim_state *reclaim_state = current->reclaim_state;
2792 struct shrink_control shrink = {
2793 .gfp_mask = sc->gfp_mask,
2794 };
2795 bool lowmem_pressure;
2796
2797 /* Reclaim above the high watermark. */
2798 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2799
2800 /*
2801 * Kswapd reclaims only single pages with compaction enabled. Trying
2802 * too hard to reclaim until contiguous free pages have become
2803 * available can hurt performance by evicting too much useful data
2804 * from memory. Do not reclaim more than needed for compaction.
2805 */
2806 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2807 compaction_suitable(zone, sc->order) !=
2808 COMPACT_SKIPPED)
2809 testorder = 0;
2810
2811 /*
2812 * We put equal pressure on every zone, unless one zone has way too
2813 * many pages free already. The "too many pages" is defined as the
2814 * high wmark plus a "gap" where the gap is either the low
2815 * watermark or 1% of the zone, whichever is smaller.
2816 */
2817 balance_gap = min(low_wmark_pages(zone),
2818 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2819 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2820
2821 /*
2822 * If there is no low memory pressure or the zone is balanced then no
2823 * reclaim is necessary
2824 */
2825 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2826 if (!lowmem_pressure && zone_balanced(zone, testorder,
2827 balance_gap, classzone_idx))
2828 return true;
2829
2830 shrink_zone(zone, sc);
2831
2832 reclaim_state->reclaimed_slab = 0;
2833 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2834 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2835
2836 /* Account for the number of pages attempted to reclaim */
2837 *nr_attempted += sc->nr_to_reclaim;
2838
2839 if (nr_slab == 0 && !zone_reclaimable(zone))
2840 zone->all_unreclaimable = 1;
2841
2842 zone_clear_flag(zone, ZONE_WRITEBACK);
2843
2844 /*
2845 * If a zone reaches its high watermark, consider it to be no longer
2846 * congested. It's possible there are dirty pages backed by congested
2847 * BDIs but as pressure is relieved, speculatively avoid congestion
2848 * waits.
2849 */
2850 if (!zone->all_unreclaimable &&
2851 zone_balanced(zone, testorder, 0, classzone_idx)) {
2852 zone_clear_flag(zone, ZONE_CONGESTED);
2853 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2854 }
2855
2856 return sc->nr_scanned >= sc->nr_to_reclaim;
2857}
2858
2859/*
2604 * For kswapd, balance_pgdat() will work across all this node's zones until 2860 * For kswapd, balance_pgdat() will work across all this node's zones until
2605 * they are all at high_wmark_pages(zone). 2861 * they are all at high_wmark_pages(zone).
2606 * 2862 *
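The balance_gap computed in kswapd_shrink_zone() above is just the smaller of the low watermark and roughly 1% of the zone. A worked example with invented zone sizes (KSWAPD_ZONE_BALANCE_GAP_RATIO is 100):

/*
 *   managed_pages = 262144 (1GB of 4K pages), low_wmark_pages(zone) = 1536
 *   1% of the zone: (262144 + 99) / 100 = 2622 pages
 *   balance_gap   = min(1536, 2622)     = 1536 pages
 *
 * kswapd only skips reclaiming the zone once it is balanced against the high
 * watermark plus this gap, so a little headroom is kept beyond the bare
 * watermark before pressure is taken off.
 */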
@@ -2624,35 +2880,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2624static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2880static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2625 int *classzone_idx) 2881 int *classzone_idx)
2626{ 2882{
2627 bool pgdat_is_balanced = false;
2628 int i; 2883 int i;
2629 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2884 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2630 struct reclaim_state *reclaim_state = current->reclaim_state;
2631 unsigned long nr_soft_reclaimed; 2885 unsigned long nr_soft_reclaimed;
2632 unsigned long nr_soft_scanned; 2886 unsigned long nr_soft_scanned;
2633 struct scan_control sc = { 2887 struct scan_control sc = {
2634 .gfp_mask = GFP_KERNEL, 2888 .gfp_mask = GFP_KERNEL,
2889 .priority = DEF_PRIORITY,
2635 .may_unmap = 1, 2890 .may_unmap = 1,
2636 .may_swap = 1, 2891 .may_swap = 1,
2637 /* 2892 .may_writepage = !laptop_mode,
2638 * kswapd doesn't want to be bailed out while reclaim. because
2639 * we want to put equal scanning pressure on each zone.
2640 */
2641 .nr_to_reclaim = ULONG_MAX,
2642 .order = order, 2893 .order = order,
2643 .target_mem_cgroup = NULL, 2894 .target_mem_cgroup = NULL,
2644 }; 2895 };
2645 struct shrink_control shrink = {
2646 .gfp_mask = sc.gfp_mask,
2647 };
2648loop_again:
2649 sc.priority = DEF_PRIORITY;
2650 sc.nr_reclaimed = 0;
2651 sc.may_writepage = !laptop_mode;
2652 count_vm_event(PAGEOUTRUN); 2896 count_vm_event(PAGEOUTRUN);
2653 2897
2654 do { 2898 do {
2655 unsigned long lru_pages = 0; 2899 unsigned long lru_pages = 0;
2900 unsigned long nr_attempted = 0;
2901 bool raise_priority = true;
2902 bool pgdat_needs_compaction = (order > 0);
2903
2904 sc.nr_reclaimed = 0;
2656 2905
2657 /* 2906 /*
2658 * Scan in the highmem->dma direction for the highest 2907 * Scan in the highmem->dma direction for the highest
@@ -2689,23 +2938,46 @@ loop_again:
2689 end_zone = i; 2938 end_zone = i;
2690 break; 2939 break;
2691 } else { 2940 } else {
2692 /* If balanced, clear the congested flag */ 2941 /*
2942 * If balanced, clear the dirty and congested
2943 * flags
2944 */
2693 zone_clear_flag(zone, ZONE_CONGESTED); 2945 zone_clear_flag(zone, ZONE_CONGESTED);
2946 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2694 } 2947 }
2695 } 2948 }
2696 2949
2697 if (i < 0) { 2950 if (i < 0)
2698 pgdat_is_balanced = true;
2699 goto out; 2951 goto out;
2700 }
2701 2952
2702 for (i = 0; i <= end_zone; i++) { 2953 for (i = 0; i <= end_zone; i++) {
2703 struct zone *zone = pgdat->node_zones + i; 2954 struct zone *zone = pgdat->node_zones + i;
2704 2955
2956 if (!populated_zone(zone))
2957 continue;
2958
2705 lru_pages += zone_reclaimable_pages(zone); 2959 lru_pages += zone_reclaimable_pages(zone);
2960
2961 /*
2962 * If any zone is currently balanced then kswapd will
2963 * not call compaction as it is expected that the
2964 * necessary pages are already available.
2965 */
2966 if (pgdat_needs_compaction &&
2967 zone_watermark_ok(zone, order,
2968 low_wmark_pages(zone),
2969 *classzone_idx, 0))
2970 pgdat_needs_compaction = false;
2706 } 2971 }
2707 2972
2708 /* 2973 /*
2974 * If we're getting trouble reclaiming, start doing writepage
2975 * even in laptop mode.
2976 */
2977 if (sc.priority < DEF_PRIORITY - 2)
2978 sc.may_writepage = 1;
2979
2980 /*
2709 * Now scan the zone in the dma->highmem direction, stopping 2981 * Now scan the zone in the dma->highmem direction, stopping
2710 * at the last zone which needs scanning. 2982 * at the last zone which needs scanning.
2711 * 2983 *
@@ -2716,8 +2988,6 @@ loop_again:
2716 */ 2988 */
2717 for (i = 0; i <= end_zone; i++) { 2989 for (i = 0; i <= end_zone; i++) {
2718 struct zone *zone = pgdat->node_zones + i; 2990 struct zone *zone = pgdat->node_zones + i;
2719 int nr_slab, testorder;
2720 unsigned long balance_gap;
2721 2991
2722 if (!populated_zone(zone)) 2992 if (!populated_zone(zone))
2723 continue; 2993 continue;
@@ -2738,65 +3008,14 @@ loop_again:
2738 sc.nr_reclaimed += nr_soft_reclaimed; 3008 sc.nr_reclaimed += nr_soft_reclaimed;
2739 3009
2740 /* 3010 /*
2741 * We put equal pressure on every zone, unless 3011 * There should be no need to raise the scanning
2742 * one zone has way too many pages free 3012 * priority if enough pages are already being scanned
2743 * already. The "too many pages" is defined 3013 * that the high watermark would be met at 100%
2744 * as the high wmark plus a "gap" where the 3014 * efficiency.
2745 * gap is either the low watermark or 1%
2746 * of the zone, whichever is smaller.
2747 */ 3015 */
2748 balance_gap = min(low_wmark_pages(zone), 3016 if (kswapd_shrink_zone(zone, end_zone, &sc,
2749 (zone->managed_pages + 3017 lru_pages, &nr_attempted))
2750 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 3018 raise_priority = false;
2751 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2752 /*
2753 * Kswapd reclaims only single pages with compaction
2754 * enabled. Trying too hard to reclaim until contiguous
2755 * free pages have become available can hurt performance
2756 * by evicting too much useful data from memory.
2757 * Do not reclaim more than needed for compaction.
2758 */
2759 testorder = order;
2760 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2761 compaction_suitable(zone, order) !=
2762 COMPACT_SKIPPED)
2763 testorder = 0;
2764
2765 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2766 !zone_balanced(zone, testorder,
2767 balance_gap, end_zone)) {
2768 shrink_zone(zone, &sc);
2769
2770 reclaim_state->reclaimed_slab = 0;
2771 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2772 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2773
2774 if (nr_slab == 0 && !zone_reclaimable(zone))
2775 zone->all_unreclaimable = 1;
2776 }
2777
2778 /*
2779 * If we're getting trouble reclaiming, start doing
2780 * writepage even in laptop mode.
2781 */
2782 if (sc.priority < DEF_PRIORITY - 2)
2783 sc.may_writepage = 1;
2784
2785 if (zone->all_unreclaimable) {
2786 if (end_zone && end_zone == i)
2787 end_zone--;
2788 continue;
2789 }
2790
2791 if (zone_balanced(zone, testorder, 0, end_zone))
2792 /*
2793 * If a zone reaches its high watermark,
2794 * consider it to be no longer congested. It's
2795 * possible there are dirty pages backed by
2796 * congested BDIs but as pressure is relieved,
2797 * speculatively avoid congestion waits
2798 */
2799 zone_clear_flag(zone, ZONE_CONGESTED);
2800 } 3019 }
2801 3020
2802 /* 3021 /*
@@ -2808,74 +3027,38 @@ loop_again:
2808 pfmemalloc_watermark_ok(pgdat)) 3027 pfmemalloc_watermark_ok(pgdat))
2809 wake_up(&pgdat->pfmemalloc_wait); 3028 wake_up(&pgdat->pfmemalloc_wait);
2810 3029
2811 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2812 pgdat_is_balanced = true;
2813 break; /* kswapd: all done */
2814 }
2815
2816 /* 3030 /*
2817 * We do this so kswapd doesn't build up large priorities for 3031 * Fragmentation may mean that the system cannot be rebalanced
2818 * example when it is freeing in parallel with allocators. It 3032 * for high-order allocations in all zones. If twice the
2819 * matches the direct reclaim path behaviour in terms of impact 3033 * allocation size has been reclaimed and the zones are still
2820 * on zone->*_priority. 3034 * not balanced then recheck the watermarks at order-0 to
3035 * prevent kswapd reclaiming excessively. Assume that a
3036 * process requested a high-order can direct reclaim/compact.
2821 */ 3037 */
2822 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 3038 if (order && sc.nr_reclaimed >= 2UL << order)
2823 break; 3039 order = sc.order = 0;
2824 } while (--sc.priority >= 0);
2825
2826out:
2827 if (!pgdat_is_balanced) {
2828 cond_resched();
2829 3040
2830 try_to_freeze(); 3041 /* Check if kswapd should be suspending */
3042 if (try_to_freeze() || kthread_should_stop())
3043 break;
2831 3044
2832 /* 3045 /*
2833 * Fragmentation may mean that the system cannot be 3046 * Compact if necessary and kswapd is reclaiming at least the
2834 * rebalanced for high-order allocations in all zones. 3047 * high watermark number of pages as requested
2835 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
2836 * it means the zones have been fully scanned and are still
2837 * not balanced. For high-order allocations, there is
2838 * little point trying all over again as kswapd may
2839 * infinite loop.
2840 *
2841 * Instead, recheck all watermarks at order-0 as they
2842 * are the most important. If watermarks are ok, kswapd will go
2843 * back to sleep. High-order users can still perform direct
2844 * reclaim if they wish.
2845 */ 3048 */
2846 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) 3049 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
2847 order = sc.order = 0;
2848
2849 goto loop_again;
2850 }
2851
2852 /*
2853 * If kswapd was reclaiming at a higher order, it has the option of
2854 * sleeping without all zones being balanced. Before it does, it must
2855 * ensure that the watermarks for order-0 on *all* zones are met and
2856 * that the congestion flags are cleared. The congestion flag must
2857 * be cleared as kswapd is the only mechanism that clears the flag
2858 * and it is potentially going to sleep here.
2859 */
2860 if (order) {
2861 int zones_need_compaction = 1;
2862
2863 for (i = 0; i <= end_zone; i++) {
2864 struct zone *zone = pgdat->node_zones + i;
2865
2866 if (!populated_zone(zone))
2867 continue;
2868
2869 /* Check if the memory needs to be defragmented. */
2870 if (zone_watermark_ok(zone, order,
2871 low_wmark_pages(zone), *classzone_idx, 0))
2872 zones_need_compaction = 0;
2873 }
2874
2875 if (zones_need_compaction)
2876 compact_pgdat(pgdat, order); 3050 compact_pgdat(pgdat, order);
2877 }
2878 3051
3052 /*
3053 * Raise priority if scanning rate is too low or there was no
3054 * progress in reclaiming pages
3055 */
3056 if (raise_priority || !sc.nr_reclaimed)
3057 sc.priority--;
3058 } while (sc.priority >= 1 &&
3059 !pgdat_balanced(pgdat, order, *classzone_idx));
3060
3061out:
2879 /* 3062 /*
2880 * Return the order we were reclaiming at so prepare_kswapd_sleep() 3063 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2881 * makes a decision on the order we were last reclaiming at. However, 3064 * makes a decision on the order we were last reclaiming at. However,