path: root/mm
author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 01:45:43 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 01:45:43 -0500
commit     5cbb3d216e2041700231bcfc383ee5f8b7fc8b74 (patch)
tree       a738fa82dbcefa9bd283c08bc67f38827be63937 /mm
parent     9bc9ccd7db1c9f043f75380b5a5b94912046a60e (diff)
parent     4e9b45a19241354daec281d7a785739829b52359 (diff)
Merge branch 'akpm' (patches from Andrew Morton)
Merge first patch-bomb from Andrew Morton:
 "Quite a lot of other stuff is banked up awaiting further next->mainline
  merging, but this batch contains:

   - Lots of random misc patches
   - OCFS2
   - Most of MM
   - backlight updates
   - lib/ updates
   - printk updates
   - checkpatch updates
   - epoll tweaking
   - rtc updates
   - hfs
   - hfsplus
   - documentation
   - procfs
   - update gcov to gcc-4.7 format
   - IPC"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (269 commits)
  ipc, msg: fix message length check for negative values
  ipc/util.c: remove unnecessary work pending test
  devpts: plug the memory leak in kill_sb
  ./Makefile: export initial ramdisk compression config option
  init/Kconfig: add option to disable kernel compression
  drivers: w1: make w1_slave::flags long to avoid memory corruption
  drivers/w1/masters/ds1wm.c: use dev_get_platdata()
  drivers/memstick/core/ms_block.c: fix unreachable state in h_msb_read_page()
  drivers/memstick/core/mspro_block.c: fix attributes array allocation
  drivers/pps/clients/pps-gpio.c: remove redundant of_match_ptr
  kernel/panic.c: reduce 1 byte usage for print tainted buffer
  gcov: reuse kbasename helper
  kernel/gcov/fs.c: use pr_warn()
  kernel/module.c: use pr_foo()
  gcov: compile specific gcov implementation based on gcc version
  gcov: add support for gcc 4.7 gcov format
  gcov: move gcov structs definitions to a gcc version specific file
  kernel/taskstats.c: return -ENOMEM when alloc memory fails in add_del_listener()
  kernel/taskstats.c: add nla_nest_cancel() for failure processing between nla_nest_start() and nla_nest_end()
  kernel/sysctl_binary.c: use scnprintf() instead of snprintf()
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            17
-rw-r--r--  mm/bootmem.c           8
-rw-r--r--  mm/compaction.c        7
-rw-r--r--  mm/huge_memory.c      78
-rw-r--r--  mm/kmemleak.c          4
-rw-r--r--  mm/ksm.c               4
-rw-r--r--  mm/memblock.c        124
-rw-r--r--  mm/memcontrol.c       97
-rw-r--r--  mm/memory-failure.c   36
-rw-r--r--  mm/memory.c            2
-rw-r--r--  mm/memory_hotplug.c   65
-rw-r--r--  mm/mempolicy.c        62
-rw-r--r--  mm/mmap.c             16
-rw-r--r--  mm/mprotect.c         10
-rw-r--r--  mm/nobootmem.c        25
-rw-r--r--  mm/nommu.c             3
-rw-r--r--  mm/page_alloc.c       34
-rw-r--r--  mm/readahead.c         8
-rw-r--r--  mm/slab.c              2
-rw-r--r--  mm/slab.h              6
-rw-r--r--  mm/slab_common.c       2
-rw-r--r--  mm/slub.c              2
-rw-r--r--  mm/sparse.c           53
-rw-r--r--  mm/swapfile.c         16
-rw-r--r--  mm/util.c             13
-rw-r--r--  mm/vmalloc.c          48
-rw-r--r--  mm/vmstat.c           22
-rw-r--r--  mm/zswap.c           195
28 files changed, 569 insertions, 390 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 394838f489eb..3f4ffda152bb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,11 +153,18 @@ config MOVABLE_NODE
 	  help
 	  Allow a node to have only movable memory. Pages used by the kernel,
 	  such as direct mapping pages cannot be migrated. So the corresponding
-	  memory device cannot be hotplugged. This option allows users to
-	  online all the memory of a node as movable memory so that the whole
-	  node can be hotplugged. Users who don't use the memory hotplug
-	  feature are fine with this option on since they don't online memory
-	  as movable.
+	  memory device cannot be hotplugged. This option allows the following
+	  two things:
+	  - When the system is booting, node full of hotpluggable memory can
+	    be arranged to have only movable memory so that the whole node can
+	    be hot-removed. (need movable_node boot option specified).
+	  - After the system is up, the option allows users to online all the
+	    memory of a node as movable memory so that the whole node can be
+	    hot-removed.
+
+	  Users who don't use the memory hotplug feature are fine with this
+	  option on since they don't specify movable_node boot option or they
+	  don't online memory as movable.
 
 	  Say Y here if you want to hotplug a whole node.
 	  Say N here if you want kernel to use memory on all nodes evenly.
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..90bd3507b413 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	struct page *page;
-	unsigned long start, end, pages, count = 0;
+	unsigned long *map, start, end, pages, count = 0;
 
 	if (!bdata->node_bootmem_map)
 		return 0;
 
+	map = bdata->node_bootmem_map;
 	start = bdata->node_min_pfn;
 	end = bdata->node_low_pfn;
 
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 		bdata - bootmem_node_data, start, end);
 
 	while (start < end) {
-		unsigned long *map, idx, vec;
+		unsigned long idx, vec;
 		unsigned shift;
 
-		map = bdata->node_bootmem_map;
 		idx = start - bdata->node_min_pfn;
 		shift = idx & (BITS_PER_LONG - 1);
 		/*
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
 	/* update goal according ...MAX_DMA32_PFN */
-	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	end_pfn = pgdat_end_pfn(pgdat);
 
 	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
 	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/compaction.c b/mm/compaction.c
index b5326b141a25..805165bcd3dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page)
 }
 
 /*
- * Isolate free pages onto a private freelist. Caller must hold zone->lock.
- * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
- * pages inside of the pageblock (even though it may still end up isolating
- * some pages).
+ * Isolate free pages onto a private freelist. If @strict is true, will abort
+ * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
+ * (even though it may still end up isolating some pages).
  */
 static unsigned long isolate_freepages_block(struct compact_control *cc,
 					unsigned long blockpfn,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2612f60f53ee..0556c6a44959 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@
 #include "internal.h"
 
 /*
- * By default transparent hugepage support is enabled for all mappings
- * and khugepaged scans all mappings. Defrag is only invoked by
- * khugepaged hugepage allocations and by page faults inside
- * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
- * allocations.
+ * By default transparent hugepage support is disabled in order that avoid
+ * to risk increase the memory footprint of applications without a guaranteed
+ * benefit. When transparent hugepage support is enabled, is for all mappings,
+ * and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
  */
 unsigned long transparent_hugepage_flags __read_mostly =
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -758,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
 			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-			   HPAGE_PMD_ORDER);
-}
-#endif
-
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 		struct page *zero_page)
@@ -2198,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
 			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
 }
 
+static int khugepaged_node_load[MAX_NUMNODES];
+
 #ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+	static int last_khugepaged_target_node = NUMA_NO_NODE;
+	int nid, target_node = 0, max_value = 0;
+
+	/* find first node with max normal pages hit */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		if (khugepaged_node_load[nid] > max_value) {
+			max_value = khugepaged_node_load[nid];
+			target_node = nid;
+		}
+
+	/* do some balance if several nodes have the same hit record */
+	if (target_node <= last_khugepaged_target_node)
+		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+				nid++)
+			if (max_value == khugepaged_node_load[nid]) {
+				target_node = nid;
+				break;
+			}
+
+	last_khugepaged_target_node = target_node;
+	return target_node;
+}
+
 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 {
 	if (IS_ERR(*hpage)) {
@@ -2232,9 +2252,8 @@ static struct page
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	*hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
-				      node, __GFP_OTHER_NODE);
-
+	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
 	/*
 	 * After allocating the hugepage, release the mmap_sem read lock in
 	 * preparation for taking it in write mode.
@@ -2250,6 +2269,17 @@ static struct page
 	return *hpage;
 }
 #else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag)
+{
+	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+			   HPAGE_PMD_ORDER);
+}
+
 static struct page *khugepaged_alloc_hugepage(bool *wait)
 {
 	struct page *hpage;
@@ -2456,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	if (pmd_trans_huge(*pmd))
 		goto out;
 
+	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
@@ -2472,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		if (unlikely(!page))
 			goto out_unmap;
 		/*
-		 * Chose the node of the first page. This could
-		 * be more sophisticated and look at more pages,
-		 * but isn't for now.
+		 * Record which node the original page is from and save this
+		 * information to khugepaged_node_load[].
+		 * Khupaged will allocate hugepage from the node has the max
+		 * hit record.
 		 */
-		if (node == NUMA_NO_NODE)
-			node = page_to_nid(page);
+		node = page_to_nid(page);
+		khugepaged_node_load[node]++;
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -2492,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret)
+	if (ret) {
+		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
 		collapse_huge_page(mm, address, hpage, vma, node);
+	}
 out:
 	return ret;
 }
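To see how the khugepaged_find_target_node() policy added above behaves when several nodes tie for the highest hit count, here is a small userspace re-creation of the same selection loop; the array size, the node loads and the function names are made-up stand-ins for this sketch, not kernel code.

#include <stdio.h>

#define MAX_NUMNODES 4
#define NUMA_NO_NODE (-1)

static int node_load[MAX_NUMNODES];

static int find_target_node(void)
{
	static int last_target_node = NUMA_NO_NODE;
	int nid, target_node = 0, max_value = 0;

	/* first node with the highest hit count wins by default */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (node_load[nid] > max_value) {
			max_value = node_load[nid];
			target_node = nid;
		}

	/* rotate among nodes that tie with the previous winner */
	if (target_node <= last_target_node)
		for (nid = last_target_node + 1; nid < MAX_NUMNODES; nid++)
			if (node_load[nid] == max_value) {
				target_node = nid;
				break;
			}

	last_target_node = target_node;
	return target_node;
}

int main(void)
{
	/* nodes 1 and 3 tie: repeated calls alternate between them */
	node_load[1] = 8;
	node_load[3] = 8;
	for (int i = 0; i < 4; i++)
		printf("pass %d -> node %d\n", i, find_target_node());
	return 0;
}

Running this prints nodes 1, 3, 1, 3 in turn, which is the "do some balance" behaviour the comment in the hunk describes.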
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e126b0ef9ad2..31f01c5011e5 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 	}
 
 	spin_lock_irqsave(&object->lock, flags);
-	if (ptr + size > object->pointer + object->size) {
+	if (size == SIZE_MAX) {
+		size = object->pointer + object->size - ptr;
+	} else if (ptr + size > object->pointer + object->size) {
 		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
 		dump_object_info(object);
 		kmem_cache_free(scan_area_cache, area);
diff --git a/mm/ksm.c b/mm/ksm.c
index 0bea2b262a47..175fff79dc95 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
 	 * Allocate stable and unstable together:
 	 * MAXSMP NODES_SHIFT 10 will use 16kB.
 	 */
-	buf = kcalloc(nr_node_ids + nr_node_ids,
-		      sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
+	buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
+		      GFP_KERNEL);
 	/* Let us assume that RB_ROOT is NULL is zero */
 	if (!buf)
 		err = -ENOMEM;
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ac412a0a7ee..53e477bb5558 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
 
+#include <asm-generic/sections.h>
+
 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
 
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
 	.reserved.cnt		= 1,	/* empty dummy entry */
 	.reserved.max		= INIT_MEMBLOCK_REGIONS,
 
+	.bottom_up		= false,
 	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
 };
 
@@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
 	return (i < type->cnt) ? i : -1;
 }
 
+/*
+ * __memblock_find_range_bottom_up - find free area utility in bottom-up
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area bottom-up.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
+				phys_addr_t size, phys_addr_t align, int nid)
+{
+	phys_addr_t this_start, this_end, cand;
+	u64 i;
+
+	for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+		this_start = clamp(this_start, start, end);
+		this_end = clamp(this_end, start, end);
+
+		cand = round_up(this_start, align);
+		if (cand < this_end && this_end - cand >= size)
+			return cand;
+	}
+
+	return 0;
+}
+
+/**
+ * __memblock_find_range_top_down - find free area utility, in top-down
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area top-down.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
+			       phys_addr_t size, phys_addr_t align, int nid)
+{
+	phys_addr_t this_start, this_end, cand;
+	u64 i;
+
+	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+		this_start = clamp(this_start, start, end);
+		this_end = clamp(this_end, start, end);
+
+		if (this_end < size)
+			continue;
+
+		cand = round_down(this_end - size, align);
+		if (cand >= this_start)
+			return cand;
+	}
+
+	return 0;
+}
+
 /**
  * memblock_find_in_range_node - find free area in given range and node
  * @start: start of candidate range
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
+ * When allocation direction is bottom-up, the @start should be greater
+ * than the end of the kernel image. Otherwise, it will be trimmed. The
+ * reason is that we want the bottom-up allocation just near the kernel
+ * image so it is highly likely that the allocated memory and the kernel
+ * will reside in the same node.
+ *
+ * If bottom-up allocation failed, will try to allocate memory top-down.
+ *
  * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align, int nid)
 {
-	phys_addr_t this_start, this_end, cand;
-	u64 i;
+	int ret;
+	phys_addr_t kernel_end;
 
 	/* pump up @end */
 	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 	/* avoid allocating the first page */
 	start = max_t(phys_addr_t, start, PAGE_SIZE);
 	end = max(start, end);
+	kernel_end = __pa_symbol(_end);
 
-	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
-		this_start = clamp(this_start, start, end);
-		this_end = clamp(this_end, start, end);
+	/*
+	 * try bottom-up allocation only when bottom-up mode
+	 * is set and @end is above the kernel image.
+	 */
+	if (memblock_bottom_up() && end > kernel_end) {
+		phys_addr_t bottom_up_start;
 
-		if (this_end < size)
-			continue;
+		/* make sure we will allocate above the kernel */
+		bottom_up_start = max(start, kernel_end);
 
-		cand = round_down(this_end - size, align);
-		if (cand >= this_start)
-			return cand;
+		/* ok, try bottom-up allocation first */
+		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
+						      size, align, nid);
+		if (ret)
+			return ret;
+
+		/*
+		 * we always limit bottom-up allocation above the kernel,
+		 * but top-down allocation doesn't have the limit, so
+		 * retrying top-down allocation may succeed when bottom-up
+		 * allocation failed.
+		 *
+		 * bottom-up allocation is expected to be fail very rarely,
+		 * so we use WARN_ONCE() here to see the stack trace if
+		 * fail happens.
		 */
+		WARN_ONCE(1, "memblock: bottom-up allocation failed, "
+			     "memory hotunplug may be affected\n");
 	}
-	return 0;
+
+	return __memblock_find_range_top_down(start, end, size, align, nid);
 }
 
 /**
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 * Find @size free area aligned to @align in the specified range.
 *
 * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
 */
 phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
					phys_addr_t end, phys_addr_t size,
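As a rough illustration of the two search orders introduced above, the following standalone program mimics the candidate arithmetic of __memblock_find_range_bottom_up() and __memblock_find_range_top_down() over two invented free ranges; the addresses, the fake kernel end and the helper names here are assumptions made for the demo only, not values from the patch.

#include <stdio.h>

typedef unsigned long long phys_t;

struct range { phys_t start, end; };

/* pretend free memory: [16M,64M) and [512M,1G) */
static const struct range free_ranges[] = {
	{ 0x01000000ULL, 0x04000000ULL },
	{ 0x20000000ULL, 0x40000000ULL },
};

static phys_t round_up_p(phys_t x, phys_t a)   { return (x + a - 1) & ~(a - 1); }
static phys_t round_down_p(phys_t x, phys_t a) { return x & ~(a - 1); }

static phys_t find_bottom_up(phys_t start, phys_t end, phys_t size, phys_t align)
{
	for (unsigned i = 0; i < 2; i++) {
		phys_t lo = free_ranges[i].start > start ? free_ranges[i].start : start;
		phys_t hi = free_ranges[i].end < end ? free_ranges[i].end : end;
		phys_t cand = round_up_p(lo, align);

		if (cand < hi && hi - cand >= size)
			return cand;	/* lowest suitable address */
	}
	return 0;
}

static phys_t find_top_down(phys_t start, phys_t end, phys_t size, phys_t align)
{
	for (int i = 1; i >= 0; i--) {
		phys_t lo = free_ranges[i].start > start ? free_ranges[i].start : start;
		phys_t hi = free_ranges[i].end < end ? free_ranges[i].end : end;

		if (hi < size)
			continue;
		phys_t cand = round_down_p(hi - size, align);
		if (cand >= lo)
			return cand;	/* highest suitable address */
	}
	return 0;
}

int main(void)
{
	phys_t kernel_end = 0x02000000ULL;	/* pretend _end is at 32M */
	phys_t size = 0x00100000ULL, align = 0x1000ULL;

	/* bottom-up is constrained to start above the kernel image */
	printf("bottom-up: %#llx\n", find_bottom_up(kernel_end, ~0ULL, size, align));
	printf("top-down : %#llx\n", find_top_down(0, ~0ULL, size, align));
	return 0;
}

With these made-up ranges the bottom-up search lands just above the fake kernel end (0x2000000) while the top-down search picks the highest fitting candidate (0x3ff00000), which is the placement difference the movable_node handling relies on.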
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 796820925de0..f20a57b7faf2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,6 +59,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/tcp_memcontrol.h>
+#include "slab.h"
 
 #include <asm/uaccess.h>
 
@@ -2968,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
 
 	VM_BUG_ON(p->is_root_cache);
 	cachep = p->root_cache;
-	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+	return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
 }
 
 #ifdef CONFIG_SLABINFO
@@ -2997,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
 	struct res_counter *fail_res;
 	struct mem_cgroup *_memcg;
 	int ret = 0;
-	bool may_oom;
 
 	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
 	if (ret)
 		return ret;
 
-	/*
-	 * Conditions under which we can wait for the oom_killer. Those are
-	 * the same conditions tested by the core page allocator
-	 */
-	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
 	_memcg = memcg;
 	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
-				      &_memcg, may_oom);
+				      &_memcg, oom_gfp_allowed(gfp));
 
 	if (ret == -EINTR) {
 		/*
@@ -3151,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 {
 	struct memcg_cache_params *cur_params = s->memcg_params;
 
-	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+	VM_BUG_ON(!is_root_cache(s));
 
 	if (num_groups > memcg_limited_groups_array_size) {
 		int i;
@@ -3412,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	idx = memcg_cache_id(memcg);
 
 	mutex_lock(&memcg_cache_mutex);
-	new_cachep = cachep->memcg_params->memcg_caches[idx];
+	new_cachep = cache_from_memcg_idx(cachep, idx);
 	if (new_cachep) {
 		css_put(&memcg->css);
 		goto out;
@@ -3458,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
 	 * we'll take the set_limit_mutex to protect ourselves against this.
 	 */
 	mutex_lock(&set_limit_mutex);
-	for (i = 0; i < memcg_limited_groups_array_size; i++) {
-		c = s->memcg_params->memcg_caches[i];
+	for_each_memcg_cache_index(i) {
+		c = cache_from_memcg_idx(s, i);
 		if (!c)
 			continue;
 
@@ -3592,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	 * code updating memcg_caches will issue a write barrier to match this.
 	 */
 	read_barrier_depends();
-	if (likely(cachep->memcg_params->memcg_caches[idx])) {
-		cachep = cachep->memcg_params->memcg_caches[idx];
+	if (likely(cache_from_memcg_idx(cachep, idx))) {
+		cachep = cache_from_memcg_idx(cachep, idx);
 		goto out;
 	}
 
@@ -5389,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
 				struct cftype *cft, struct seq_file *m)
 {
+	struct numa_stat {
+		const char *name;
+		unsigned int lru_mask;
+	};
+
+	static const struct numa_stat stats[] = {
+		{ "total", LRU_ALL },
+		{ "file", LRU_ALL_FILE },
+		{ "anon", LRU_ALL_ANON },
+		{ "unevictable", BIT(LRU_UNEVICTABLE) },
+	};
+	const struct numa_stat *stat;
 	int nid;
-	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
-	unsigned long node_nr;
+	unsigned long nr;
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-	total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
-	seq_printf(m, "total=%lu", total_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
-
-	file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
-	seq_printf(m, "file=%lu", file_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
-						       LRU_ALL_FILE);
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
-
-	anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
-	seq_printf(m, "anon=%lu", anon_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
-						       LRU_ALL_ANON);
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
+	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
+		seq_printf(m, "%s=%lu", stat->name, nr);
+		for_each_node_state(nid, N_MEMORY) {
+			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+							  stat->lru_mask);
+			seq_printf(m, " N%d=%lu", nid, nr);
+		}
+		seq_putc(m, '\n');
+	}
+
+	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+		struct mem_cgroup *iter;
+
+		nr = 0;
+		for_each_mem_cgroup_tree(iter, memcg)
+			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
+		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
+		for_each_node_state(nid, N_MEMORY) {
+			nr = 0;
+			for_each_mem_cgroup_tree(iter, memcg)
+				nr += mem_cgroup_node_nr_lru_pages(
+					iter, nid, stat->lru_mask);
+			seq_printf(m, " N%d=%lu", nid, nr);
+		}
+		seq_putc(m, '\n');
+	}
 
-	unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
-	seq_printf(m, "unevictable=%lu", unevictable_nr);
-	for_each_node_state(nid, N_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
-						       BIT(LRU_UNEVICTABLE));
-		seq_printf(m, " N%d=%lu", nid, node_nr);
-	}
-	seq_putc(m, '\n');
 	return 0;
 }
 #endif /* CONFIG_NUMA */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bf3351b5115e..f9d78ec7831f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1423,19 +1423,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 		return 1;
 
 	/*
-	 * The lock_memory_hotplug prevents a race with memory hotplug.
-	 * This is a big hammer, a better would be nicer.
-	 */
-	lock_memory_hotplug();
-
-	/*
-	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free. This flag should be kept set until the source page
-	 * is freed and PG_hwpoison on it is set.
-	 */
-	if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
-		set_migratetype_isolate(p, true);
-	/*
 	 * When the target page is a free hugepage, just remove it
 	 * from free hugepage list.
 	 */
@@ -1455,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 		/* Not a free page */
 		ret = 1;
 	}
-	unlock_memory_hotplug();
 	return ret;
 }
 
@@ -1654,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags)
 		}
 	}
 
+	/*
+	 * The lock_memory_hotplug prevents a race with memory hotplug.
+	 * This is a big hammer, a better would be nicer.
+	 */
+	lock_memory_hotplug();
+
+	/*
+	 * Isolate the page, so that it doesn't get reallocated if it
+	 * was free. This flag should be kept set until the source page
+	 * is freed and PG_hwpoison on it is set.
+	 */
+	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		set_migratetype_isolate(page, true);
+
 	ret = get_any_page(page, pfn, flags);
-	if (ret < 0)
-		goto unset;
-	if (ret) { /* for in-use pages */
+	unlock_memory_hotplug();
+	if (ret > 0) { /* for in-use pages */
 		if (PageHuge(page))
 			ret = soft_offline_huge_page(page, flags);
 		else
 			ret = __soft_offline_page(page, flags);
-	} else { /* for free pages */
+	} else if (ret == 0) { /* for free pages */
 		if (PageHuge(page)) {
 			set_page_hwpoison_huge_page(hpage);
 			dequeue_hwpoisoned_huge_page(hpage);
@@ -1673,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags)
 			atomic_long_inc(&num_poisoned_pages);
 		}
 	}
-unset:
 	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
 	return ret;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 33a3dbec3cc8..bf8665849a5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 /*
  * This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
  */
 void free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..489f235502db 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
 #include <linux/firmware-map.h>
 #include <linux/stop_machine.h>
 #include <linux/hugetlb.h>
+#include <linux/memblock.h>
 
 #include <asm/tlbflush.h>
 
@@ -365,8 +366,7 @@ out_fail:
 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 			    unsigned long end_pfn)
 {
-	unsigned long old_pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
 
 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
 		pgdat->node_start_pfn = start_pfn;
@@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 static int __meminit __add_section(int nid, struct zone *zone,
 					unsigned long phys_start_pfn)
 {
-	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
 	if (pfn_valid(phys_start_pfn))
 		return -EEXIST;
 
-	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+	ret = sparse_add_one_section(zone, phys_start_pfn);
 
 	if (ret < 0)
 		return ret;
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 static void shrink_pgdat_span(struct pglist_data *pgdat,
 			      unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
-	unsigned long pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+	unsigned long pgdat_end_pfn = p;
 	unsigned long pfn;
 	struct mem_section *ms;
 	int nid = pgdat->node_id;
@@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	arg.nr_pages = nr_pages;
 	node_states_check_changes_online(nr_pages, zone, &arg);
 
-	nid = page_to_nid(pfn_to_page(pfn));
+	nid = pfn_to_nid(pfn);
 
 	ret = memory_notify(MEM_GOING_ONLINE, &arg);
 	ret = notifier_to_errno(ret);
@@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 
-/*
+/**
+ * try_online_node - online a node if offlined
+ *
  * called by cpu_up() to online a node without onlined memory.
 */
-int mem_online_node(int nid)
+int try_online_node(int nid)
 {
 	pg_data_t *pgdat;
 	int ret;
 
+	if (node_online(nid))
+		return 0;
+
 	lock_memory_hotplug();
 	pgdat = hotadd_new_pgdat(nid, 0);
 	if (!pgdat) {
+		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1062,6 +1067,12 @@ int mem_online_node(int nid)
 	ret = register_one_node(nid);
 	BUG_ON(ret);
 
+	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+		mutex_lock(&zonelists_mutex);
+		build_all_zonelists(NULL, NULL);
+		mutex_unlock(&zonelists_mutex);
+	}
+
 out:
 	unlock_memory_hotplug();
 	return ret;
@@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
 }
 #endif /* CONFIG_MOVABLE_NODE */
 
+static int __init cmdline_parse_movable_node(char *p)
+{
+#ifdef CONFIG_MOVABLE_NODE
+	/*
+	 * Memory used by the kernel cannot be hot-removed because Linux
+	 * cannot migrate the kernel pages. When memory hotplug is
+	 * enabled, we should prevent memblock from allocating memory
+	 * for the kernel.
+	 *
+	 * ACPI SRAT records all hotpluggable memory ranges. But before
+	 * SRAT is parsed, we don't know about it.
+	 *
+	 * The kernel image is loaded into memory at very early time. We
+	 * cannot prevent this anyway. So on NUMA system, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers, one node could have double-digit
+	 * gigabytes memory, we can assume the memory around the kernel
+	 * image is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to try the best to keep
+	 * the kernel away from hotpluggable memory.
+	 */
+	memblock_set_bottom_up(true);
+#else
+	pr_warn("movable_node option not supported\n");
+#endif
+	return 0;
+}
+early_param("movable_node", cmdline_parse_movable_node);
+
 /* check which state of node_states will be changed when offline memory */
 static void node_states_check_changes_offline(unsigned long nr_pages,
 		struct zone *zone, struct memory_notify *arg)
@@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
 	int ret = !is_memblock_offlined(mem);
 
@@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
 	 * if this is not the case.
 	 */
 	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
-				is_memblock_offlined_cb);
+				check_memblock_offlined_cb);
 	if (ret) {
 		unlock_memory_hotplug();
 		BUG();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71cb253368cb..4cc19f6ab6c6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1125,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	tmp = *from;
 	while (!nodes_empty(tmp)) {
 		int s,d;
-		int source = -1;
+		int source = NUMA_NO_NODE;
 		int dest = 0;
 
 		for_each_node_mask(s, tmp) {
@@ -1160,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 			if (!node_isset(dest, tmp))
 				break;
 		}
-		if (source == -1)
+		if (source == NUMA_NO_NODE)
 			break;
 
 		node_clear(source, tmp);
@@ -1835,7 +1835,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target;
 	int c;
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	if (!nnodes)
 		return numa_node_id();
@@ -1872,11 +1872,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 
 /*
  * Return the bit number of a random bit set in the nodemask.
- * (returns -1 if nodemask is empty)
+ * (returns NUMA_NO_NODE if nodemask is empty)
 */
 int node_random(const nodemask_t *maskp)
 {
-	int w, bit = -1;
+	int w, bit = NUMA_NO_NODE;
 
 	w = nodes_weight(*maskp);
 	if (w)
@@ -2914,62 +2914,45 @@ out:
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 *
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+ * Convert @pol into a string. If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
 */
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
 	char *p = buffer;
-	int l;
-	nodemask_t nodes;
-	unsigned short mode;
-	unsigned short flags = pol ? pol->flags : 0;
-
-	/*
-	 * Sanity check: room for longest mode, flag and some nodes
-	 */
-	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+	nodemask_t nodes = NODE_MASK_NONE;
+	unsigned short mode = MPOL_DEFAULT;
+	unsigned short flags = 0;
 
-	if (!pol || pol == &default_policy)
-		mode = MPOL_DEFAULT;
-	else
+	if (pol && pol != &default_policy) {
 		mode = pol->mode;
+		flags = pol->flags;
+	}
 
 	switch (mode) {
 	case MPOL_DEFAULT:
-		nodes_clear(nodes);
 		break;
-
 	case MPOL_PREFERRED:
-		nodes_clear(nodes);
 		if (flags & MPOL_F_LOCAL)
 			mode = MPOL_LOCAL;
 		else
 			node_set(pol->v.preferred_node, nodes);
 		break;
-
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
-
 	default:
-		return -EINVAL;
+		WARN_ON_ONCE(1);
+		snprintf(p, maxlen, "unknown");
+		return;
 	}
 
-	l = strlen(policy_modes[mode]);
-	if (buffer + maxlen < p + l + 1)
-		return -ENOSPC;
-
-	strcpy(p, policy_modes[mode]);
-	p += l;
+	p += snprintf(p, maxlen, policy_modes[mode]);
 
 	if (flags & MPOL_MODE_FLAGS) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = '=';
+		p += snprintf(p, buffer + maxlen - p, "=");
 
 		/*
 		 * Currently, the only defined flags are mutually exclusive
@@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	}
 
 	if (!nodes_empty(nodes)) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = ':';
+		p += snprintf(p, buffer + maxlen - p, ":");
 		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
 	}
-	return p - buffer;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index ab199dfc9e26..5a6baddde15d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = (totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
@@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (len > TASK_SIZE)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
 
 	info.flags = 0;
 	info.length = len;
-	info.low_limit = TASK_UNMAPPED_BASE;
+	info.low_limit = mm->mmap_base;
 	info.high_limit = TASK_SIZE;
 	info.align_mask = 0;
 	return vm_unmapped_area(&info);
@@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct vm_unmapped_area_info info;
 
 	/* requested length too big for entire address space */
-	if (len > TASK_SIZE)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
-	info.low_limit = PAGE_SIZE;
+	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
 	info.high_limit = mm->mmap_base;
 	info.align_mask = 0;
 	addr = vm_unmapped_area(&info);
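For reference, the arithmetic the removed lines in __vm_enough_memory() performed (and which the new vm_commit_limit() helper presumably centralizes, given that the separate total_swap_pages addition also goes away) works out as in this toy calculation; the machine sizes are invented and the admin reserve is left out.

#include <stdio.h>

int main(void)
{
	unsigned long long page_size = 4096;
	unsigned long long totalram_pages   = (16ULL << 30) / page_size; /* 16 GiB RAM  */
	unsigned long long hugetlb_pages    = (1ULL  << 30) / page_size; /*  1 GiB huge */
	unsigned long long total_swap_pages = (8ULL  << 30) / page_size; /*  8 GiB swap */
	unsigned long long overcommit_ratio = 50;                        /* vm.overcommit_ratio */

	/* (totalram - hugetlb) * ratio / 100 + swap, as in the lines removed above */
	unsigned long long allowed = (totalram_pages - hugetlb_pages)
				     * overcommit_ratio / 100
				     + total_swap_pages;

	printf("commit limit: %llu pages (%llu MiB)\n",
	       allowed, allowed * page_size >> 20);
	return 0;
}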
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a597f2ffcd6f..26667971c824 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -112,6 +112,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 	pmd_t *pmd;
 	unsigned long next;
 	unsigned long pages = 0;
+	unsigned long nr_huge_updates = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -126,9 +127,10 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 						newprot, prot_numa);
 
 			if (nr_ptes) {
-				if (nr_ptes == HPAGE_PMD_NR)
-					pages++;
-
+				if (nr_ptes == HPAGE_PMD_NR) {
+					pages += HPAGE_PMD_NR;
+					nr_huge_updates++;
+				}
 				continue;
 			}
 		}
@@ -141,6 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pages += this_pages;
 	} while (pmd++, addr = next, addr != end);
 
+	if (nr_huge_updates)
+		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
 	return pages;
 }
 
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 61107cf55bb3..2c254d374655 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 
 static void __init __free_pages_memory(unsigned long start, unsigned long end)
 {
-	unsigned long i, start_aligned, end_aligned;
-	int order = ilog2(BITS_PER_LONG);
-
-	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
-	end_aligned = end & ~(BITS_PER_LONG - 1);
-
-	if (end_aligned <= start_aligned) {
-		for (i = start; i < end; i++)
-			__free_pages_bootmem(pfn_to_page(i), 0);
-
-		return;
-	}
-
-	for (i = start; i < start_aligned; i++)
-		__free_pages_bootmem(pfn_to_page(i), 0);
-
-	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
-		__free_pages_bootmem(pfn_to_page(i), order);
-
-	for (i = end_aligned; i < end; i++)
-		__free_pages_bootmem(pfn_to_page(i), 0);
+	int order;
+
+	while (start < end) {
+		order = min(MAX_ORDER - 1UL, __ffs(start));
+
+		while (start + (1UL << order) > end)
+			order--;
+
+		__free_pages_bootmem(pfn_to_page(start), order);
+
+		start += (1UL << order);
+	}
 }
 
 static unsigned long __init __free_memory_core(phys_addr_t start,
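The order selection in the rewritten __free_pages_memory() loop above can be tried out in userspace; in this sketch __ffs() is approximated with __builtin_ctzl() and MAX_ORDER is assumed to be 11, the common default, so treat it as an illustration of the arithmetic rather than kernel code.

#include <stdio.h>

#define MAX_ORDER 11UL

static unsigned long pick_order(unsigned long start, unsigned long end)
{
	/* largest power-of-two block that start (non-zero) is aligned to ... */
	unsigned long order = MAX_ORDER - 1;
	unsigned long align = (unsigned long)__builtin_ctzl(start);

	if (align < order)
		order = align;

	/* ... shrunk until the block no longer overshoots end */
	while (start + (1UL << order) > end)
		order--;

	return order;
}

int main(void)
{
	unsigned long start = 0x12345, end = 0x20000;

	while (start < end) {
		unsigned long order = pick_order(start, end);

		printf("free pfn %#lx, order %lu (%lu pages)\n",
		       start, order, 1UL << order);
		start += 1UL << order;
	}
	return 0;
}

The trace it prints shows the freed blocks growing from single pages up to order-10 chunks as the start pfn becomes better aligned, which is the point of dropping the old BITS_PER_LONG-sized batching.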
diff --git a/mm/nommu.c b/mm/nommu.c
index 9e6cb02cba64..fec093adad9c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some 3% for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 73d812f16dde..580a5f075ed0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly;
234 234
235void set_pageblock_migratetype(struct page *page, int migratetype) 235void set_pageblock_migratetype(struct page *page, int migratetype)
236{ 236{
237 237 if (unlikely(page_group_by_mobility_disabled &&
238 if (unlikely(page_group_by_mobility_disabled)) 238 migratetype < MIGRATE_PCPTYPES))
239 migratetype = MIGRATE_UNMOVABLE; 239 migratetype = MIGRATE_UNMOVABLE;
240 240
241 set_pageblock_flags_group(page, (unsigned long)migratetype, 241 set_pageblock_flags_group(page, (unsigned long)migratetype,
@@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1027{ 1027{
1028 int current_order = page_order(page); 1028 int current_order = page_order(page);
1029 1029
1030 /*
1031 * When borrowing from MIGRATE_CMA, we need to release the excess
1032 * buddy pages to CMA itself.
1033 */
1030 if (is_migrate_cma(fallback_type)) 1034 if (is_migrate_cma(fallback_type))
1031 return fallback_type; 1035 return fallback_type;
1032 1036
@@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1091 list_del(&page->lru); 1095 list_del(&page->lru);
1092 rmv_page_order(page); 1096 rmv_page_order(page);
1093 1097
1094 /*
1095 * Borrow the excess buddy pages as well, irrespective
1096 * of whether we stole freepages, or took ownership of
1097 * the pageblock or not.
1098 *
1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1102 expand(zone, page, order, current_order, area, 1098 expand(zone, page, order, current_order, area,
1103 is_migrate_cma(migratetype) 1099 new_type);
1104 ? migratetype : start_migratetype);
1105 1100
1106 trace_mm_page_alloc_extfrag(page, order, 1101 trace_mm_page_alloc_extfrag(page, order, current_order,
1107 current_order, start_migratetype, migratetype, 1102 start_migratetype, migratetype, new_type);
1108 new_type == start_migratetype);
1109 1103
1110 return page; 1104 return page;
1111 } 1105 }
@@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1711 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1705 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1712 * that have to skip over a lot of full or unallowed zones. 1706 * that have to skip over a lot of full or unallowed zones.
1713 * 1707 *
1714 * If the zonelist cache is present in the passed in zonelist, then 1708 * If the zonelist cache is present in the passed zonelist, then
1715 * returns a pointer to the allowed node mask (either the current 1709 * returns a pointer to the allowed node mask (either the current
1716 * tasks mems_allowed, or node_states[N_MEMORY].) 1710 * tasks mems_allowed, or node_states[N_MEMORY].)
1717 * 1711 *
@@ -2593,7 +2587,7 @@ rebalance:
2593 * running out of options and have to consider going OOM 2587 * running out of options and have to consider going OOM
2594 */ 2588 */
2595 if (!did_some_progress) { 2589 if (!did_some_progress) {
2596 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2590 if (oom_gfp_allowed(gfp_mask)) {
2597 if (oom_killer_disabled) 2591 if (oom_killer_disabled)
2598 goto nopage; 2592 goto nopage;
2599 /* Coredumps can quickly deplete all memory reserves */ 2593 /* Coredumps can quickly deplete all memory reserves */
@@ -3881,8 +3875,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
3881 return ffz(~size); 3875 return ffz(~size);
3882} 3876}
3883 3877
3884#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3885
3886/* 3878/*
3887 * Check if a pageblock contains reserved pages 3879 * Check if a pageblock contains reserved pages
3888 */ 3880 */
@@ -4266,7 +4258,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
4266 */ 4258 */
4267 zone->pageset = &boot_pageset; 4259 zone->pageset = &boot_pageset;
4268 4260
4269 if (zone->present_pages) 4261 if (populated_zone(zone))
4270 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4262 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4271 zone->name, zone->present_pages, 4263 zone->name, zone->present_pages,
4272 zone_batchsize(zone)); 4264 zone_batchsize(zone));
@@ -5160,7 +5152,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
5160 5152
5161 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5153 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5162 struct zone *zone = &pgdat->node_zones[zone_type]; 5154 struct zone *zone = &pgdat->node_zones[zone_type];
5163 if (zone->present_pages) { 5155 if (populated_zone(zone)) {
5164 node_set_state(nid, N_HIGH_MEMORY); 5156 node_set_state(nid, N_HIGH_MEMORY);
5165 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5157 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5166 zone_type <= ZONE_NORMAL) 5158 zone_type <= ZONE_NORMAL)
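The page_alloc.c hunk in the slow path folds the open-coded "(gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)" test into the named helper oom_gfp_allowed(). Below is a minimal userspace sketch of that predicate, not kernel code; the flag values are stand-in assumptions chosen only for illustration.

/* Models the predicate factored out as oom_gfp_allowed() above.
 * Flag values are illustrative stand-ins, not the kernel's. */
#include <stdbool.h>
#include <stdio.h>

#define GFP_FS      (1u << 0)   /* assumption: stand-in for __GFP_FS */
#define GFP_NORETRY (1u << 1)   /* assumption: stand-in for __GFP_NORETRY */

static bool oom_gfp_allowed(unsigned int gfp_mask)
{
        /* OOM killing is only worthwhile for allocations that may do
         * filesystem work and have not asked to fail fast. */
        return (gfp_mask & GFP_FS) && !(gfp_mask & GFP_NORETRY);
}

int main(void)
{
        printf("%d\n", oom_gfp_allowed(GFP_FS));                /* 1 */
        printf("%d\n", oom_gfp_allowed(GFP_FS | GFP_NORETRY));  /* 0 */
        return 0;
}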
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed04149785..7cdbb44aa90b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping,
401 unsigned long req_size) 401 unsigned long req_size)
402{ 402{
403 unsigned long max = max_sane_readahead(ra->ra_pages); 403 unsigned long max = max_sane_readahead(ra->ra_pages);
404 pgoff_t prev_offset;
404 405
405 /* 406 /*
406 * start of file 407 * start of file
@@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping,
452 453
453 /* 454 /*
454 * sequential cache miss 455 * sequential cache miss
456 * trivial case: (offset - prev_offset) == 1
457 * unaligned reads: (offset - prev_offset) == 0
455 */ 458 */
456 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) 459 prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
460 if (offset - prev_offset <= 1UL)
457 goto initial_readahead; 461 goto initial_readahead;
458 462
459 /* 463 /*
@@ -569,7 +573,7 @@ static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp, 573do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr) 574 pgoff_t index, unsigned long nr)
571{ 575{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 576 if (!mapping || !mapping->a_ops)
573 return -EINVAL; 577 return -EINVAL;
574 578
575 force_page_cache_readahead(mapping, filp, index, nr); 579 force_page_cache_readahead(mapping, filp, index, nr);
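The ondemand_readahead() hunk computes prev_offset once and treats an access as a sequential cache miss when the page offset advances by 0 or 1 pages. The following userspace model illustrates that heuristic under an assumed 4 KiB page size; it is a sketch of the test, not the readahead code itself.

/* Sequential-miss test from the hunk above, modeled in userspace. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12  /* assumption: 4 KiB pages */

static bool is_sequential(unsigned long offset, unsigned long long prev_pos)
{
        unsigned long prev_offset = prev_pos >> PAGE_SHIFT;

        /* (offset - prev_offset) == 1: trivial sequential read
         * (offset - prev_offset) == 0: unaligned reads within one page
         * The subtraction is unsigned, so backward jumps wrap to a large
         * value and correctly fail the test. */
        return offset - prev_offset <= 1UL;
}

int main(void)
{
        printf("%d\n", is_sequential(11, 10ULL << PAGE_SHIFT)); /* 1 */
        printf("%d\n", is_sequential(10, 10ULL << PAGE_SHIFT)); /* 1 */
        printf("%d\n", is_sequential(50, 10ULL << PAGE_SHIFT)); /* 0 */
        return 0;
}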
diff --git a/mm/slab.c b/mm/slab.c
index 2580db062df9..0c8967bb2018 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3982 3982
3983 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3983 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
3984 for_each_memcg_cache_index(i) { 3984 for_each_memcg_cache_index(i) {
3985 c = cache_from_memcg(cachep, i); 3985 c = cache_from_memcg_idx(cachep, i);
3986 if (c) 3986 if (c)
3987 /* return value determined by the parent cache only */ 3987 /* return value determined by the parent cache only */
3988 __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3988 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
diff --git a/mm/slab.h b/mm/slab.h
index a535033f7e9a..0859c4241ba1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s)
160 return s->name; 160 return s->name;
161} 161}
162 162
163static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) 163static inline struct kmem_cache *
164cache_from_memcg_idx(struct kmem_cache *s, int idx)
164{ 165{
165 if (!s->memcg_params) 166 if (!s->memcg_params)
166 return NULL; 167 return NULL;
@@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s)
204 return s->name; 205 return s->name;
205} 206}
206 207
207static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) 208static inline struct kmem_cache *
209cache_from_memcg_idx(struct kmem_cache *s, int idx)
208{ 210{
209 return NULL; 211 return NULL;
210} 212}
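The rename above (cache_from_memcg to cache_from_memcg_idx) only changes the accessor's name, to make clear that it looks a child cache up by memcg index. The sketch below models that contract in userspace; the struct layout and table size are assumptions, not the slab internals.

/* Userspace model of the renamed accessor's contract. */
#include <stddef.h>
#include <stdio.h>

struct kmem_cache;

struct memcg_cache_params {
        struct kmem_cache *memcg_caches[8];     /* assumption: fixed-size table */
};

struct kmem_cache {
        const char *name;
        struct memcg_cache_params *memcg_params;
};

static struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
        if (!s->memcg_params)                   /* no per-memcg children at all */
                return NULL;
        return s->memcg_params->memcg_caches[idx];
}

int main(void)
{
        struct kmem_cache plain = { "plain", NULL };
        struct memcg_cache_params params = { { &plain } };
        struct kmem_cache parent = { "parent", &params };

        printf("%s\n", cache_from_memcg_idx(&parent, 0)->name);   /* plain */
        printf("%d\n", cache_from_memcg_idx(&plain, 0) == NULL);  /* 1 */
        return 0;
}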
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2e98af703ea..0b7bb399b0e4 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
571 return; 571 return;
572 572
573 for_each_memcg_cache_index(i) { 573 for_each_memcg_cache_index(i) {
574 c = cache_from_memcg(s, i); 574 c = cache_from_memcg_idx(s, i);
575 if (!c) 575 if (!c)
576 continue; 576 continue;
577 577
diff --git a/mm/slub.c b/mm/slub.c
index c3eb3d3ca835..92737a0b787b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4983 * through the descendants with best-effort propagation. 4983 * through the descendants with best-effort propagation.
4984 */ 4984 */
4985 for_each_memcg_cache_index(i) { 4985 for_each_memcg_cache_index(i) {
4986 struct kmem_cache *c = cache_from_memcg(s, i); 4986 struct kmem_cache *c = cache_from_memcg_idx(s, i);
4987 if (c) 4987 if (c)
4988 attribute->store(c, buf, len); 4988 attribute->store(c, buf, len);
4989 } 4989 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 4ac1d7ef548f..8cc7be0e9590 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -590,33 +590,32 @@ void __init sparse_init(void)
590 590
591#ifdef CONFIG_MEMORY_HOTPLUG 591#ifdef CONFIG_MEMORY_HOTPLUG
592#ifdef CONFIG_SPARSEMEM_VMEMMAP 592#ifdef CONFIG_SPARSEMEM_VMEMMAP
593static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, 593static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
594 unsigned long nr_pages)
595{ 594{
596 /* This will make the necessary allocations eventually. */ 595 /* This will make the necessary allocations eventually. */
597 return sparse_mem_map_populate(pnum, nid); 596 return sparse_mem_map_populate(pnum, nid);
598} 597}
599static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 598static void __kfree_section_memmap(struct page *memmap)
600{ 599{
601 unsigned long start = (unsigned long)memmap; 600 unsigned long start = (unsigned long)memmap;
602 unsigned long end = (unsigned long)(memmap + nr_pages); 601 unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
603 602
604 vmemmap_free(start, end); 603 vmemmap_free(start, end);
605} 604}
606#ifdef CONFIG_MEMORY_HOTREMOVE 605#ifdef CONFIG_MEMORY_HOTREMOVE
607static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 606static void free_map_bootmem(struct page *memmap)
608{ 607{
609 unsigned long start = (unsigned long)memmap; 608 unsigned long start = (unsigned long)memmap;
610 unsigned long end = (unsigned long)(memmap + nr_pages); 609 unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
611 610
612 vmemmap_free(start, end); 611 vmemmap_free(start, end);
613} 612}
614#endif /* CONFIG_MEMORY_HOTREMOVE */ 613#endif /* CONFIG_MEMORY_HOTREMOVE */
615#else 614#else
616static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 615static struct page *__kmalloc_section_memmap(void)
617{ 616{
618 struct page *page, *ret; 617 struct page *page, *ret;
619 unsigned long memmap_size = sizeof(struct page) * nr_pages; 618 unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
620 619
621 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); 620 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
622 if (page) 621 if (page)
@@ -634,28 +633,30 @@ got_map_ptr:
634 return ret; 633 return ret;
635} 634}
636 635
637static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, 636static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
638 unsigned long nr_pages)
639{ 637{
640 return __kmalloc_section_memmap(nr_pages); 638 return __kmalloc_section_memmap();
641} 639}
642 640
643static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 641static void __kfree_section_memmap(struct page *memmap)
644{ 642{
645 if (is_vmalloc_addr(memmap)) 643 if (is_vmalloc_addr(memmap))
646 vfree(memmap); 644 vfree(memmap);
647 else 645 else
648 free_pages((unsigned long)memmap, 646 free_pages((unsigned long)memmap,
649 get_order(sizeof(struct page) * nr_pages)); 647 get_order(sizeof(struct page) * PAGES_PER_SECTION));
650} 648}
651 649
652#ifdef CONFIG_MEMORY_HOTREMOVE 650#ifdef CONFIG_MEMORY_HOTREMOVE
653static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 651static void free_map_bootmem(struct page *memmap)
654{ 652{
655 unsigned long maps_section_nr, removing_section_nr, i; 653 unsigned long maps_section_nr, removing_section_nr, i;
656 unsigned long magic; 654 unsigned long magic, nr_pages;
657 struct page *page = virt_to_page(memmap); 655 struct page *page = virt_to_page(memmap);
658 656
657 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
658 >> PAGE_SHIFT;
659
659 for (i = 0; i < nr_pages; i++, page++) { 660 for (i = 0; i < nr_pages; i++, page++) {
660 magic = (unsigned long) page->lru.next; 661 magic = (unsigned long) page->lru.next;
661 662
@@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
684 * set. If this is <=0, then that means that the passed-in 685 * set. If this is <=0, then that means that the passed-in
685 * map was not consumed and must be freed. 686 * map was not consumed and must be freed.
686 */ 687 */
687int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 688int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
688 int nr_pages)
689{ 689{
690 unsigned long section_nr = pfn_to_section_nr(start_pfn); 690 unsigned long section_nr = pfn_to_section_nr(start_pfn);
691 struct pglist_data *pgdat = zone->zone_pgdat; 691 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
702 ret = sparse_index_init(section_nr, pgdat->node_id); 702 ret = sparse_index_init(section_nr, pgdat->node_id);
703 if (ret < 0 && ret != -EEXIST) 703 if (ret < 0 && ret != -EEXIST)
704 return ret; 704 return ret;
705 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); 705 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
706 if (!memmap) 706 if (!memmap)
707 return -ENOMEM; 707 return -ENOMEM;
708 usemap = __kmalloc_section_usemap(); 708 usemap = __kmalloc_section_usemap();
709 if (!usemap) { 709 if (!usemap) {
710 __kfree_section_memmap(memmap, nr_pages); 710 __kfree_section_memmap(memmap);
711 return -ENOMEM; 711 return -ENOMEM;
712 } 712 }
713 713
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
719 goto out; 719 goto out;
720 } 720 }
721 721
722 memset(memmap, 0, sizeof(struct page) * nr_pages); 722 memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
723 723
724 ms->section_mem_map |= SECTION_MARKED_PRESENT; 724 ms->section_mem_map |= SECTION_MARKED_PRESENT;
725 725
@@ -729,7 +729,7 @@ out:
729 pgdat_resize_unlock(pgdat, &flags); 729 pgdat_resize_unlock(pgdat, &flags);
730 if (ret <= 0) { 730 if (ret <= 0) {
731 kfree(usemap); 731 kfree(usemap);
732 __kfree_section_memmap(memmap, nr_pages); 732 __kfree_section_memmap(memmap);
733 } 733 }
734 return ret; 734 return ret;
735} 735}
@@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
759static void free_section_usemap(struct page *memmap, unsigned long *usemap) 759static void free_section_usemap(struct page *memmap, unsigned long *usemap)
760{ 760{
761 struct page *usemap_page; 761 struct page *usemap_page;
762 unsigned long nr_pages;
763 762
764 if (!usemap) 763 if (!usemap)
765 return; 764 return;
@@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
771 if (PageSlab(usemap_page) || PageCompound(usemap_page)) { 770 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
772 kfree(usemap); 771 kfree(usemap);
773 if (memmap) 772 if (memmap)
774 __kfree_section_memmap(memmap, PAGES_PER_SECTION); 773 __kfree_section_memmap(memmap);
775 return; 774 return;
776 } 775 }
777 776
@@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
780 * on the section which has pgdat at boot time. Just keep it as is now. 779 * on the section which has pgdat at boot time. Just keep it as is now.
781 */ 780 */
782 781
783 if (memmap) { 782 if (memmap)
784 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) 783 free_map_bootmem(memmap);
785 >> PAGE_SHIFT;
786
787 free_map_bootmem(memmap, nr_pages);
788 }
789} 784}
790 785
791void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 786void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
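The sparse.c changes drop the nr_pages parameters because a memory section always spans PAGES_PER_SECTION pages, so the memmap size can be derived from that constant alone. Below is a userspace sketch of that sizing; the section size, page size, and struct page stand-in are assumptions (the real values are architecture dependent).

/* Derive the per-section memmap size from PAGES_PER_SECTION only. */
#include <stdio.h>

#define SECTION_SIZE_BITS 27                     /* assumption: 128 MiB sections */
#define PAGE_SHIFT        12                     /* assumption: 4 KiB pages */
#define PAGES_PER_SECTION (1UL << (SECTION_SIZE_BITS - PAGE_SHIFT))

struct page_stub { unsigned long flags; void *lru[2]; };  /* stand-in for struct page */

int main(void)
{
        unsigned long memmap_size = sizeof(struct page_stub) * PAGES_PER_SECTION;

        printf("pages per section: %lu\n", PAGES_PER_SECTION);
        printf("memmap bytes per section (stub struct page): %lu\n", memmap_size);
        return 0;
}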
diff --git a/mm/swapfile.c b/mm/swapfile.c
index de7c904e52e5..612a7c9795f6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -707,7 +707,7 @@ noswap:
707 return (swp_entry_t) {0}; 707 return (swp_entry_t) {0};
708} 708}
709 709
710/* The only caller of this function is now susupend routine */ 710/* The only caller of this function is now suspend routine */
711swp_entry_t get_swap_page_of_type(int type) 711swp_entry_t get_swap_page_of_type(int type)
712{ 712{
713 struct swap_info_struct *si; 713 struct swap_info_struct *si;
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
845} 845}
846 846
847/* 847/*
848 * Caller has made sure that the swapdevice corresponding to entry 848 * Caller has made sure that the swap device corresponding to entry
849 * is still around or has not been recycled. 849 * is still around or has not been recycled.
850 */ 850 */
851void swap_free(swp_entry_t entry) 851void swap_free(swp_entry_t entry)
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page)
947 * original page might be freed under memory pressure, then 947 * original page might be freed under memory pressure, then
948 * later read back in from swap, now with the wrong data. 948 * later read back in from swap, now with the wrong data.
949 * 949 *
950 * Hibration suspends storage while it is writing the image 950 * Hibernation suspends storage while it is writing the image
951 * to disk so check that here. 951 * to disk so check that here.
952 */ 952 */
953 if (pm_suspended_storage()) 953 if (pm_suspended_storage())
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1179 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 1179 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1180 * of unmatched parts which look like swp_pte, so unuse_pte must 1180 * of unmatched parts which look like swp_pte, so unuse_pte must
1181 * recheck under pte lock. Scanning without pte lock lets it be 1181 * recheck under pte lock. Scanning without pte lock lets it be
1182 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 1182 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1183 */ 1183 */
1184 pte = pte_offset_map(pmd, addr); 1184 pte = pte_offset_map(pmd, addr);
1185 do { 1185 do {
@@ -1924,17 +1924,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1924 p->cluster_info = NULL; 1924 p->cluster_info = NULL;
1925 p->flags = 0; 1925 p->flags = 0;
1926 frontswap_map = frontswap_map_get(p); 1926 frontswap_map = frontswap_map_get(p);
1927 frontswap_map_set(p, NULL);
1928 spin_unlock(&p->lock); 1927 spin_unlock(&p->lock);
1929 spin_unlock(&swap_lock); 1928 spin_unlock(&swap_lock);
1930 frontswap_invalidate_area(type); 1929 frontswap_invalidate_area(type);
1930 frontswap_map_set(p, NULL);
1931 mutex_unlock(&swapon_mutex); 1931 mutex_unlock(&swapon_mutex);
1932 free_percpu(p->percpu_cluster); 1932 free_percpu(p->percpu_cluster);
1933 p->percpu_cluster = NULL; 1933 p->percpu_cluster = NULL;
1934 vfree(swap_map); 1934 vfree(swap_map);
1935 vfree(cluster_info); 1935 vfree(cluster_info);
1936 vfree(frontswap_map); 1936 vfree(frontswap_map);
1937 /* Destroy swap account informatin */ 1937 /* Destroy swap account information */
1938 swap_cgroup_swapoff(type); 1938 swap_cgroup_swapoff(type);
1939 1939
1940 inode = mapping->host; 1940 inode = mapping->host;
@@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2786 2786
2787 /* 2787 /*
2788 * We are fortunate that although vmalloc_to_page uses pte_offset_map, 2788 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2789 * no architecture is using highmem pages for kernel pagetables: so it 2789 * no architecture is using highmem pages for kernel page tables: so it
2790 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. 2790 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
2791 */ 2791 */
2792 head = vmalloc_to_page(si->swap_map + offset); 2792 head = vmalloc_to_page(si->swap_map + offset);
2793 offset &= ~PAGE_MASK; 2793 offset &= ~PAGE_MASK;
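Besides the comment typo fixes, the swapoff hunk moves frontswap_map_set(p, NULL) to after frontswap_invalidate_area(type), so the invalidation path still sees the map it is clearing. The sketch below is a simplified userspace model of that teardown ordering under assumed names; it is not the frontswap code.

/* Teardown order: invalidate while the map is still visible, then detach. */
#include <stdio.h>
#include <stdlib.h>

static unsigned long *frontswap_map;    /* assumption: one global map */

static void invalidate_area(void)
{
        /* needs the map to walk and clear per-page state */
        if (frontswap_map)
                printf("invalidating using map %p\n", (void *)frontswap_map);
}

int main(void)
{
        frontswap_map = calloc(1024, sizeof(*frontswap_map));

        invalidate_area();              /* 1: invalidate while the map is visible */
        unsigned long *map = frontswap_map;
        frontswap_map = NULL;           /* 2: only then detach the pointer */
        free(map);                      /* 3: finally free the backing memory */
        return 0;
}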
diff --git a/mm/util.c b/mm/util.c
index eaf63fc2c92f..f7bc2096071c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,9 @@
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/swap.h> 8#include <linux/swap.h>
9#include <linux/swapops.h> 9#include <linux/swapops.h>
10#include <linux/mman.h>
11#include <linux/hugetlb.h>
12
10#include <asm/uaccess.h> 13#include <asm/uaccess.h>
11 14
12#include "internal.h" 15#include "internal.h"
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page)
398 return mapping; 401 return mapping;
399} 402}
400 403
404/*
405 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
406 */
407unsigned long vm_commit_limit(void)
408{
409 return ((totalram_pages - hugetlb_total_pages())
410 * sysctl_overcommit_ratio / 100) + total_swap_pages;
411}
412
413
401/* Tracepoints definitions. */ 414/* Tracepoints definitions. */
402EXPORT_TRACEPOINT_SYMBOL(kmalloc); 415EXPORT_TRACEPOINT_SYMBOL(kmalloc);
403EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 416EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
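vm_commit_limit(), added above, centralizes the OVERCOMMIT_NEVER limit: (RAM pages minus hugetlb pages) scaled by sysctl_overcommit_ratio, plus swap pages. A worked userspace example of that arithmetic follows; the page counts and ratio are made-up numbers for illustration only.

/* Worked example of the commit-limit formula introduced above. */
#include <stdio.h>

static unsigned long vm_commit_limit(unsigned long totalram_pages,
                                     unsigned long hugetlb_pages,
                                     unsigned long overcommit_ratio,
                                     unsigned long total_swap_pages)
{
        return (totalram_pages - hugetlb_pages) * overcommit_ratio / 100
                + total_swap_pages;
}

int main(void)
{
        /* 4 GiB RAM, 512 MiB in hugetlb, ratio 50, 2 GiB swap (4 KiB pages) */
        unsigned long limit = vm_commit_limit(1048576, 131072, 50, 524288);

        printf("commit limit: %lu pages (~%lu MiB)\n", limit, limit * 4 / 1024);
        return 0;
}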
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 107454312d5e..0fdf96803c5b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
359 if (unlikely(!va)) 359 if (unlikely(!va))
360 return ERR_PTR(-ENOMEM); 360 return ERR_PTR(-ENOMEM);
361 361
362 /*
363 * Only scan the relevant parts containing pointers to other objects
364 * to avoid false negatives.
365 */
366 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
367
362retry: 368retry:
363 spin_lock(&vmap_area_lock); 369 spin_lock(&vmap_area_lock);
364 /* 370 /*
@@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1546 gfp_t gfp_mask, pgprot_t prot, 1552 gfp_t gfp_mask, pgprot_t prot,
1547 int node, const void *caller); 1553 int node, const void *caller);
1548static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1554static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1549 pgprot_t prot, int node, const void *caller) 1555 pgprot_t prot, int node)
1550{ 1556{
1551 const int order = 0; 1557 const int order = 0;
1552 struct page **pages; 1558 struct page **pages;
@@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1560 /* Please note that the recursion is strictly bounded. */ 1566 /* Please note that the recursion is strictly bounded. */
1561 if (array_size > PAGE_SIZE) { 1567 if (array_size > PAGE_SIZE) {
1562 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 1568 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1563 PAGE_KERNEL, node, caller); 1569 PAGE_KERNEL, node, area->caller);
1564 area->flags |= VM_VPAGES; 1570 area->flags |= VM_VPAGES;
1565 } else { 1571 } else {
1566 pages = kmalloc_node(array_size, nested_gfp, node); 1572 pages = kmalloc_node(array_size, nested_gfp, node);
1567 } 1573 }
1568 area->pages = pages; 1574 area->pages = pages;
1569 area->caller = caller;
1570 if (!area->pages) { 1575 if (!area->pages) {
1571 remove_vm_area(area->addr); 1576 remove_vm_area(area->addr);
1572 kfree(area); 1577 kfree(area);
@@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1577 struct page *page; 1582 struct page *page;
1578 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; 1583 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1579 1584
1580 if (node < 0) 1585 if (node == NUMA_NO_NODE)
1581 page = alloc_page(tmp_mask); 1586 page = alloc_page(tmp_mask);
1582 else 1587 else
1583 page = alloc_pages_node(node, tmp_mask, order); 1588 page = alloc_pages_node(node, tmp_mask, order);
@@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1634 if (!area) 1639 if (!area)
1635 goto fail; 1640 goto fail;
1636 1641
1637 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1642 addr = __vmalloc_area_node(area, gfp_mask, prot, node);
1638 if (!addr) 1643 if (!addr)
1639 goto fail; 1644 return NULL;
1640 1645
1641 /* 1646 /*
1642 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 1647 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1646 clear_vm_uninitialized_flag(area); 1651 clear_vm_uninitialized_flag(area);
1647 1652
1648 /* 1653 /*
1649 * A ref_count = 3 is needed because the vm_struct and vmap_area 1654 * A ref_count = 2 is needed because vm_struct allocated in
1650 * structures allocated in the __get_vm_area_node() function contain 1655 * __get_vm_area_node() contains a reference to the virtual address of
1651 * references to the virtual address of the vmalloc'ed block. 1656 * the vmalloc'ed block.
1652 */ 1657 */
1653 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1658 kmemleak_alloc(addr, real_size, 2, gfp_mask);
1654 1659
1655 return addr; 1660 return addr;
1656 1661
@@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2563 if (!counters) 2568 if (!counters)
2564 return; 2569 return;
2565 2570
2571 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2572 smp_rmb();
2573 if (v->flags & VM_UNINITIALIZED)
2574 return;
2575
2566 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2576 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2567 2577
2568 for (nr = 0; nr < v->nr_pages; nr++) 2578 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p)
2579 struct vmap_area *va = p; 2589 struct vmap_area *va = p;
2580 struct vm_struct *v; 2590 struct vm_struct *v;
2581 2591
2582 if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) 2592 /*
 2593 * s_show() can race with remove_vm_area(); !VM_VM_AREA means the
 2594 * vmap area is being torn down or backs a vm_map_ram allocation.
2595 */
2596 if (!(va->flags & VM_VM_AREA))
2583 return 0; 2597 return 0;
2584 2598
2585 if (!(va->flags & VM_VM_AREA)) {
2586 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
2587 (void *)va->va_start, (void *)va->va_end,
2588 va->va_end - va->va_start);
2589 return 0;
2590 }
2591
2592 v = va->vm; 2599 v = va->vm;
2593 2600
2594 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2595 smp_rmb();
2596 if (v->flags & VM_UNINITIALIZED)
2597 return 0;
2598
2599 seq_printf(m, "0x%pK-0x%pK %7ld", 2601 seq_printf(m, "0x%pK-0x%pK %7ld",
2600 v->addr, v->addr + v->size, v->size); 2602 v->addr, v->addr + v->size, v->size);
2601 2603
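One part of the vmalloc.c diff moves the VM_UNINITIALIZED check (and the smp_rmb() that pairs with the writer's smp_wmb()) into show_numa_info(), so readers skip areas that are still being set up. The sketch below models that publish/observe pattern with C11 atomics in userspace; it is an analogy for the barrier pairing, not the kernel's implementation.

/* Publish a structure, then let readers observe it only once complete. */
#include <stdatomic.h>
#include <stdio.h>

struct vm_area_model {
        unsigned long size;             /* payload filled in before publish */
        atomic_int uninitialized;       /* stands in for VM_UNINITIALIZED */
};

static void publish(struct vm_area_model *v, unsigned long size)
{
        v->size = size;
        /* release ordering plays the role of smp_wmb() before the clear */
        atomic_store_explicit(&v->uninitialized, 0, memory_order_release);
}

static void show(struct vm_area_model *v)
{
        /* acquire ordering plays the role of smp_rmb() after the load */
        if (atomic_load_explicit(&v->uninitialized, memory_order_acquire)) {
                printf("still initializing, skip\n");
                return;
        }
        printf("size=%lu\n", v->size);
}

int main(void)
{
        struct vm_area_model v;

        atomic_init(&v.uninitialized, 1);       /* starts unpublished */
        v.size = 0;

        show(&v);                /* skipped: not yet published */
        publish(&v, 4096);
        show(&v);                /* prints size=4096 */
        return 0;
}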
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9bb314577911..72496140ac08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -812,6 +812,7 @@ const char * const vmstat_text[] = {
812 812
813#ifdef CONFIG_NUMA_BALANCING 813#ifdef CONFIG_NUMA_BALANCING
814 "numa_pte_updates", 814 "numa_pte_updates",
815 "numa_huge_pte_updates",
815 "numa_hint_faults", 816 "numa_hint_faults",
816 "numa_hint_faults_local", 817 "numa_hint_faults_local",
817 "numa_pages_migrated", 818 "numa_pages_migrated",
@@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu)
1229 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1230 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1230} 1231}
1231 1232
1233static void vmstat_cpu_dead(int node)
1234{
1235 int cpu;
1236
1237 get_online_cpus();
1238 for_each_online_cpu(cpu)
1239 if (cpu_to_node(cpu) == node)
1240 goto end;
1241
1242 node_clear_state(node, N_CPU);
1243end:
1244 put_online_cpus();
1245}
1246
1232/* 1247/*
1233 * Use the cpu notifier to insure that the thresholds are recalculated 1248 * Use the cpu notifier to insure that the thresholds are recalculated
1234 * when necessary. 1249 * when necessary.
@@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
1258 case CPU_DEAD: 1273 case CPU_DEAD:
1259 case CPU_DEAD_FROZEN: 1274 case CPU_DEAD_FROZEN:
1260 refresh_zone_stat_thresholds(); 1275 refresh_zone_stat_thresholds();
1276 vmstat_cpu_dead(cpu_to_node(cpu));
1261 break; 1277 break;
1262 default: 1278 default:
1263 break; 1279 break;
@@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void)
1276 1292
1277 register_cpu_notifier(&vmstat_notifier); 1293 register_cpu_notifier(&vmstat_notifier);
1278 1294
1279 for_each_online_cpu(cpu) 1295 get_online_cpus();
1296 for_each_online_cpu(cpu) {
1280 start_cpu_timer(cpu); 1297 start_cpu_timer(cpu);
1298 node_set_state(cpu_to_node(cpu), N_CPU);
1299 }
1300 put_online_cpus();
1281#endif 1301#endif
1282#ifdef CONFIG_PROC_FS 1302#ifdef CONFIG_PROC_FS
1283 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); 1303 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
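The vmstat.c hunks keep the N_CPU node state in sync with CPU hotplug: setup_vmstat() marks each online CPU's node, and vmstat_cpu_dead() clears the state only when a node has lost its last online CPU. Here is a small userspace model of that bookkeeping; the CPU count and topology are assumptions for illustration.

/* Track whether a node still has any online CPU. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS  8
#define NR_NODES 2

static bool cpu_online[NR_CPUS];
static bool node_has_cpu[NR_NODES];                 /* stands in for N_CPU */
static int  cpu_to_node_map[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

static void cpu_up(int cpu)
{
        cpu_online[cpu] = true;
        node_has_cpu[cpu_to_node_map[cpu]] = true;
}

static void cpu_dead(int cpu)
{
        int node = cpu_to_node_map[cpu];

        cpu_online[cpu] = false;
        /* like vmstat_cpu_dead(): clear N_CPU only if no online CPU remains */
        for (int i = 0; i < NR_CPUS; i++)
                if (cpu_online[i] && cpu_to_node_map[i] == node)
                        return;
        node_has_cpu[node] = false;
}

int main(void)
{
        cpu_up(4);
        cpu_up(5);
        cpu_dead(4);
        printf("node 1 has cpu: %d\n", node_has_cpu[1]);  /* 1 */
        cpu_dead(5);
        printf("node 1 has cpu: %d\n", node_has_cpu[1]);  /* 0 */
        return 0;
}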
diff --git a/mm/zswap.c b/mm/zswap.c
index d93510c6aa2d..5a63f78a5601 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
217 if (!entry) 217 if (!entry)
218 return NULL; 218 return NULL;
219 entry->refcount = 1; 219 entry->refcount = 1;
220 RB_CLEAR_NODE(&entry->rbnode);
220 return entry; 221 return entry;
221} 222}
222 223
@@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
225 kmem_cache_free(zswap_entry_cache, entry); 226 kmem_cache_free(zswap_entry_cache, entry);
226} 227}
227 228
228/* caller must hold the tree lock */
229static void zswap_entry_get(struct zswap_entry *entry)
230{
231 entry->refcount++;
232}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
240
241/********************************* 229/*********************************
242* rbtree functions 230* rbtree functions
243**********************************/ 231**********************************/
@@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
285 return 0; 273 return 0;
286} 274}
287 275
276static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
277{
278 if (!RB_EMPTY_NODE(&entry->rbnode)) {
279 rb_erase(&entry->rbnode, root);
280 RB_CLEAR_NODE(&entry->rbnode);
281 }
282}
283
284/*
285 * Carries out the common pattern of freeing and entry's zsmalloc allocation,
286 * freeing the entry itself, and decrementing the number of stored pages.
287 */
288static void zswap_free_entry(struct zswap_tree *tree,
289 struct zswap_entry *entry)
290{
291 zbud_free(tree->pool, entry->handle);
292 zswap_entry_cache_free(entry);
293 atomic_dec(&zswap_stored_pages);
294 zswap_pool_pages = zbud_get_pool_size(tree->pool);
295}
296
297/* caller must hold the tree lock */
298static void zswap_entry_get(struct zswap_entry *entry)
299{
300 entry->refcount++;
301}
302
303/* caller must hold the tree lock
304 * remove from the tree and free it, if nobody references the entry
305 */
306static void zswap_entry_put(struct zswap_tree *tree,
307 struct zswap_entry *entry)
308{
309 int refcount = --entry->refcount;
310
311 BUG_ON(refcount < 0);
312 if (refcount == 0) {
313 zswap_rb_erase(&tree->rbroot, entry);
314 zswap_free_entry(tree, entry);
315 }
316}
317
318/* caller must hold the tree lock */
319static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
320 pgoff_t offset)
321{
322 struct zswap_entry *entry = NULL;
323
324 entry = zswap_rb_search(root, offset);
325 if (entry)
326 zswap_entry_get(entry);
327
328 return entry;
329}
330
288/********************************* 331/*********************************
289* per-cpu code 332* per-cpu code
290**********************************/ 333**********************************/
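The helpers added above give zswap entries a simple lifetime rule: the rbtree owns the initial reference, lookups via zswap_entry_find_get() take an extra one, and the final zswap_entry_put() both unlinks and frees the entry. The following is a minimal userspace model of that pattern (no rbtree or locking), not the zswap code.

/* Refcounted entry whose final put unlinks and frees it. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        int refcount;
        int on_tree;            /* stands in for membership in the rbtree */
};

static struct entry *entry_create(void)
{
        struct entry *e = calloc(1, sizeof(*e));

        e->refcount = 1;        /* initial reference owned by the tree */
        e->on_tree = 1;
        return e;
}

static void entry_get(struct entry *e)
{
        e->refcount++;
}

static void entry_put(struct entry *e)
{
        int refcount = --e->refcount;

        assert(refcount >= 0);
        if (refcount == 0) {    /* last reference: unlink and free */
                e->on_tree = 0;
                free(e);
        }
}

int main(void)
{
        struct entry *e = entry_create();

        entry_get(e);           /* lookup takes a reference */
        entry_put(e);           /* lookup done */
        entry_put(e);           /* invalidate drops the tree's reference, frees */
        printf("done\n");
        return 0;
}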
@@ -368,18 +411,6 @@ static bool zswap_is_full(void)
368 zswap_pool_pages); 411 zswap_pool_pages);
369} 412}
370 413
371/*
372 * Carries out the common pattern of freeing and entry's zsmalloc allocation,
373 * freeing the entry itself, and decrementing the number of stored pages.
374 */
375static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
376{
377 zbud_free(tree->pool, entry->handle);
378 zswap_entry_cache_free(entry);
379 atomic_dec(&zswap_stored_pages);
380 zswap_pool_pages = zbud_get_pool_size(tree->pool);
381}
382
383/********************************* 414/*********************************
384* writeback code 415* writeback code
385**********************************/ 416**********************************/
@@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
387enum zswap_get_swap_ret { 418enum zswap_get_swap_ret {
388 ZSWAP_SWAPCACHE_NEW, 419 ZSWAP_SWAPCACHE_NEW,
389 ZSWAP_SWAPCACHE_EXIST, 420 ZSWAP_SWAPCACHE_EXIST,
390 ZSWAP_SWAPCACHE_NOMEM 421 ZSWAP_SWAPCACHE_FAIL,
391}; 422};
392 423
393/* 424/*
@@ -401,9 +432,10 @@ enum zswap_get_swap_ret {
401 * added to the swap cache, and returned in retpage. 432 * added to the swap cache, and returned in retpage.
402 * 433 *
403 * If success, the swap cache page is returned in retpage 434 * If success, the swap cache page is returned in retpage
404 * Returns 0 if page was already in the swap cache, page is not locked 435 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
405 * Returns 1 if the new page needs to be populated, page is locked 436 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
406 * Returns <0 on error 437 * the new page is added to swapcache and locked
438 * Returns ZSWAP_SWAPCACHE_FAIL on error
407 */ 439 */
408static int zswap_get_swap_cache_page(swp_entry_t entry, 440static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage) 441 struct page **retpage)
@@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
475 if (new_page) 507 if (new_page)
476 page_cache_release(new_page); 508 page_cache_release(new_page);
477 if (!found_page) 509 if (!found_page)
478 return ZSWAP_SWAPCACHE_NOMEM; 510 return ZSWAP_SWAPCACHE_FAIL;
479 *retpage = found_page; 511 *retpage = found_page;
480 return ZSWAP_SWAPCACHE_EXIST; 512 return ZSWAP_SWAPCACHE_EXIST;
481} 513}
@@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
502 struct page *page; 534 struct page *page;
503 u8 *src, *dst; 535 u8 *src, *dst;
504 unsigned int dlen; 536 unsigned int dlen;
505 int ret, refcount; 537 int ret;
506 struct writeback_control wbc = { 538 struct writeback_control wbc = {
507 .sync_mode = WB_SYNC_NONE, 539 .sync_mode = WB_SYNC_NONE,
508 }; 540 };
@@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
517 549
518 /* find and ref zswap entry */ 550 /* find and ref zswap entry */
519 spin_lock(&tree->lock); 551 spin_lock(&tree->lock);
520 entry = zswap_rb_search(&tree->rbroot, offset); 552 entry = zswap_entry_find_get(&tree->rbroot, offset);
521 if (!entry) { 553 if (!entry) {
522 /* entry was invalidated */ 554 /* entry was invalidated */
523 spin_unlock(&tree->lock); 555 spin_unlock(&tree->lock);
524 return 0; 556 return 0;
525 } 557 }
526 zswap_entry_get(entry);
527 spin_unlock(&tree->lock); 558 spin_unlock(&tree->lock);
528 BUG_ON(offset != entry->offset); 559 BUG_ON(offset != entry->offset);
529 560
530 /* try to allocate swap cache page */ 561 /* try to allocate swap cache page */
531 switch (zswap_get_swap_cache_page(swpentry, &page)) { 562 switch (zswap_get_swap_cache_page(swpentry, &page)) {
532 case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ 563 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
533 ret = -ENOMEM; 564 ret = -ENOMEM;
534 goto fail; 565 goto fail;
535 566
536 case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ 567 case ZSWAP_SWAPCACHE_EXIST:
537 /* page is already in the swap cache, ignore for now */ 568 /* page is already in the swap cache, ignore for now */
538 page_cache_release(page); 569 page_cache_release(page);
539 ret = -EEXIST; 570 ret = -EEXIST;
@@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
556 SetPageUptodate(page); 587 SetPageUptodate(page);
557 } 588 }
558 589
590 /* move it to the tail of the inactive list after end_writeback */
591 SetPageReclaim(page);
592
559 /* start writeback */ 593 /* start writeback */
560 __swap_writepage(page, &wbc, end_swap_bio_write); 594 __swap_writepage(page, &wbc, end_swap_bio_write);
561 page_cache_release(page); 595 page_cache_release(page);
562 zswap_written_back_pages++; 596 zswap_written_back_pages++;
563 597
564 spin_lock(&tree->lock); 598 spin_lock(&tree->lock);
565
566 /* drop local reference */ 599 /* drop local reference */
567 zswap_entry_put(entry); 600 zswap_entry_put(tree, entry);
568 /* drop the initial reference from entry creation */
569 refcount = zswap_entry_put(entry);
570 601
571 /* 602 /*
572 * There are three possible values for refcount here: 603 * There are two possible situations for entry here:
573 * (1) refcount is 1, load is in progress, unlink from rbtree, 604 * (1) refcount is 1(normal case), entry is valid and on the tree
574 * load will free 605 * (2) refcount is 0, entry is freed and not on the tree
575 * (2) refcount is 0, (normal case) entry is valid, 606 * because invalidate happened during writeback
576 * remove from rbtree and free entry 607 * search the tree and free the entry if it is found
577 * (3) refcount is -1, invalidate happened during writeback, 608 */
578 * free entry 609 if (entry == zswap_rb_search(&tree->rbroot, offset))
579 */ 610 zswap_entry_put(tree, entry);
580 if (refcount >= 0) {
581 /* no invalidate yet, remove from rbtree */
582 rb_erase(&entry->rbnode, &tree->rbroot);
583 }
584 spin_unlock(&tree->lock); 611 spin_unlock(&tree->lock);
585 if (refcount <= 0) {
586 /* free the entry */
587 zswap_free_entry(tree, entry);
588 return 0;
589 }
590 return -EAGAIN;
591 612
613 goto end;
614
615 /*
616 * if we get here due to ZSWAP_SWAPCACHE_EXIST
617 * a load may be happening concurrently;
618 * it is safe and okay to not free the entry,
619 * and even if we free the entry in the following put
620 * it is still okay to return !0
621 */
592fail: 622fail:
593 spin_lock(&tree->lock); 623 spin_lock(&tree->lock);
594 zswap_entry_put(entry); 624 zswap_entry_put(tree, entry);
595 spin_unlock(&tree->lock); 625 spin_unlock(&tree->lock);
626
627end:
596 return ret; 628 return ret;
597} 629}
598 630
@@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
676 if (ret == -EEXIST) { 708 if (ret == -EEXIST) {
677 zswap_duplicate_entry++; 709 zswap_duplicate_entry++;
678 /* remove from rbtree */ 710 /* remove from rbtree */
679 rb_erase(&dupentry->rbnode, &tree->rbroot); 711 zswap_rb_erase(&tree->rbroot, dupentry);
680 if (!zswap_entry_put(dupentry)) { 712 zswap_entry_put(tree, dupentry);
681 /* free */
682 zswap_free_entry(tree, dupentry);
683 }
684 } 713 }
685 } while (ret == -EEXIST); 714 } while (ret == -EEXIST);
686 spin_unlock(&tree->lock); 715 spin_unlock(&tree->lock);
@@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
709 struct zswap_entry *entry; 738 struct zswap_entry *entry;
710 u8 *src, *dst; 739 u8 *src, *dst;
711 unsigned int dlen; 740 unsigned int dlen;
712 int refcount, ret; 741 int ret;
713 742
714 /* find */ 743 /* find */
715 spin_lock(&tree->lock); 744 spin_lock(&tree->lock);
716 entry = zswap_rb_search(&tree->rbroot, offset); 745 entry = zswap_entry_find_get(&tree->rbroot, offset);
717 if (!entry) { 746 if (!entry) {
718 /* entry was written back */ 747 /* entry was written back */
719 spin_unlock(&tree->lock); 748 spin_unlock(&tree->lock);
720 return -1; 749 return -1;
721 } 750 }
722 zswap_entry_get(entry);
723 spin_unlock(&tree->lock); 751 spin_unlock(&tree->lock);
724 752
725 /* decompress */ 753 /* decompress */
@@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
734 BUG_ON(ret); 762 BUG_ON(ret);
735 763
736 spin_lock(&tree->lock); 764 spin_lock(&tree->lock);
737 refcount = zswap_entry_put(entry); 765 zswap_entry_put(tree, entry);
738 if (likely(refcount)) {
739 spin_unlock(&tree->lock);
740 return 0;
741 }
742 spin_unlock(&tree->lock); 766 spin_unlock(&tree->lock);
743 767
744 /*
745 * We don't have to unlink from the rbtree because
746 * zswap_writeback_entry() or zswap_frontswap_invalidate page()
747 * has already done this for us if we are the last reference.
748 */
749 /* free */
750
751 zswap_free_entry(tree, entry);
752
753 return 0; 768 return 0;
754} 769}
755 770
@@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{ 773{
759 struct zswap_tree *tree = zswap_trees[type]; 774 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry; 775 struct zswap_entry *entry;
761 int refcount;
762 776
763 /* find */ 777 /* find */
764 spin_lock(&tree->lock); 778 spin_lock(&tree->lock);
@@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
770 } 784 }
771 785
772 /* remove from rbtree */ 786 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot); 787 zswap_rb_erase(&tree->rbroot, entry);
774 788
775 /* drop the initial reference from entry creation */ 789 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry); 790 zswap_entry_put(tree, entry);
777 791
778 spin_unlock(&tree->lock); 792 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787} 793}
788 794
789/* frees all zswap entries for the given swap type */ 795/* frees all zswap entries for the given swap type */
@@ -797,11 +803,8 @@ static void zswap_frontswap_invalidate_area(unsigned type)
797 803
798 /* walk the tree and free everything */ 804 /* walk the tree and free everything */
799 spin_lock(&tree->lock); 805 spin_lock(&tree->lock);
800 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { 806 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
801 zbud_free(tree->pool, entry->handle); 807 zswap_free_entry(tree, entry);
802 zswap_entry_cache_free(entry);
803 atomic_dec(&zswap_stored_pages);
804 }
805 tree->rbroot = RB_ROOT; 808 tree->rbroot = RB_ROOT;
806 spin_unlock(&tree->lock); 809 spin_unlock(&tree->lock);
807 810
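With zswap_free_entry() doing the full cleanup, zswap_frontswap_invalidate_area() above can simply walk the tree postorder and free every entry; postorder guarantees children are visited before their parent, so each node can be freed as soon as it is reached. The sketch below shows the same property on a plain binary tree in userspace, not the kernel rbtree helper.

/* Postorder teardown: free both children before freeing the parent. */
#include <stdio.h>
#include <stdlib.h>

struct node {
        long offset;
        struct node *left, *right;
};

static void free_postorder(struct node *n)
{
        if (!n)
                return;
        free_postorder(n->left);
        free_postorder(n->right);
        printf("freeing entry at offset %ld\n", n->offset);
        free(n);                /* safe: both children are already gone */
}

static struct node *make(long offset, struct node *l, struct node *r)
{
        struct node *n = malloc(sizeof(*n));

        n->offset = offset;
        n->left = l;
        n->right = r;
        return n;
}

int main(void)
{
        struct node *root = make(2, make(1, NULL, NULL), make(3, NULL, NULL));

        free_postorder(root);   /* frees 1, 3, then 2 */
        return 0;
}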