author     Linus Torvalds <torvalds@linux-foundation.org>    2015-04-15 19:39:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2015-04-15 19:39:15 -0400
commit     eea3a00264cf243a28e4331566ce67b86059339d (patch)
tree       487f16389e0dfa32e9caa7604d1274a7dcda8f04 /mm
parent     e7c82412433a8039616c7314533a0a1c025d99bf (diff)
parent     e693d73c20ffdb06840c9378f367bad849ac0d5d (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patchbomb from Andrew Morton:

 - the rest of MM
 - various misc bits
 - add ability to run /sbin/reboot at reboot time
 - printk/vsprintf changes
 - fiddle with seq_printf() return value

* akpm: (114 commits)
  parisc: remove use of seq_printf return value
  lru_cache: remove use of seq_printf return value
  tracing: remove use of seq_printf return value
  cgroup: remove use of seq_printf return value
  proc: remove use of seq_printf return value
  s390: remove use of seq_printf return value
  cris fasttimer: remove use of seq_printf return value
  cris: remove use of seq_printf return value
  openrisc: remove use of seq_printf return value
  ARM: plat-pxa: remove use of seq_printf return value
  nios2: cpuinfo: remove use of seq_printf return value
  microblaze: mb: remove use of seq_printf return value
  ipc: remove use of seq_printf return value
  rtc: remove use of seq_printf return value
  power: wakeup: remove use of seq_printf return value
  x86: mtrr: if: remove use of seq_printf return value
  linux/bitmap.h: improve BITMAP_{LAST,FIRST}_WORD_MASK
  MAINTAINERS: CREDITS: remove Stefano Brivio from B43
  .mailmap: add Ricardo Ribalda
  CREDITS: add Ricardo Ribalda Delgado
  ...
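A large share of the 114 commits apply one mechanical conversion: callers stop using the value returned by seq_printf(). As a rough illustration of the pattern (a hedged sketch, not a hunk from this merge; "demo_show" and "demo_value" are made-up names), a converted ->show() callback ends up calling seq_printf() for its side effect only and returning 0, with seq_has_overflowed() available for the rare caller that still needs to detect buffer overflow:

#include <linux/seq_file.h>

/*
 * Illustrative sketch only -- not taken from this merge.  The old form
 * was "return seq_printf(m, ...);"; the converted form ignores the
 * return value and reports success unconditionally.
 */
static int demo_show(struct seq_file *m, void *v)
{
	int demo_value = 42;	/* stand-in for whatever the file exports */

	seq_printf(m, "value: %d\n", demo_value);

	return 0;
}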
Diffstat (limited to 'mm')
-rw-r--r--   mm/cma.c              |    5
-rw-r--r--   mm/cma_debug.c        |   41
-rw-r--r--   mm/compaction.c       |   60
-rw-r--r--   mm/gup.c              |    4
-rw-r--r--   mm/huge_memory.c      |   86
-rw-r--r--   mm/hugetlb.c          |  234
-rw-r--r--   mm/internal.h         |    4
-rw-r--r--   mm/kasan/kasan.c      |   13
-rw-r--r--   mm/ksm.c              |   10
-rw-r--r--   mm/memblock.c         |   18
-rw-r--r--   mm/memcontrol.c       |   47
-rw-r--r--   mm/memory-failure.c   |  122
-rw-r--r--   mm/memory.c           |   56
-rw-r--r--   mm/memory_hotplug.c   |    2
-rw-r--r--   mm/mempool.c          |  117
-rw-r--r--   mm/migrate.c          |    3
-rw-r--r--   mm/mmap.c             |   21
-rw-r--r--   mm/mremap.c           |   25
-rw-r--r--   mm/oom_kill.c         |    2
-rw-r--r--   mm/page-writeback.c   |    3
-rw-r--r--   mm/page_alloc.c       |    6
-rw-r--r--   mm/rmap.c             |    6
-rw-r--r--   mm/slub.c             |    4
-rw-r--r--   mm/swap.c             |   34
-rw-r--r--   mm/swap_state.c       |    2
-rw-r--r--   mm/swapfile.c         |    2
-rw-r--r--   mm/truncate.c         |    2
-rw-r--r--   mm/util.c             |   41
-rw-r--r--   mm/vmalloc.c          |   95
-rw-r--r--   mm/zsmalloc.c         |  971
30 files changed, 1453 insertions, 583 deletions
diff --git a/mm/cma.c b/mm/cma.c
index 47203faaf65e..3a7a67b93394 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -23,6 +23,7 @@
23# define DEBUG 23# define DEBUG
24#endif 24#endif
25#endif 25#endif
26#define CREATE_TRACE_POINTS
26 27
27#include <linux/memblock.h> 28#include <linux/memblock.h>
28#include <linux/err.h> 29#include <linux/err.h>
@@ -34,6 +35,7 @@
34#include <linux/cma.h> 35#include <linux/cma.h>
35#include <linux/highmem.h> 36#include <linux/highmem.h>
36#include <linux/io.h> 37#include <linux/io.h>
38#include <trace/events/cma.h>
37 39
38#include "cma.h" 40#include "cma.h"
39 41
@@ -414,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
414 start = bitmap_no + mask + 1; 416 start = bitmap_no + mask + 1;
415 } 417 }
416 418
419 trace_cma_alloc(page ? pfn : -1UL, page, count, align);
420
417 pr_debug("%s(): returned %p\n", __func__, page); 421 pr_debug("%s(): returned %p\n", __func__, page);
418 return page; 422 return page;
419} 423}
@@ -446,6 +450,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
446 450
447 free_contig_range(pfn, count); 451 free_contig_range(pfn, count);
448 cma_clear_bitmap(cma, pfn, count); 452 cma_clear_bitmap(cma, pfn, count);
453 trace_cma_release(pfn, pages, count);
449 454
450 return true; 455 return true;
451} 456}
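The trace_cma_alloc()/trace_cma_release() calls above rely on the new <trace/events/cma.h> header pulled in via CREATE_TRACE_POINTS. For orientation, a tracepoint with that call signature could be declared roughly as follows; this is a sketch of the TRACE_EVENT shape, not a verbatim copy of the header added by this series, which may differ in field layout and format string:

#include <linux/tracepoint.h>
#include <linux/mm_types.h>

TRACE_EVENT(cma_alloc,

	TP_PROTO(unsigned long pfn, const struct page *page,
		 unsigned int count, unsigned int align),

	TP_ARGS(pfn, page, count, align),

	TP_STRUCT__entry(
		__field(unsigned long, pfn)
		__field(const struct page *, page)
		__field(unsigned int, count)
		__field(unsigned int, align)
	),

	TP_fast_assign(
		__entry->pfn = pfn;
		__entry->page = page;
		__entry->count = count;
		__entry->align = align;
	),

	TP_printk("pfn=%lx page=%p count=%u align=%u",
		  __entry->pfn, __entry->page,
		  __entry->count, __entry->align)
);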
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 0b377536ccde..7621ee34daa0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -30,9 +30,44 @@ static int cma_debugfs_get(void *data, u64 *val)
30 30
31 return 0; 31 return 0;
32} 32}
33
34DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); 33DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
35 34
35static int cma_used_get(void *data, u64 *val)
36{
37 struct cma *cma = data;
38 unsigned long used;
39
40 mutex_lock(&cma->lock);
41 /* pages counter is smaller than sizeof(int) */
42 used = bitmap_weight(cma->bitmap, (int)cma->count);
43 mutex_unlock(&cma->lock);
44 *val = (u64)used << cma->order_per_bit;
45
46 return 0;
47}
48DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
49
50static int cma_maxchunk_get(void *data, u64 *val)
51{
52 struct cma *cma = data;
53 unsigned long maxchunk = 0;
54 unsigned long start, end = 0;
55
56 mutex_lock(&cma->lock);
57 for (;;) {
58 start = find_next_zero_bit(cma->bitmap, cma->count, end);
59 if (start >= cma->count)
60 break;
61 end = find_next_bit(cma->bitmap, cma->count, start);
62 maxchunk = max(end - start, maxchunk);
63 }
64 mutex_unlock(&cma->lock);
65 *val = (u64)maxchunk << cma->order_per_bit;
66
67 return 0;
68}
69DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
70
36static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) 71static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
37{ 72{
38 spin_lock(&cma->mem_head_lock); 73 spin_lock(&cma->mem_head_lock);
@@ -91,7 +126,6 @@ static int cma_free_write(void *data, u64 val)
91 126
92 return cma_free_mem(cma, pages); 127 return cma_free_mem(cma, pages);
93} 128}
94
95DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); 129DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
96 130
97static int cma_alloc_mem(struct cma *cma, int count) 131static int cma_alloc_mem(struct cma *cma, int count)
@@ -124,7 +158,6 @@ static int cma_alloc_write(void *data, u64 val)
124 158
125 return cma_alloc_mem(cma, pages); 159 return cma_alloc_mem(cma, pages);
126} 160}
127
128DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); 161DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
129 162
130static void cma_debugfs_add_one(struct cma *cma, int idx) 163static void cma_debugfs_add_one(struct cma *cma, int idx)
@@ -149,6 +182,8 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
149 &cma->count, &cma_debugfs_fops); 182 &cma->count, &cma_debugfs_fops);
150 debugfs_create_file("order_per_bit", S_IRUGO, tmp, 183 debugfs_create_file("order_per_bit", S_IRUGO, tmp,
151 &cma->order_per_bit, &cma_debugfs_fops); 184 &cma->order_per_bit, &cma_debugfs_fops);
185 debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
186 debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
152 187
153 u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); 188 u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
154 debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); 189 debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
diff --git a/mm/compaction.c b/mm/compaction.c
index a18201a8124e..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc)
391 return false; 391 return false;
392} 392}
393 393
394/* Returns true if the page is within a block suitable for migration to */
395static bool suitable_migration_target(struct page *page)
396{
397 /* If the page is a large free page, then disallow migration */
398 if (PageBuddy(page)) {
399 /*
400 * We are checking page_order without zone->lock taken. But
401 * the only small danger is that we skip a potentially suitable
402 * pageblock, so it's not worth to check order for valid range.
403 */
404 if (page_order_unsafe(page) >= pageblock_order)
405 return false;
406 }
407
408 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
409 if (migrate_async_suitable(get_pageblock_migratetype(page)))
410 return true;
411
412 /* Otherwise skip the block */
413 return false;
414}
415
416/* 394/*
417 * Isolate free pages onto a private freelist. If @strict is true, will abort 395 * Isolate free pages onto a private freelist. If @strict is true, will abort
418 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock 396 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
@@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
896 874
897#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 875#endif /* CONFIG_COMPACTION || CONFIG_CMA */
898#ifdef CONFIG_COMPACTION 876#ifdef CONFIG_COMPACTION
877
878/* Returns true if the page is within a block suitable for migration to */
879static bool suitable_migration_target(struct page *page)
880{
881 /* If the page is a large free page, then disallow migration */
882 if (PageBuddy(page)) {
883 /*
884 * We are checking page_order without zone->lock taken. But
885 * the only small danger is that we skip a potentially suitable
886 * pageblock, so it's not worth to check order for valid range.
887 */
888 if (page_order_unsafe(page) >= pageblock_order)
889 return false;
890 }
891
892 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
893 if (migrate_async_suitable(get_pageblock_migratetype(page)))
894 return true;
895
896 /* Otherwise skip the block */
897 return false;
898}
899
899/* 900/*
900 * Based on information in the current compact_control, find blocks 901 * Based on information in the current compact_control, find blocks
901 * suitable for isolating free pages from and then isolate them. 902 * suitable for isolating free pages from and then isolate them.
@@ -1047,6 +1048,12 @@ typedef enum {
1047} isolate_migrate_t; 1048} isolate_migrate_t;
1048 1049
1049/* 1050/*
1051 * Allow userspace to control policy on scanning the unevictable LRU for
1052 * compactable pages.
1053 */
1054int sysctl_compact_unevictable_allowed __read_mostly = 1;
1055
1056/*
1050 * Isolate all pages that can be migrated from the first suitable block, 1057 * Isolate all pages that can be migrated from the first suitable block,
1051 * starting at the block pointed to by the migrate scanner pfn within 1058 * starting at the block pointed to by the migrate scanner pfn within
1052 * compact_control. 1059 * compact_control.
@@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1057 unsigned long low_pfn, end_pfn; 1064 unsigned long low_pfn, end_pfn;
1058 struct page *page; 1065 struct page *page;
1059 const isolate_mode_t isolate_mode = 1066 const isolate_mode_t isolate_mode =
1067 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1060 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1068 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1061 1069
1062 /* 1070 /*
@@ -1598,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1598 INIT_LIST_HEAD(&cc->freepages); 1606 INIT_LIST_HEAD(&cc->freepages);
1599 INIT_LIST_HEAD(&cc->migratepages); 1607 INIT_LIST_HEAD(&cc->migratepages);
1600 1608
1609 /*
1610 * When called via /proc/sys/vm/compact_memory
1611 * this makes sure we compact the whole zone regardless of
1612 * cached scanner positions.
1613 */
1614 if (cc->order == -1)
1615 __reset_isolation_suitable(zone);
1616
1601 if (cc->order == -1 || !compaction_deferred(zone, cc->order)) 1617 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
1602 compact_zone(zone, cc); 1618 compact_zone(zone, cc);
1603 1619
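The new sysctl_compact_unevictable_allowed knob is exposed to userspace as /proc/sys/vm/compact_unevictable_allowed; the ctl_table entry itself lives in kernel/sysctl.c and is therefore outside this mm-only diffstat. A plausible shape for that entry is sketched below; this is an assumption for illustration, and the handler and bounds in the real table may differ:

	/* Hypothetical kernel/sysctl.c entry -- not part of this diff. */
	{
		.procname	= "compact_unevictable_allowed",
		.data		= &sysctl_compact_unevictable_allowed,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* assumes the usual 0/1 bounds */
		.extra2		= &one,
	},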
diff --git a/mm/gup.c b/mm/gup.c
index ca7b607ab671..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1019,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1019 * 1019 *
1020 * for an example see gup_get_pte in arch/x86/mm/gup.c 1020 * for an example see gup_get_pte in arch/x86/mm/gup.c
1021 */ 1021 */
1022 pte_t pte = ACCESS_ONCE(*ptep); 1022 pte_t pte = READ_ONCE(*ptep);
1023 struct page *page; 1023 struct page *page;
1024 1024
1025 /* 1025 /*
@@ -1309,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1309 local_irq_save(flags); 1309 local_irq_save(flags);
1310 pgdp = pgd_offset(mm, addr); 1310 pgdp = pgd_offset(mm, addr);
1311 do { 1311 do {
1312 pgd_t pgd = ACCESS_ONCE(*pgdp); 1312 pgd_t pgd = READ_ONCE(*pgdp);
1313 1313
1314 next = pgd_addr_end(addr, end); 1314 next = pgd_addr_end(addr, end);
1315 if (pgd_none(pgd)) 1315 if (pgd_none(pgd))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3afb5cbe1312..078832cf3636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
67 67
68static int khugepaged(void *none); 68static int khugepaged(void *none);
69static int khugepaged_slab_init(void); 69static int khugepaged_slab_init(void);
70static void khugepaged_slab_exit(void);
70 71
71#define MM_SLOTS_HASH_BITS 10 72#define MM_SLOTS_HASH_BITS 10
72static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 73static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
109 int nr_zones = 0; 110 int nr_zones = 0;
110 unsigned long recommended_min; 111 unsigned long recommended_min;
111 112
112 if (!khugepaged_enabled())
113 return 0;
114
115 for_each_populated_zone(zone) 113 for_each_populated_zone(zone)
116 nr_zones++; 114 nr_zones++;
117 115
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
143 setup_per_zone_wmarks(); 141 setup_per_zone_wmarks();
144 return 0; 142 return 0;
145} 143}
146late_initcall(set_recommended_min_free_kbytes);
147 144
148static int start_khugepaged(void) 145static int start_stop_khugepaged(void)
149{ 146{
150 int err = 0; 147 int err = 0;
151 if (khugepaged_enabled()) { 148 if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
156 pr_err("khugepaged: kthread_run(khugepaged) failed\n"); 153 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
157 err = PTR_ERR(khugepaged_thread); 154 err = PTR_ERR(khugepaged_thread);
158 khugepaged_thread = NULL; 155 khugepaged_thread = NULL;
156 goto fail;
159 } 157 }
160 158
161 if (!list_empty(&khugepaged_scan.mm_head)) 159 if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
166 kthread_stop(khugepaged_thread); 164 kthread_stop(khugepaged_thread);
167 khugepaged_thread = NULL; 165 khugepaged_thread = NULL;
168 } 166 }
169 167fail:
170 return err; 168 return err;
171} 169}
172 170
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
183 struct page *zero_page; 181 struct page *zero_page;
184retry: 182retry:
185 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 183 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
186 return ACCESS_ONCE(huge_zero_page); 184 return READ_ONCE(huge_zero_page);
187 185
188 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
189 HPAGE_PMD_ORDER); 187 HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
202 /* We take additional reference here. It will be put back by shrinker */ 200 /* We take additional reference here. It will be put back by shrinker */
203 atomic_set(&huge_zero_refcount, 2); 201 atomic_set(&huge_zero_refcount, 2);
204 preempt_enable(); 202 preempt_enable();
205 return ACCESS_ONCE(huge_zero_page); 203 return READ_ONCE(huge_zero_page);
206} 204}
207 205
208static void put_huge_zero_page(void) 206static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
300 int err; 298 int err;
301 299
302 mutex_lock(&khugepaged_mutex); 300 mutex_lock(&khugepaged_mutex);
303 err = start_khugepaged(); 301 err = start_stop_khugepaged();
304 mutex_unlock(&khugepaged_mutex); 302 mutex_unlock(&khugepaged_mutex);
305 303
306 if (err) 304 if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
634 632
635 err = hugepage_init_sysfs(&hugepage_kobj); 633 err = hugepage_init_sysfs(&hugepage_kobj);
636 if (err) 634 if (err)
637 return err; 635 goto err_sysfs;
638 636
639 err = khugepaged_slab_init(); 637 err = khugepaged_slab_init();
640 if (err) 638 if (err)
641 goto out; 639 goto err_slab;
642 640
643 register_shrinker(&huge_zero_page_shrinker); 641 err = register_shrinker(&huge_zero_page_shrinker);
642 if (err)
643 goto err_hzp_shrinker;
644 644
645 /* 645 /*
646 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
647 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
648 * is likely to save. The admin can still enable it through /sys. 648 * is likely to save. The admin can still enable it through /sys.
649 */ 649 */
650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) 650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
651 transparent_hugepage_flags = 0; 651 transparent_hugepage_flags = 0;
652 return 0;
653 }
652 654
653 start_khugepaged(); 655 err = start_stop_khugepaged();
656 if (err)
657 goto err_khugepaged;
654 658
655 return 0; 659 return 0;
656out: 660err_khugepaged:
661 unregister_shrinker(&huge_zero_page_shrinker);
662err_hzp_shrinker:
663 khugepaged_slab_exit();
664err_slab:
657 hugepage_exit_sysfs(hugepage_kobj); 665 hugepage_exit_sysfs(hugepage_kobj);
666err_sysfs:
658 return err; 667 return err;
659} 668}
660subsys_initcall(hugepage_init); 669subsys_initcall(hugepage_init);
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
708static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 717static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
709 struct vm_area_struct *vma, 718 struct vm_area_struct *vma,
710 unsigned long haddr, pmd_t *pmd, 719 unsigned long haddr, pmd_t *pmd,
711 struct page *page) 720 struct page *page, gfp_t gfp)
712{ 721{
713 struct mem_cgroup *memcg; 722 struct mem_cgroup *memcg;
714 pgtable_t pgtable; 723 pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
716 725
717 VM_BUG_ON_PAGE(!PageCompound(page), page); 726 VM_BUG_ON_PAGE(!PageCompound(page), page);
718 727
719 if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) 728 if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
720 return VM_FAULT_OOM; 729 return VM_FAULT_OOM;
721 730
722 pgtable = pte_alloc_one(mm, haddr); 731 pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
822 count_vm_event(THP_FAULT_FALLBACK); 831 count_vm_event(THP_FAULT_FALLBACK);
823 return VM_FAULT_FALLBACK; 832 return VM_FAULT_FALLBACK;
824 } 833 }
825 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { 834 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
826 put_page(page); 835 put_page(page);
827 count_vm_event(THP_FAULT_FALLBACK); 836 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1080 unsigned long haddr; 1089 unsigned long haddr;
1081 unsigned long mmun_start; /* For mmu_notifiers */ 1090 unsigned long mmun_start; /* For mmu_notifiers */
1082 unsigned long mmun_end; /* For mmu_notifiers */ 1091 unsigned long mmun_end; /* For mmu_notifiers */
1092 gfp_t huge_gfp; /* for allocation and charge */
1083 1093
1084 ptl = pmd_lockptr(mm, pmd); 1094 ptl = pmd_lockptr(mm, pmd);
1085 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1095 VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1106alloc: 1116alloc:
1107 if (transparent_hugepage_enabled(vma) && 1117 if (transparent_hugepage_enabled(vma) &&
1108 !transparent_hugepage_debug_cow()) { 1118 !transparent_hugepage_debug_cow()) {
1109 gfp_t gfp; 1119 huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1110 1120 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
1111 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1112 new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1113 } else 1121 } else
1114 new_page = NULL; 1122 new_page = NULL;
1115 1123
@@ -1130,8 +1138,7 @@ alloc:
1130 goto out; 1138 goto out;
1131 } 1139 }
1132 1140
1133 if (unlikely(mem_cgroup_try_charge(new_page, mm, 1141 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
1134 GFP_TRANSHUGE, &memcg))) {
1135 put_page(new_page); 1142 put_page(new_page);
1136 if (page) { 1143 if (page) {
1137 split_huge_page(page); 1144 split_huge_page(page);
@@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
1976 return 0; 1983 return 0;
1977} 1984}
1978 1985
1986static void __init khugepaged_slab_exit(void)
1987{
1988 kmem_cache_destroy(mm_slot_cache);
1989}
1990
1979static inline struct mm_slot *alloc_mm_slot(void) 1991static inline struct mm_slot *alloc_mm_slot(void)
1980{ 1992{
1981 if (!mm_slot_cache) /* initialization failed */ 1993 if (!mm_slot_cache) /* initialization failed */
@@ -2323,19 +2335,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2323 return true; 2335 return true;
2324} 2336}
2325 2337
2326static struct page 2338static struct page *
2327*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2339khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2328 struct vm_area_struct *vma, unsigned long address, 2340 struct vm_area_struct *vma, unsigned long address,
2329 int node) 2341 int node)
2330{ 2342{
2331 gfp_t flags;
2332
2333 VM_BUG_ON_PAGE(*hpage, *hpage); 2343 VM_BUG_ON_PAGE(*hpage, *hpage);
2334 2344
2335 /* Only allocate from the target node */
2336 flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
2337 __GFP_THISNODE;
2338
2339 /* 2345 /*
2340 * Before allocating the hugepage, release the mmap_sem read lock. 2346 * Before allocating the hugepage, release the mmap_sem read lock.
2341 * The allocation can take potentially a long time if it involves 2347 * The allocation can take potentially a long time if it involves
@@ -2344,7 +2350,7 @@ static struct page
2344 */ 2350 */
2345 up_read(&mm->mmap_sem); 2351 up_read(&mm->mmap_sem);
2346 2352
2347 *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER); 2353 *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
2348 if (unlikely(!*hpage)) { 2354 if (unlikely(!*hpage)) {
2349 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2355 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2350 *hpage = ERR_PTR(-ENOMEM); 2356 *hpage = ERR_PTR(-ENOMEM);
@@ -2397,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2397 return true; 2403 return true;
2398} 2404}
2399 2405
2400static struct page 2406static struct page *
2401*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2407khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2402 struct vm_area_struct *vma, unsigned long address, 2408 struct vm_area_struct *vma, unsigned long address,
2403 int node) 2409 int node)
2404{ 2410{
2405 up_read(&mm->mmap_sem); 2411 up_read(&mm->mmap_sem);
2406 VM_BUG_ON(!*hpage); 2412 VM_BUG_ON(!*hpage);
2413
2407 return *hpage; 2414 return *hpage;
2408} 2415}
2409#endif 2416#endif
@@ -2438,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
2438 struct mem_cgroup *memcg; 2445 struct mem_cgroup *memcg;
2439 unsigned long mmun_start; /* For mmu_notifiers */ 2446 unsigned long mmun_start; /* For mmu_notifiers */
2440 unsigned long mmun_end; /* For mmu_notifiers */ 2447 unsigned long mmun_end; /* For mmu_notifiers */
2448 gfp_t gfp;
2441 2449
2442 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2450 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2443 2451
2452 /* Only allocate from the target node */
2453 gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
2454 __GFP_THISNODE;
2455
2444 /* release the mmap_sem read lock. */ 2456 /* release the mmap_sem read lock. */
2445 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2457 new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
2446 if (!new_page) 2458 if (!new_page)
2447 return; 2459 return;
2448 2460
2449 if (unlikely(mem_cgroup_try_charge(new_page, mm, 2461 if (unlikely(mem_cgroup_try_charge(new_page, mm,
2450 GFP_TRANSHUGE, &memcg))) 2462 gfp, &memcg)))
2451 return; 2463 return;
2452 2464
2453 /* 2465 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8874c8ad55aa..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
61static int num_fault_mutexes; 61static int num_fault_mutexes;
62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; 62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
63 63
64/* Forward declaration */
65static int hugetlb_acct_memory(struct hstate *h, long delta);
66
64static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 67static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
65{ 68{
66 bool free = (spool->count == 0) && (spool->used_hpages == 0); 69 bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
68 spin_unlock(&spool->lock); 71 spin_unlock(&spool->lock);
69 72
70 /* If no pages are used, and no other handles to the subpool 73 /* If no pages are used, and no other handles to the subpool
71 remain, free the subpool the subpool remain */ 74 remain, give up any reservations based on minimum size and
72 if (free) 75 * free the subpool */
76 if (free) {
77 if (spool->min_hpages != -1)
78 hugetlb_acct_memory(spool->hstate,
79 -spool->min_hpages);
73 kfree(spool); 80 kfree(spool);
81 }
74} 82}
75 83
76struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) 84struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
85 long min_hpages)
77{ 86{
78 struct hugepage_subpool *spool; 87 struct hugepage_subpool *spool;
79 88
80 spool = kmalloc(sizeof(*spool), GFP_KERNEL); 89 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
81 if (!spool) 90 if (!spool)
82 return NULL; 91 return NULL;
83 92
84 spin_lock_init(&spool->lock); 93 spin_lock_init(&spool->lock);
85 spool->count = 1; 94 spool->count = 1;
86 spool->max_hpages = nr_blocks; 95 spool->max_hpages = max_hpages;
87 spool->used_hpages = 0; 96 spool->hstate = h;
97 spool->min_hpages = min_hpages;
98
99 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
100 kfree(spool);
101 return NULL;
102 }
103 spool->rsv_hpages = min_hpages;
88 104
89 return spool; 105 return spool;
90} 106}
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
97 unlock_or_release_subpool(spool); 113 unlock_or_release_subpool(spool);
98} 114}
99 115
100static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, 116/*
117 * Subpool accounting for allocating and reserving pages.
118 * Return -ENOMEM if there are not enough resources to satisfy the
119 * request. Otherwise, return the number of pages by which the
120 * global pools must be adjusted (upward). The returned value may
121 * only be different than the passed value (delta) in the case where
122 * a subpool minimum size must be maintained.
123 */
124static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
101 long delta) 125 long delta)
102{ 126{
103 int ret = 0; 127 long ret = delta;
104 128
105 if (!spool) 129 if (!spool)
106 return 0; 130 return ret;
107 131
108 spin_lock(&spool->lock); 132 spin_lock(&spool->lock);
109 if ((spool->used_hpages + delta) <= spool->max_hpages) { 133
110 spool->used_hpages += delta; 134 if (spool->max_hpages != -1) { /* maximum size accounting */
111 } else { 135 if ((spool->used_hpages + delta) <= spool->max_hpages)
112 ret = -ENOMEM; 136 spool->used_hpages += delta;
137 else {
138 ret = -ENOMEM;
139 goto unlock_ret;
140 }
141 }
142
143 if (spool->min_hpages != -1) { /* minimum size accounting */
144 if (delta > spool->rsv_hpages) {
145 /*
146 * Asking for more reserves than those already taken on
147 * behalf of subpool. Return difference.
148 */
149 ret = delta - spool->rsv_hpages;
150 spool->rsv_hpages = 0;
151 } else {
152 ret = 0; /* reserves already accounted for */
153 spool->rsv_hpages -= delta;
154 }
113 } 155 }
114 spin_unlock(&spool->lock);
115 156
157unlock_ret:
158 spin_unlock(&spool->lock);
116 return ret; 159 return ret;
117} 160}
118 161
119static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, 162/*
163 * Subpool accounting for freeing and unreserving pages.
164 * Return the number of global page reservations that must be dropped.
165 * The return value may only be different than the passed value (delta)
166 * in the case where a subpool minimum size must be maintained.
167 */
168static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
120 long delta) 169 long delta)
121{ 170{
171 long ret = delta;
172
122 if (!spool) 173 if (!spool)
123 return; 174 return delta;
124 175
125 spin_lock(&spool->lock); 176 spin_lock(&spool->lock);
126 spool->used_hpages -= delta; 177
127 /* If hugetlbfs_put_super couldn't free spool due to 178 if (spool->max_hpages != -1) /* maximum size accounting */
128 * an outstanding quota reference, free it now. */ 179 spool->used_hpages -= delta;
180
181 if (spool->min_hpages != -1) { /* minimum size accounting */
182 if (spool->rsv_hpages + delta <= spool->min_hpages)
183 ret = 0;
184 else
185 ret = spool->rsv_hpages + delta - spool->min_hpages;
186
187 spool->rsv_hpages += delta;
188 if (spool->rsv_hpages > spool->min_hpages)
189 spool->rsv_hpages = spool->min_hpages;
190 }
191
192 /*
193 * If hugetlbfs_put_super couldn't free spool due to an outstanding
194 * quota reference, free it now.
195 */
129 unlock_or_release_subpool(spool); 196 unlock_or_release_subpool(spool);
197
198 return ret;
130} 199}
131 200
132static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 201static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
855 return NULL; 924 return NULL;
856} 925}
857 926
927/*
928 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
929 * to hstate->hugepage_activelist.)
930 *
931 * This function can be called for tail pages, but never returns true for them.
932 */
933bool page_huge_active(struct page *page)
934{
935 VM_BUG_ON_PAGE(!PageHuge(page), page);
936 return PageHead(page) && PagePrivate(&page[1]);
937}
938
939/* never called for tail page */
940static void set_page_huge_active(struct page *page)
941{
942 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
943 SetPagePrivate(&page[1]);
944}
945
946static void clear_page_huge_active(struct page *page)
947{
948 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
949 ClearPagePrivate(&page[1]);
950}
951
858void free_huge_page(struct page *page) 952void free_huge_page(struct page *page)
859{ 953{
860 /* 954 /*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
874 restore_reserve = PagePrivate(page); 968 restore_reserve = PagePrivate(page);
875 ClearPagePrivate(page); 969 ClearPagePrivate(page);
876 970
971 /*
972 * A return code of zero implies that the subpool will be under its
973 * minimum size if the reservation is not restored after page is free.
974 * Therefore, force restore_reserve operation.
975 */
976 if (hugepage_subpool_put_pages(spool, 1) == 0)
977 restore_reserve = true;
978
877 spin_lock(&hugetlb_lock); 979 spin_lock(&hugetlb_lock);
980 clear_page_huge_active(page);
878 hugetlb_cgroup_uncharge_page(hstate_index(h), 981 hugetlb_cgroup_uncharge_page(hstate_index(h),
879 pages_per_huge_page(h), page); 982 pages_per_huge_page(h), page);
880 if (restore_reserve) 983 if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
891 enqueue_huge_page(h, page); 994 enqueue_huge_page(h, page);
892 } 995 }
893 spin_unlock(&hugetlb_lock); 996 spin_unlock(&hugetlb_lock);
894 hugepage_subpool_put_pages(spool, 1);
895} 997}
896 998
897static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 999static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1386 if (chg < 0) 1488 if (chg < 0)
1387 return ERR_PTR(-ENOMEM); 1489 return ERR_PTR(-ENOMEM);
1388 if (chg || avoid_reserve) 1490 if (chg || avoid_reserve)
1389 if (hugepage_subpool_get_pages(spool, 1)) 1491 if (hugepage_subpool_get_pages(spool, 1) < 0)
1390 return ERR_PTR(-ENOSPC); 1492 return ERR_PTR(-ENOSPC);
1391 1493
1392 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1494 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2454 struct resv_map *resv = vma_resv_map(vma); 2556 struct resv_map *resv = vma_resv_map(vma);
2455 struct hugepage_subpool *spool = subpool_vma(vma); 2557 struct hugepage_subpool *spool = subpool_vma(vma);
2456 unsigned long reserve, start, end; 2558 unsigned long reserve, start, end;
2559 long gbl_reserve;
2457 2560
2458 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2561 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2459 return; 2562 return;
@@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2466 kref_put(&resv->refs, resv_map_release); 2569 kref_put(&resv->refs, resv_map_release);
2467 2570
2468 if (reserve) { 2571 if (reserve) {
2469 hugetlb_acct_memory(h, -reserve); 2572 /*
2470 hugepage_subpool_put_pages(spool, reserve); 2573 * Decrement reserve counts. The global reserve count may be
2574 * adjusted if the subpool has a minimum size.
2575 */
2576 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
2577 hugetlb_acct_memory(h, -gbl_reserve);
2471 } 2578 }
2472} 2579}
2473 2580
@@ -2891,6 +2998,7 @@ retry_avoidcopy:
2891 copy_user_huge_page(new_page, old_page, address, vma, 2998 copy_user_huge_page(new_page, old_page, address, vma,
2892 pages_per_huge_page(h)); 2999 pages_per_huge_page(h));
2893 __SetPageUptodate(new_page); 3000 __SetPageUptodate(new_page);
3001 set_page_huge_active(new_page);
2894 3002
2895 mmun_start = address & huge_page_mask(h); 3003 mmun_start = address & huge_page_mask(h);
2896 mmun_end = mmun_start + huge_page_size(h); 3004 mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3111,7 @@ retry:
3003 } 3111 }
3004 clear_huge_page(page, address, pages_per_huge_page(h)); 3112 clear_huge_page(page, address, pages_per_huge_page(h));
3005 __SetPageUptodate(page); 3113 __SetPageUptodate(page);
3114 set_page_huge_active(page);
3006 3115
3007 if (vma->vm_flags & VM_MAYSHARE) { 3116 if (vma->vm_flags & VM_MAYSHARE) {
3008 int err; 3117 int err;
@@ -3447,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3447 struct hstate *h = hstate_inode(inode); 3556 struct hstate *h = hstate_inode(inode);
3448 struct hugepage_subpool *spool = subpool_inode(inode); 3557 struct hugepage_subpool *spool = subpool_inode(inode);
3449 struct resv_map *resv_map; 3558 struct resv_map *resv_map;
3559 long gbl_reserve;
3450 3560
3451 /* 3561 /*
3452 * Only apply hugepage reservation if asked. At fault time, an 3562 * Only apply hugepage reservation if asked. At fault time, an
@@ -3483,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
3483 goto out_err; 3593 goto out_err;
3484 } 3594 }
3485 3595
3486 /* There must be enough pages in the subpool for the mapping */ 3596 /*
3487 if (hugepage_subpool_get_pages(spool, chg)) { 3597 * There must be enough pages in the subpool for the mapping. If
3598 * the subpool has a minimum size, there may be some global
3599 * reservations already in place (gbl_reserve).
3600 */
3601 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
3602 if (gbl_reserve < 0) {
3488 ret = -ENOSPC; 3603 ret = -ENOSPC;
3489 goto out_err; 3604 goto out_err;
3490 } 3605 }
@@ -3493,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
3493 * Check enough hugepages are available for the reservation. 3608 * Check enough hugepages are available for the reservation.
3494 * Hand the pages back to the subpool if there are not 3609 * Hand the pages back to the subpool if there are not
3495 */ 3610 */
3496 ret = hugetlb_acct_memory(h, chg); 3611 ret = hugetlb_acct_memory(h, gbl_reserve);
3497 if (ret < 0) { 3612 if (ret < 0) {
3498 hugepage_subpool_put_pages(spool, chg); 3613 /* put back original number of pages, chg */
3614 (void)hugepage_subpool_put_pages(spool, chg);
3499 goto out_err; 3615 goto out_err;
3500 } 3616 }
3501 3617
@@ -3525,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3525 struct resv_map *resv_map = inode_resv_map(inode); 3641 struct resv_map *resv_map = inode_resv_map(inode);
3526 long chg = 0; 3642 long chg = 0;
3527 struct hugepage_subpool *spool = subpool_inode(inode); 3643 struct hugepage_subpool *spool = subpool_inode(inode);
3644 long gbl_reserve;
3528 3645
3529 if (resv_map) 3646 if (resv_map)
3530 chg = region_truncate(resv_map, offset); 3647 chg = region_truncate(resv_map, offset);
@@ -3532,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3532 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3649 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3533 spin_unlock(&inode->i_lock); 3650 spin_unlock(&inode->i_lock);
3534 3651
3535 hugepage_subpool_put_pages(spool, (chg - freed)); 3652 /*
3536 hugetlb_acct_memory(h, -(chg - freed)); 3653 * If the subpool has a minimum size, the number of global
3654 * reservations to be released may be adjusted.
3655 */
3656 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
3657 hugetlb_acct_memory(h, -gbl_reserve);
3537} 3658}
3538 3659
3539#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 3660#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3775,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
3775 3896
3776#ifdef CONFIG_MEMORY_FAILURE 3897#ifdef CONFIG_MEMORY_FAILURE
3777 3898
3778/* Should be called in hugetlb_lock */
3779static int is_hugepage_on_freelist(struct page *hpage)
3780{
3781 struct page *page;
3782 struct page *tmp;
3783 struct hstate *h = page_hstate(hpage);
3784 int nid = page_to_nid(hpage);
3785
3786 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3787 if (page == hpage)
3788 return 1;
3789 return 0;
3790}
3791
3792/* 3899/*
3793 * This function is called from memory failure code. 3900 * This function is called from memory failure code.
3794 * Assume the caller holds page lock of the head page. 3901 * Assume the caller holds page lock of the head page.
@@ -3800,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3800 int ret = -EBUSY; 3907 int ret = -EBUSY;
3801 3908
3802 spin_lock(&hugetlb_lock); 3909 spin_lock(&hugetlb_lock);
3803 if (is_hugepage_on_freelist(hpage)) { 3910 /*
3911 * Just checking !page_huge_active is not enough, because that could be
3912 * an isolated/hwpoisoned hugepage (which have >0 refcount).
3913 */
3914 if (!page_huge_active(hpage) && !page_count(hpage)) {
3804 /* 3915 /*
3805 * Hwpoisoned hugepage isn't linked to activelist or freelist, 3916 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3806 * but dangling hpage->lru can trigger list-debug warnings 3917 * but dangling hpage->lru can trigger list-debug warnings
@@ -3820,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3820 3931
3821bool isolate_huge_page(struct page *page, struct list_head *list) 3932bool isolate_huge_page(struct page *page, struct list_head *list)
3822{ 3933{
3934 bool ret = true;
3935
3823 VM_BUG_ON_PAGE(!PageHead(page), page); 3936 VM_BUG_ON_PAGE(!PageHead(page), page);
3824 if (!get_page_unless_zero(page))
3825 return false;
3826 spin_lock(&hugetlb_lock); 3937 spin_lock(&hugetlb_lock);
3938 if (!page_huge_active(page) || !get_page_unless_zero(page)) {
3939 ret = false;
3940 goto unlock;
3941 }
3942 clear_page_huge_active(page);
3827 list_move_tail(&page->lru, list); 3943 list_move_tail(&page->lru, list);
3944unlock:
3828 spin_unlock(&hugetlb_lock); 3945 spin_unlock(&hugetlb_lock);
3829 return true; 3946 return ret;
3830} 3947}
3831 3948
3832void putback_active_hugepage(struct page *page) 3949void putback_active_hugepage(struct page *page)
3833{ 3950{
3834 VM_BUG_ON_PAGE(!PageHead(page), page); 3951 VM_BUG_ON_PAGE(!PageHead(page), page);
3835 spin_lock(&hugetlb_lock); 3952 spin_lock(&hugetlb_lock);
3953 set_page_huge_active(page);
3836 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 3954 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3837 spin_unlock(&hugetlb_lock); 3955 spin_unlock(&hugetlb_lock);
3838 put_page(page); 3956 put_page(page);
3839} 3957}
3840
3841bool is_hugepage_active(struct page *page)
3842{
3843 VM_BUG_ON_PAGE(!PageHuge(page), page);
3844 /*
3845 * This function can be called for a tail page because the caller,
3846 * scan_movable_pages, scans through a given pfn-range which typically
3847 * covers one memory block. In systems using gigantic hugepage (1GB
3848 * for x86_64,) a hugepage is larger than a memory block, and we don't
3849 * support migrating such large hugepages for now, so return false
3850 * when called for tail pages.
3851 */
3852 if (PageTail(page))
3853 return false;
3854 /*
3855 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3856 * so we should return false for them.
3857 */
3858 if (unlikely(PageHWPoison(page)))
3859 return false;
3860 return page_count(page) > 0;
3861}
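To make the new subpool return-value convention concrete, here is a worked example of the minimum-size accounting above, with made-up numbers (a subpool created with max_hpages == -1 and min_hpages == 10, so rsv_hpages starts at 10 and ten huge pages are reserved globally via hugetlb_acct_memory() at creation):

/*
 * Worked example -- illustrative numbers only, not from the patch.
 *
 *   hugepage_subpool_get_pages(spool, 3)  returns 0
 *       the request is covered by the subpool's own reserve;
 *       rsv_hpages drops to 7 and no extra global pages are needed
 *
 *   hugepage_subpool_get_pages(spool, 12) returns 5
 *       7 pages come from the remaining reserve (rsv_hpages -> 0),
 *       so the caller must account only 5 more pages globally
 *
 *   hugepage_subpool_put_pages(spool, 12) returns 2
 *       10 of the returned pages refill the reserve back up to
 *       min_hpages, so only the remaining 2 global reservations
 *       are dropped by the caller
 */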
diff --git a/mm/internal.h b/mm/internal.h
index edaab69a9c35..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -224,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
224 * PageBuddy() should be checked first by the caller to minimize race window, 224 * PageBuddy() should be checked first by the caller to minimize race window,
225 * and invalid values must be handled gracefully. 225 * and invalid values must be handled gracefully.
226 * 226 *
227 * ACCESS_ONCE is used so that if the caller assigns the result into a local 227 * READ_ONCE is used so that if the caller assigns the result into a local
228 * variable and e.g. tests it for valid range before using, the compiler cannot 228 * variable and e.g. tests it for valid range before using, the compiler cannot
229 * decide to remove the variable and inline the page_private(page) multiple 229 * decide to remove the variable and inline the page_private(page) multiple
230 * times, potentially observing different values in the tests and the actual 230 * times, potentially observing different values in the tests and the actual
231 * use of the result. 231 * use of the result.
232 */ 232 */
233#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) 233#define page_order_unsafe(page) READ_ONCE(page_private(page))
234 234
235static inline bool is_cow_mapping(vm_flags_t flags) 235static inline bool is_cow_mapping(vm_flags_t flags)
236{ 236{
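The comment above is why the ACCESS_ONCE() to READ_ONCE() conversions in this merge are not purely cosmetic: the macro forces a single load of page_private(page). A minimal sketch of the hazard it prevents follows; the caller and use_order() are hypothetical, added only to illustrate the comment:

/*
 * Hypothetical caller, for illustration only.  With a plain
 * page_private(page) read the compiler may legally reload the value
 * after the range check; READ_ONCE() pins it to one load, so the
 * value that is used is the value that was checked.
 */
static void demo_use_page_order(struct page *page)
{
	/* assumes PageBuddy(page) was just checked by the caller */
	unsigned long order = page_order_unsafe(page);	/* one READ_ONCE load */

	if (order < MAX_ORDER)
		use_order(order);	/* hypothetical consumer */
}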
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 936d81661c47..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
389 kasan_kmalloc(page->slab_cache, object, size); 389 kasan_kmalloc(page->slab_cache, object, size);
390} 390}
391 391
392void kasan_kfree(void *ptr)
393{
394 struct page *page;
395
396 page = virt_to_head_page(ptr);
397
398 if (unlikely(!PageSlab(page)))
399 kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
400 KASAN_FREE_PAGE);
401 else
402 kasan_slab_free(page->slab_cache, ptr);
403}
404
392void kasan_kfree_large(const void *ptr) 405void kasan_kfree_large(const void *ptr)
393{ 406{
394 struct page *page = virt_to_page(ptr); 407 struct page *page = virt_to_page(ptr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce2eb44..7ee101eaacdf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
542 expected_mapping = (void *)stable_node + 542 expected_mapping = (void *)stable_node +
543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
544again: 544again:
545 kpfn = ACCESS_ONCE(stable_node->kpfn); 545 kpfn = READ_ONCE(stable_node->kpfn);
546 page = pfn_to_page(kpfn); 546 page = pfn_to_page(kpfn);
547 547
548 /* 548 /*
@@ -551,7 +551,7 @@ again:
551 * but on Alpha we need to be more careful. 551 * but on Alpha we need to be more careful.
552 */ 552 */
553 smp_read_barrier_depends(); 553 smp_read_barrier_depends();
554 if (ACCESS_ONCE(page->mapping) != expected_mapping) 554 if (READ_ONCE(page->mapping) != expected_mapping)
555 goto stale; 555 goto stale;
556 556
557 /* 557 /*
@@ -577,14 +577,14 @@ again:
577 cpu_relax(); 577 cpu_relax();
578 } 578 }
579 579
580 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 580 if (READ_ONCE(page->mapping) != expected_mapping) {
581 put_page(page); 581 put_page(page);
582 goto stale; 582 goto stale;
583 } 583 }
584 584
585 if (lock_it) { 585 if (lock_it) {
586 lock_page(page); 586 lock_page(page);
587 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 587 if (READ_ONCE(page->mapping) != expected_mapping) {
588 unlock_page(page); 588 unlock_page(page);
589 put_page(page); 589 put_page(page);
590 goto stale; 590 goto stale;
@@ -600,7 +600,7 @@ stale:
600 * before checking whether node->kpfn has been changed. 600 * before checking whether node->kpfn has been changed.
601 */ 601 */
602 smp_rmb(); 602 smp_rmb();
603 if (ACCESS_ONCE(stable_node->kpfn) != kpfn) 603 if (READ_ONCE(stable_node->kpfn) != kpfn)
604 goto again; 604 goto again;
605 remove_node_from_stable_tree(stable_node); 605 remove_node_from_stable_tree(stable_node);
606 return NULL; 606 return NULL;
diff --git a/mm/memblock.c b/mm/memblock.c
index 3f37a0bca5d5..9318b567ed79 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
580 return memblock_add_range(&memblock.memory, base, size, nid, 0); 580 return memblock_add_range(&memblock.memory, base, size, nid, 0);
581} 581}
582 582
583static int __init_memblock memblock_add_region(phys_addr_t base,
584 phys_addr_t size,
585 int nid,
586 unsigned long flags)
587{
588 struct memblock_type *_rgn = &memblock.memory;
589
590 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
591 (unsigned long long)base,
592 (unsigned long long)base + size - 1,
593 flags, (void *)_RET_IP_);
594
595 return memblock_add_range(_rgn, base, size, nid, flags);
596}
597
583int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 598int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
584{ 599{
585 return memblock_add_range(&memblock.memory, base, size, 600 return memblock_add_region(base, size, MAX_NUMNODES, 0);
586 MAX_NUMNODES, 0);
587} 601}
588 602
589/** 603/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c3f09b2dda5f..14c2f2017e37 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -259,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
259 * page cache and RSS per cgroup. We would eventually like to provide 259 * page cache and RSS per cgroup. We would eventually like to provide
260 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 260 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
261 * to help the administrator determine what knobs to tune. 261 * to help the administrator determine what knobs to tune.
262 *
263 * TODO: Add a water mark for the memory controller. Reclaim will begin when
264 * we hit the water mark. May be even add a low water mark, such that
265 * no reclaim occurs from a cgroup at it's low water mark, this is
266 * a feature that will be implemented much later in the future.
267 */ 262 */
268struct mem_cgroup { 263struct mem_cgroup {
269 struct cgroup_subsys_state css; 264 struct cgroup_subsys_state css;
@@ -460,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
460 return memcg->css.id; 455 return memcg->css.id;
461} 456}
462 457
458/*
459 * A helper function to get mem_cgroup from ID. must be called under
460 * rcu_read_lock(). The caller is responsible for calling
461 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
462 * refcnt from swap can be called against removed memcg.)
463 */
463static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 464static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
464{ 465{
465 struct cgroup_subsys_state *css; 466 struct cgroup_subsys_state *css;
@@ -673,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
673static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 674static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
674{ 675{
675 unsigned long nr_pages = page_counter_read(&memcg->memory); 676 unsigned long nr_pages = page_counter_read(&memcg->memory);
676 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 677 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
677 unsigned long excess = 0; 678 unsigned long excess = 0;
678 679
679 if (nr_pages > soft_limit) 680 if (nr_pages > soft_limit)
@@ -1041,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1041 goto out_unlock; 1042 goto out_unlock;
1042 1043
1043 do { 1044 do {
1044 pos = ACCESS_ONCE(iter->position); 1045 pos = READ_ONCE(iter->position);
1045 /* 1046 /*
1046 * A racing update may change the position and 1047 * A racing update may change the position and
1047 * put the last reference, hence css_tryget(), 1048 * put the last reference, hence css_tryget(),
@@ -1358,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1358 unsigned long limit; 1359 unsigned long limit;
1359 1360
1360 count = page_counter_read(&memcg->memory); 1361 count = page_counter_read(&memcg->memory);
1361 limit = ACCESS_ONCE(memcg->memory.limit); 1362 limit = READ_ONCE(memcg->memory.limit);
1362 if (count < limit) 1363 if (count < limit)
1363 margin = limit - count; 1364 margin = limit - count;
1364 1365
1365 if (do_swap_account) { 1366 if (do_swap_account) {
1366 count = page_counter_read(&memcg->memsw); 1367 count = page_counter_read(&memcg->memsw);
1367 limit = ACCESS_ONCE(memcg->memsw.limit); 1368 limit = READ_ONCE(memcg->memsw.limit);
1368 if (count <= limit) 1369 if (count <= limit)
1369 margin = min(margin, limit - count); 1370 margin = min(margin, limit - count);
1370 } 1371 }
@@ -2349,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2349} 2350}
2350 2351
2351/* 2352/*
2352 * A helper function to get mem_cgroup from ID. must be called under
2353 * rcu_read_lock(). The caller is responsible for calling
2354 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
2355 * refcnt from swap can be called against removed memcg.)
2356 */
2357static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2358{
2359 /* ID 0 is unused ID */
2360 if (!id)
2361 return NULL;
2362 return mem_cgroup_from_id(id);
2363}
2364
2365/*
2366 * try_get_mem_cgroup_from_page - look up page's memcg association 2353 * try_get_mem_cgroup_from_page - look up page's memcg association
2367 * @page: the page 2354 * @page: the page
2368 * 2355 *
@@ -2388,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2388 ent.val = page_private(page); 2375 ent.val = page_private(page);
2389 id = lookup_swap_cgroup_id(ent); 2376 id = lookup_swap_cgroup_id(ent);
2390 rcu_read_lock(); 2377 rcu_read_lock();
2391 memcg = mem_cgroup_lookup(id); 2378 memcg = mem_cgroup_from_id(id);
2392 if (memcg && !css_tryget_online(&memcg->css)) 2379 if (memcg && !css_tryget_online(&memcg->css))
2393 memcg = NULL; 2380 memcg = NULL;
2394 rcu_read_unlock(); 2381 rcu_read_unlock();
@@ -2650,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2650 return cachep; 2637 return cachep;
2651 2638
2652 memcg = get_mem_cgroup_from_mm(current->mm); 2639 memcg = get_mem_cgroup_from_mm(current->mm);
2653 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); 2640 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2654 if (kmemcg_id < 0) 2641 if (kmemcg_id < 0)
2655 goto out; 2642 goto out;
2656 2643
@@ -5020,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5020 * tunable will only affect upcoming migrations, not the current one. 5007 * tunable will only affect upcoming migrations, not the current one.
5021 * So we need to save it, and keep it going. 5008 * So we need to save it, and keep it going.
5022 */ 5009 */
5023 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); 5010 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5024 if (move_flags) { 5011 if (move_flags) {
5025 struct mm_struct *mm; 5012 struct mm_struct *mm;
5026 struct mem_cgroup *from = mem_cgroup_from_task(p); 5013 struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5254,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
5254static int memory_low_show(struct seq_file *m, void *v) 5241static int memory_low_show(struct seq_file *m, void *v)
5255{ 5242{
5256 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5243 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5257 unsigned long low = ACCESS_ONCE(memcg->low); 5244 unsigned long low = READ_ONCE(memcg->low);
5258 5245
5259 if (low == PAGE_COUNTER_MAX) 5246 if (low == PAGE_COUNTER_MAX)
5260 seq_puts(m, "max\n"); 5247 seq_puts(m, "max\n");
@@ -5284,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
5284static int memory_high_show(struct seq_file *m, void *v) 5271static int memory_high_show(struct seq_file *m, void *v)
5285{ 5272{
5286 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5273 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5287 unsigned long high = ACCESS_ONCE(memcg->high); 5274 unsigned long high = READ_ONCE(memcg->high);
5288 5275
5289 if (high == PAGE_COUNTER_MAX) 5276 if (high == PAGE_COUNTER_MAX)
5290 seq_puts(m, "max\n"); 5277 seq_puts(m, "max\n");
@@ -5314,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
5314static int memory_max_show(struct seq_file *m, void *v) 5301static int memory_max_show(struct seq_file *m, void *v)
5315{ 5302{
5316 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5303 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5317 unsigned long max = ACCESS_ONCE(memcg->memory.limit); 5304 unsigned long max = READ_ONCE(memcg->memory.limit);
5318 5305
5319 if (max == PAGE_COUNTER_MAX) 5306 if (max == PAGE_COUNTER_MAX)
5320 seq_puts(m, "max\n"); 5307 seq_puts(m, "max\n");
@@ -5869,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
5869 5856
5870 id = swap_cgroup_record(entry, 0); 5857 id = swap_cgroup_record(entry, 0);
5871 rcu_read_lock(); 5858 rcu_read_lock();
5872 memcg = mem_cgroup_lookup(id); 5859 memcg = mem_cgroup_from_id(id);
5873 if (memcg) { 5860 if (memcg) {
5874 if (!mem_cgroup_is_root(memcg)) 5861 if (!mem_cgroup_is_root(memcg))
5875 page_counter_uncharge(&memcg->memsw, 1); 5862 page_counter_uncharge(&memcg->memsw, 1);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..d9359b770cd9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -521,6 +521,52 @@ static const char *action_name[] = {
521 [RECOVERED] = "Recovered", 521 [RECOVERED] = "Recovered",
522}; 522};
523 523
524enum action_page_type {
525 MSG_KERNEL,
526 MSG_KERNEL_HIGH_ORDER,
527 MSG_SLAB,
528 MSG_DIFFERENT_COMPOUND,
529 MSG_POISONED_HUGE,
530 MSG_HUGE,
531 MSG_FREE_HUGE,
532 MSG_UNMAP_FAILED,
533 MSG_DIRTY_SWAPCACHE,
534 MSG_CLEAN_SWAPCACHE,
535 MSG_DIRTY_MLOCKED_LRU,
536 MSG_CLEAN_MLOCKED_LRU,
537 MSG_DIRTY_UNEVICTABLE_LRU,
538 MSG_CLEAN_UNEVICTABLE_LRU,
539 MSG_DIRTY_LRU,
540 MSG_CLEAN_LRU,
541 MSG_TRUNCATED_LRU,
542 MSG_BUDDY,
543 MSG_BUDDY_2ND,
544 MSG_UNKNOWN,
545};
546
547static const char * const action_page_types[] = {
548 [MSG_KERNEL] = "reserved kernel page",
549 [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
550 [MSG_SLAB] = "kernel slab page",
551 [MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
552 [MSG_POISONED_HUGE] = "huge page already hardware poisoned",
553 [MSG_HUGE] = "huge page",
554 [MSG_FREE_HUGE] = "free huge page",
555 [MSG_UNMAP_FAILED] = "unmapping failed page",
556 [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
557 [MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
558 [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
559 [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
560 [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
561 [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
562 [MSG_DIRTY_LRU] = "dirty LRU page",
563 [MSG_CLEAN_LRU] = "clean LRU page",
564 [MSG_TRUNCATED_LRU] = "already truncated LRU page",
565 [MSG_BUDDY] = "free buddy page",
566 [MSG_BUDDY_2ND] = "free buddy page (2nd try)",
567 [MSG_UNKNOWN] = "unknown page",
568};
569
524/* 570/*
525 * XXX: It is possible that a page is isolated from LRU cache, 571 * XXX: It is possible that a page is isolated from LRU cache,
526 * and then kept in swap cache or failed to remove from page cache. 572 * and then kept in swap cache or failed to remove from page cache.
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
777static struct page_state { 823static struct page_state {
778 unsigned long mask; 824 unsigned long mask;
779 unsigned long res; 825 unsigned long res;
780 char *msg; 826 enum action_page_type type;
781 int (*action)(struct page *p, unsigned long pfn); 827 int (*action)(struct page *p, unsigned long pfn);
782} error_states[] = { 828} error_states[] = {
783 { reserved, reserved, "reserved kernel", me_kernel }, 829 { reserved, reserved, MSG_KERNEL, me_kernel },
784 /* 830 /*
785 * free pages are specially detected outside this table: 831 * free pages are specially detected outside this table:
786 * PG_buddy pages only make a small fraction of all free pages. 832 * PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +837,31 @@ static struct page_state {
791 * currently unused objects without touching them. But just 837 * currently unused objects without touching them. But just
792 * treat it as standard kernel for now. 838 * treat it as standard kernel for now.
793 */ 839 */
794 { slab, slab, "kernel slab", me_kernel }, 840 { slab, slab, MSG_SLAB, me_kernel },
795 841
796#ifdef CONFIG_PAGEFLAGS_EXTENDED 842#ifdef CONFIG_PAGEFLAGS_EXTENDED
797 { head, head, "huge", me_huge_page }, 843 { head, head, MSG_HUGE, me_huge_page },
798 { tail, tail, "huge", me_huge_page }, 844 { tail, tail, MSG_HUGE, me_huge_page },
799#else 845#else
800 { compound, compound, "huge", me_huge_page }, 846 { compound, compound, MSG_HUGE, me_huge_page },
801#endif 847#endif
802 848
803 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 849 { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
804 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 850 { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
805 851
806 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 852 { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
807 { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, 853 { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
808 854
809 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, 855 { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
810 { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, 856 { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
811 857
812 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 858 { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty },
813 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 859 { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean },
814 860
815 /* 861 /*
816 * Catchall entry: must be at end. 862 * Catchall entry: must be at end.
817 */ 863 */
818 { 0, 0, "unknown page state", me_unknown }, 864 { 0, 0, MSG_UNKNOWN, me_unknown },
819}; 865};
820 866
821#undef dirty 867#undef dirty
@@ -835,10 +881,10 @@ static struct page_state {
835 * "Dirty/Clean" indication is not 100% accurate due to the possibility of 881 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
836 * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 882 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
837 */ 883 */
838static void action_result(unsigned long pfn, char *msg, int result) 884static void action_result(unsigned long pfn, enum action_page_type type, int result)
839{ 885{
840 pr_err("MCE %#lx: %s page recovery: %s\n", 886 pr_err("MCE %#lx: recovery action for %s: %s\n",
841 pfn, msg, action_name[result]); 887 pfn, action_page_types[type], action_name[result]);
842} 888}
843 889
844static int page_action(struct page_state *ps, struct page *p, 890static int page_action(struct page_state *ps, struct page *p,
@@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p,
854 count--; 900 count--;
855 if (count != 0) { 901 if (count != 0) {
856 printk(KERN_ERR 902 printk(KERN_ERR
857 "MCE %#lx: %s page still referenced by %d users\n", 903 "MCE %#lx: %s still referenced by %d users\n",
858 pfn, ps->msg, count); 904 pfn, action_page_types[ps->type], count);
859 result = FAILED; 905 result = FAILED;
860 } 906 }
861 action_result(pfn, ps->msg, result); 907 action_result(pfn, ps->type, result);
862 908
863 /* Could do more checks here if page looks ok */ 909 /* Could do more checks here if page looks ok */
864 /* 910 /*
@@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1106 if (!(flags & MF_COUNT_INCREASED) && 1152 if (!(flags & MF_COUNT_INCREASED) &&
1107 !get_page_unless_zero(hpage)) { 1153 !get_page_unless_zero(hpage)) {
1108 if (is_free_buddy_page(p)) { 1154 if (is_free_buddy_page(p)) {
1109 action_result(pfn, "free buddy", DELAYED); 1155 action_result(pfn, MSG_BUDDY, DELAYED);
1110 return 0; 1156 return 0;
1111 } else if (PageHuge(hpage)) { 1157 } else if (PageHuge(hpage)) {
1112 /* 1158 /*
@@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1123 } 1169 }
1124 set_page_hwpoison_huge_page(hpage); 1170 set_page_hwpoison_huge_page(hpage);
1125 res = dequeue_hwpoisoned_huge_page(hpage); 1171 res = dequeue_hwpoisoned_huge_page(hpage);
1126 action_result(pfn, "free huge", 1172 action_result(pfn, MSG_FREE_HUGE,
1127 res ? IGNORED : DELAYED); 1173 res ? IGNORED : DELAYED);
1128 unlock_page(hpage); 1174 unlock_page(hpage);
1129 return res; 1175 return res;
1130 } else { 1176 } else {
1131 action_result(pfn, "high order kernel", IGNORED); 1177 action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
1132 return -EBUSY; 1178 return -EBUSY;
1133 } 1179 }
1134 } 1180 }
@@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1150 */ 1196 */
1151 if (is_free_buddy_page(p)) { 1197 if (is_free_buddy_page(p)) {
1152 if (flags & MF_COUNT_INCREASED) 1198 if (flags & MF_COUNT_INCREASED)
1153 action_result(pfn, "free buddy", DELAYED); 1199 action_result(pfn, MSG_BUDDY, DELAYED);
1154 else 1200 else
1155 action_result(pfn, "free buddy, 2nd try", DELAYED); 1201 action_result(pfn, MSG_BUDDY_2ND,
1202 DELAYED);
1156 return 0; 1203 return 0;
1157 } 1204 }
1158 } 1205 }
@@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1165 * If this happens just bail out. 1212 * If this happens just bail out.
1166 */ 1213 */
1167 if (compound_head(p) != hpage) { 1214 if (compound_head(p) != hpage) {
1168 action_result(pfn, "different compound page after locking", IGNORED); 1215 action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
1169 res = -EBUSY; 1216 res = -EBUSY;
1170 goto out; 1217 goto out;
1171 } 1218 }
@@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1205 * on the head page to show that the hugepage is hwpoisoned 1252 * on the head page to show that the hugepage is hwpoisoned
1206 */ 1253 */
1207 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1254 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1208 action_result(pfn, "hugepage already hardware poisoned", 1255 action_result(pfn, MSG_POISONED_HUGE, IGNORED);
1209 IGNORED);
1210 unlock_page(hpage); 1256 unlock_page(hpage);
1211 put_page(hpage); 1257 put_page(hpage);
1212 return 0; 1258 return 0;
@@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1235 */ 1281 */
1236 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) 1282 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1237 != SWAP_SUCCESS) { 1283 != SWAP_SUCCESS) {
1238 action_result(pfn, "unmapping failed", IGNORED); 1284 action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
1239 res = -EBUSY; 1285 res = -EBUSY;
1240 goto out; 1286 goto out;
1241 } 1287 }
@@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1244 * Torn down by someone else? 1290 * Torn down by someone else?
1245 */ 1291 */
1246 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1292 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1247 action_result(pfn, "already truncated LRU", IGNORED); 1293 action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
1248 res = -EBUSY; 1294 res = -EBUSY;
1249 goto out; 1295 goto out;
1250 } 1296 }
@@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1540 } 1586 }
1541 unlock_page(hpage); 1587 unlock_page(hpage);
1542 1588
1543 /* Keep page count to indicate a given hugepage is isolated. */ 1589 ret = isolate_huge_page(hpage, &pagelist);
1544 list_move(&hpage->lru, &pagelist); 1590 if (ret) {
1591 /*
1592 * get_any_page() and isolate_huge_page() takes a refcount each,
1593 * so need to drop one here.
1594 */
1595 put_page(hpage);
1596 } else {
1597 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1598 return -EBUSY;
1599 }
1600
1545 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1601 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1546 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1602 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1547 if (ret) { 1603 if (ret) {
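
The memory-failure rework replaces free-form message strings with an enum that indexes a single string table, so one identifier can drive both the printk text and any later consumers. A small standalone sketch of that pattern, reproducing only a handful of the enum values:

#include <stdio.h>

/* Sketch of the enum-plus-lookup-table pattern the patch introduces;
 * only a few of the kernel's action_page_type values are shown. */
enum action_page_type {
	MSG_KERNEL,
	MSG_SLAB,
	MSG_BUDDY,
	MSG_UNKNOWN,
};

static const char * const action_page_types[] = {
	[MSG_KERNEL]	= "reserved kernel page",
	[MSG_SLAB]	= "kernel slab page",
	[MSG_BUDDY]	= "free buddy page",
	[MSG_UNKNOWN]	= "unknown page",
};

static void action_result(unsigned long pfn, enum action_page_type type,
			  const char *result)
{
	printf("MCE %#lx: recovery action for %s: %s\n",
	       pfn, action_page_types[type], result);
}

int main(void)
{
	action_result(0x1234, MSG_BUDDY, "Delayed");
	return 0;
}
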
diff --git a/mm/memory.c b/mm/memory.c
index ac20b2a6a0c3..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
690 /* 690 /*
691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
692 */ 692 */
693 if (vma->vm_ops) 693 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", 694 vma->vm_file,
695 vma->vm_ops->fault); 695 vma->vm_ops ? vma->vm_ops->fault : NULL,
696 if (vma->vm_file) 696 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", 697 mapping ? mapping->a_ops->readpage : NULL);
698 vma->vm_file->f_op->mmap);
699 dump_stack(); 698 dump_stack();
700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 699 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
701} 700}
@@ -2181,6 +2180,42 @@ oom:
2181 return VM_FAULT_OOM; 2180 return VM_FAULT_OOM;
2182} 2181}
2183 2182
2183/*
2184 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2185 * mapping
2186 */
2187static int wp_pfn_shared(struct mm_struct *mm,
2188 struct vm_area_struct *vma, unsigned long address,
2189 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2190 pmd_t *pmd)
2191{
2192 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2193 struct vm_fault vmf = {
2194 .page = NULL,
2195 .pgoff = linear_page_index(vma, address),
2196 .virtual_address = (void __user *)(address & PAGE_MASK),
2197 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2198 };
2199 int ret;
2200
2201 pte_unmap_unlock(page_table, ptl);
2202 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2203 if (ret & VM_FAULT_ERROR)
2204 return ret;
2205 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2206 /*
2207 * We might have raced with another page fault while we
2208 * released the pte_offset_map_lock.
2209 */
2210 if (!pte_same(*page_table, orig_pte)) {
2211 pte_unmap_unlock(page_table, ptl);
2212 return 0;
2213 }
2214 }
2215 return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
2216 NULL, 0, 0);
2217}
2218
2184static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, 2219static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2185 unsigned long address, pte_t *page_table, 2220 unsigned long address, pte_t *page_table,
2186 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, 2221 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
@@ -2259,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2259 * VM_PFNMAP VMA. 2294 * VM_PFNMAP VMA.
2260 * 2295 *
2261 * We should not cow pages in a shared writeable mapping. 2296 * We should not cow pages in a shared writeable mapping.
2262 * Just mark the pages writable as we can't do any dirty 2297 * Just mark the pages writable and/or call ops->pfn_mkwrite.
2263 * accounting on raw pfn maps.
2264 */ 2298 */
2265 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2299 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2266 (VM_WRITE|VM_SHARED)) 2300 (VM_WRITE|VM_SHARED))
2267 return wp_page_reuse(mm, vma, address, page_table, ptl, 2301 return wp_pfn_shared(mm, vma, address, page_table, ptl,
2268 orig_pte, old_page, 0, 0); 2302 orig_pte, pmd);
2269 2303
2270 pte_unmap_unlock(page_table, ptl); 2304 pte_unmap_unlock(page_table, ptl);
2271 return wp_page_copy(mm, vma, address, page_table, pmd, 2305 return wp_page_copy(mm, vma, address, page_table, pmd,
@@ -2845,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2845 struct vm_fault vmf; 2879 struct vm_fault vmf;
2846 int off; 2880 int off;
2847 2881
2848 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; 2882 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2849 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 2883 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2850 2884
2851 start_addr = max(address & mask, vma->vm_start); 2885 start_addr = max(address & mask, vma->vm_start);
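
The new wp_pfn_shared() helper cannot hold the page-table lock across ->pfn_mkwrite(), so it drops the lock, calls the hook, re-takes the lock and re-checks that the PTE is unchanged. A userspace analogue of that drop/re-take/re-validate dance, with a mutex standing in for the PTE lock and all helper names invented:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static unsigned long pte;			/* stands in for *page_table */

static int call_mkwrite_hook(void) { return 0; }	/* may sleep for real */

static bool handle_write_fault(void)
{
	unsigned long orig_pte;

	pthread_mutex_lock(&ptl);
	orig_pte = pte;				/* snapshot while locked */
	pthread_mutex_unlock(&ptl);		/* hook may sleep: drop the lock */

	if (call_mkwrite_hook())
		return false;

	pthread_mutex_lock(&ptl);
	if (pte != orig_pte) {			/* raced with another fault: bail */
		pthread_mutex_unlock(&ptl);
		return false;
	}
	/* ... mark the entry writable while still holding the lock ... */
	pthread_mutex_unlock(&ptl);
	return true;
}

int main(void)
{
	printf("%d\n", handle_write_fault());
	return 0;
}
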
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e2e8014fb755..457bde530cbe 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1373,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1373 if (PageLRU(page)) 1373 if (PageLRU(page))
1374 return pfn; 1374 return pfn;
1375 if (PageHuge(page)) { 1375 if (PageHuge(page)) {
1376 if (is_hugepage_active(page)) 1376 if (page_huge_active(page))
1377 return pfn; 1377 return pfn;
1378 else 1378 else
1379 pfn = round_up(pfn + 1, 1379 pfn = round_up(pfn + 1,
diff --git a/mm/mempool.c b/mm/mempool.c
index 949970db2874..2cc08de8b1db 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,26 +6,138 @@
6 * extreme VM load. 6 * extreme VM load.
7 * 7 *
8 * started by Ingo Molnar, Copyright (C) 2001 8 * started by Ingo Molnar, Copyright (C) 2001
9 * debugging by David Rientjes, Copyright (C) 2015
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/highmem.h>
15#include <linux/kasan.h>
13#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
14#include <linux/export.h> 17#include <linux/export.h>
15#include <linux/mempool.h> 18#include <linux/mempool.h>
16#include <linux/blkdev.h> 19#include <linux/blkdev.h>
17#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include "slab.h"
22
23#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
24static void poison_error(mempool_t *pool, void *element, size_t size,
25 size_t byte)
26{
27 const int nr = pool->curr_nr;
28 const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
29 const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
30 int i;
31
32 pr_err("BUG: mempool element poison mismatch\n");
33 pr_err("Mempool %p size %zu\n", pool, size);
34 pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
35 for (i = start; i < end; i++)
36 pr_cont("%x ", *(u8 *)(element + i));
37 pr_cont("%s\n", end < size ? "..." : "");
38 dump_stack();
39}
40
41static void __check_element(mempool_t *pool, void *element, size_t size)
42{
43 u8 *obj = element;
44 size_t i;
45
46 for (i = 0; i < size; i++) {
47 u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
48
49 if (obj[i] != exp) {
50 poison_error(pool, element, size, i);
51 return;
52 }
53 }
54 memset(obj, POISON_INUSE, size);
55}
56
57static void check_element(mempool_t *pool, void *element)
58{
59 /* Mempools backed by slab allocator */
60 if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
61 __check_element(pool, element, ksize(element));
62
63 /* Mempools backed by page allocator */
64 if (pool->free == mempool_free_pages) {
65 int order = (int)(long)pool->pool_data;
66 void *addr = kmap_atomic((struct page *)element);
67
68 __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
69 kunmap_atomic(addr);
70 }
71}
72
73static void __poison_element(void *element, size_t size)
74{
75 u8 *obj = element;
76
77 memset(obj, POISON_FREE, size - 1);
78 obj[size - 1] = POISON_END;
79}
80
81static void poison_element(mempool_t *pool, void *element)
82{
83 /* Mempools backed by slab allocator */
84 if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
85 __poison_element(element, ksize(element));
86
87 /* Mempools backed by page allocator */
88 if (pool->alloc == mempool_alloc_pages) {
89 int order = (int)(long)pool->pool_data;
90 void *addr = kmap_atomic((struct page *)element);
91
92 __poison_element(addr, 1UL << (PAGE_SHIFT + order));
93 kunmap_atomic(addr);
94 }
95}
96#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
97static inline void check_element(mempool_t *pool, void *element)
98{
99}
100static inline void poison_element(mempool_t *pool, void *element)
101{
102}
103#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
104
105static void kasan_poison_element(mempool_t *pool, void *element)
106{
107 if (pool->alloc == mempool_alloc_slab)
108 kasan_slab_free(pool->pool_data, element);
109 if (pool->alloc == mempool_kmalloc)
110 kasan_kfree(element);
111 if (pool->alloc == mempool_alloc_pages)
112 kasan_free_pages(element, (unsigned long)pool->pool_data);
113}
114
115static void kasan_unpoison_element(mempool_t *pool, void *element)
116{
117 if (pool->alloc == mempool_alloc_slab)
118 kasan_slab_alloc(pool->pool_data, element);
119 if (pool->alloc == mempool_kmalloc)
120 kasan_krealloc(element, (size_t)pool->pool_data);
121 if (pool->alloc == mempool_alloc_pages)
122 kasan_alloc_pages(element, (unsigned long)pool->pool_data);
123}
18 124
19static void add_element(mempool_t *pool, void *element) 125static void add_element(mempool_t *pool, void *element)
20{ 126{
21 BUG_ON(pool->curr_nr >= pool->min_nr); 127 BUG_ON(pool->curr_nr >= pool->min_nr);
128 poison_element(pool, element);
129 kasan_poison_element(pool, element);
22 pool->elements[pool->curr_nr++] = element; 130 pool->elements[pool->curr_nr++] = element;
23} 131}
24 132
25static void *remove_element(mempool_t *pool) 133static void *remove_element(mempool_t *pool)
26{ 134{
27 BUG_ON(pool->curr_nr <= 0); 135 void *element = pool->elements[--pool->curr_nr];
28 return pool->elements[--pool->curr_nr]; 136
137 BUG_ON(pool->curr_nr < 0);
138 check_element(pool, element);
139 kasan_unpoison_element(pool, element);
140 return element;
29} 141}
30 142
31/** 143/**
@@ -334,6 +446,7 @@ EXPORT_SYMBOL(mempool_free);
334void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 446void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
335{ 447{
336 struct kmem_cache *mem = pool_data; 448 struct kmem_cache *mem = pool_data;
449 VM_BUG_ON(mem->ctor);
337 return kmem_cache_alloc(mem, gfp_mask); 450 return kmem_cache_alloc(mem, gfp_mask);
338} 451}
339EXPORT_SYMBOL(mempool_alloc_slab); 452EXPORT_SYMBOL(mempool_alloc_slab);
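
The mempool debugging support poisons elements while they sit in the reserve pool and verifies the poison when they are handed back out, so stray writes to parked elements are caught. A compact userspace sketch of the poison/check cycle; the byte values mirror the kernel's poison.h constants:

#include <stdio.h>
#include <string.h>

#define POISON_INUSE	0x5a
#define POISON_FREE	0x6b
#define POISON_END	0xa5

static void poison_element(unsigned char *obj, size_t size)
{
	memset(obj, POISON_FREE, size - 1);
	obj[size - 1] = POISON_END;
}

static int check_element(unsigned char *obj, size_t size)
{
	for (size_t i = 0; i < size; i++) {
		unsigned char exp = (i < size - 1) ? POISON_FREE : POISON_END;

		if (obj[i] != exp) {
			fprintf(stderr, "poison mismatch at byte %zu\n", i);
			return -1;
		}
	}
	memset(obj, POISON_INUSE, size);	/* element now handed out */
	return 0;
}

int main(void)
{
	unsigned char element[32];

	poison_element(element, sizeof(element));	/* parked in the pool */
	return check_element(element, sizeof(element));	/* taken back out */
}
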
diff --git a/mm/migrate.c b/mm/migrate.c
index a65ff72ab739..f53838fe3dfe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
537 * Please do not reorder this without considering how mm/ksm.c's 537 * Please do not reorder this without considering how mm/ksm.c's
538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
539 */ 539 */
540 ClearPageSwapCache(page); 540 if (PageSwapCache(page))
541 ClearPageSwapCache(page);
541 ClearPagePrivate(page); 542 ClearPagePrivate(page);
542 set_page_private(page, 0); 543 set_page_private(page, 0);
543 544
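
The migrate.c hunk (like the set_page_dirty() one further down) now tests a page flag before clearing it, so the atomic read-modify-write is skipped when the bit is already clear. A tiny sketch of that test-before-clear idea, with an invented flag bit and helpers:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PG_SWAPCACHE 5			/* made-up bit number for the example */

static _Atomic unsigned long page_flags;

static bool test_bit(int nr)  { return page_flags & (1UL << nr); }
static void clear_bit(int nr) { atomic_fetch_and(&page_flags, ~(1UL << nr)); }

int main(void)
{
	if (test_bit(PG_SWAPCACHE))		/* cheap read first */
		clear_bit(PG_SWAPCACHE);	/* costly RMW only when needed */
	printf("flags=%#lx\n", (unsigned long)page_flags);
	return 0;
}
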
diff --git a/mm/mmap.c b/mm/mmap.c
index 06a6076c92e5..bb50cacc3ea5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1133 * by another page fault trying to merge _that_. But that's ok: if it 1133 * by another page fault trying to merge _that_. But that's ok: if it
1134 * is being set up, that automatically means that it will be a singleton 1134 * is being set up, that automatically means that it will be a singleton
1135 * acceptable for merging, so we can do all of this optimistically. But 1135 * acceptable for merging, so we can do all of this optimistically. But
1136 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1136 * we do that READ_ONCE() to make sure that we never re-load the pointer.
1137 * 1137 *
1138 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1138 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1139 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1139 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
@@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1147static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1147static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1148{ 1148{
1149 if (anon_vma_compatible(a, b)) { 1149 if (anon_vma_compatible(a, b)) {
1150 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1150 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1151 1151
1152 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1152 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1153 return anon_vma; 1153 return anon_vma;
@@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1551 1551
1552 /* Clear old maps */ 1552 /* Clear old maps */
1553 error = -ENOMEM; 1553 error = -ENOMEM;
1554munmap_back: 1554 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1555 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1555 &rb_parent)) {
1556 if (do_munmap(mm, addr, len)) 1556 if (do_munmap(mm, addr, len))
1557 return -ENOMEM; 1557 return -ENOMEM;
1558 goto munmap_back;
1559 } 1558 }
1560 1559
1561 /* 1560 /*
@@ -1571,7 +1570,8 @@ munmap_back:
1571 /* 1570 /*
1572 * Can we just expand an old mapping? 1571 * Can we just expand an old mapping?
1573 */ 1572 */
1574 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1573 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
1574 NULL);
1575 if (vma) 1575 if (vma)
1576 goto out; 1576 goto out;
1577 1577
@@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2100 actual_size = size; 2100 actual_size = size;
2101 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) 2101 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
2102 actual_size -= PAGE_SIZE; 2102 actual_size -= PAGE_SIZE;
2103 if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2103 if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2104 return -ENOMEM; 2104 return -ENOMEM;
2105 2105
2106 /* mlock limit tests */ 2106 /* mlock limit tests */
@@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2108 unsigned long locked; 2108 unsigned long locked;
2109 unsigned long limit; 2109 unsigned long limit;
2110 locked = mm->locked_vm + grow; 2110 locked = mm->locked_vm + grow;
2111 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2111 limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2112 limit >>= PAGE_SHIFT; 2112 limit >>= PAGE_SHIFT;
2113 if (locked > limit && !capable(CAP_IPC_LOCK)) 2113 if (locked > limit && !capable(CAP_IPC_LOCK))
2114 return -ENOMEM; 2114 return -ENOMEM;
@@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2739 /* 2739 /*
2740 * Clear old maps. this also does some error checking for us 2740 * Clear old maps. this also does some error checking for us
2741 */ 2741 */
2742 munmap_back: 2742 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
2743 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2743 &rb_parent)) {
2744 if (do_munmap(mm, addr, len)) 2744 if (do_munmap(mm, addr, len))
2745 return -ENOMEM; 2745 return -ENOMEM;
2746 goto munmap_back;
2747 } 2746 }
2748 2747
2749 /* Check against address space limits *after* clearing old maps... */ 2748 /* Check against address space limits *after* clearing old maps... */
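
Both mmap_region() and do_brk() drop the munmap_back: label in favour of a plain while loop. The shape of that conversion, using invented stand-ins for find_vma_links() and do_munmap():

#include <stdio.h>

static int overlaps_left = 2;

static int find_overlap(void)  { return overlaps_left > 0; }
static int unmap_overlap(void) { overlaps_left--; return 0; }

int main(void)
{
	while (find_overlap()) {	/* was: munmap_back: if (...) ... goto munmap_back */
		if (unmap_overlap())
			return -1;
	}
	printf("no overlapping mappings left\n");
	return 0;
}
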
diff --git a/mm/mremap.c b/mm/mremap.c
index 2dc44b1cb1df..034e2d360652 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -345,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
345 struct vm_area_struct *vma = find_vma(mm, addr); 345 struct vm_area_struct *vma = find_vma(mm, addr);
346 346
347 if (!vma || vma->vm_start > addr) 347 if (!vma || vma->vm_start > addr)
348 goto Efault; 348 return ERR_PTR(-EFAULT);
349 349
350 if (is_vm_hugetlb_page(vma)) 350 if (is_vm_hugetlb_page(vma))
351 goto Einval; 351 return ERR_PTR(-EINVAL);
352 352
353 /* We can't remap across vm area boundaries */ 353 /* We can't remap across vm area boundaries */
354 if (old_len > vma->vm_end - addr) 354 if (old_len > vma->vm_end - addr)
355 goto Efault; 355 return ERR_PTR(-EFAULT);
356 356
357 /* Need to be careful about a growing mapping */ 357 /* Need to be careful about a growing mapping */
358 if (new_len > old_len) { 358 if (new_len > old_len) {
359 unsigned long pgoff; 359 unsigned long pgoff;
360 360
361 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) 361 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
362 goto Efault; 362 return ERR_PTR(-EFAULT);
363 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; 363 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
364 pgoff += vma->vm_pgoff; 364 pgoff += vma->vm_pgoff;
365 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) 365 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
366 goto Einval; 366 return ERR_PTR(-EINVAL);
367 } 367 }
368 368
369 if (vma->vm_flags & VM_LOCKED) { 369 if (vma->vm_flags & VM_LOCKED) {
@@ -372,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
372 lock_limit = rlimit(RLIMIT_MEMLOCK); 372 lock_limit = rlimit(RLIMIT_MEMLOCK);
373 locked += new_len - old_len; 373 locked += new_len - old_len;
374 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 374 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
375 goto Eagain; 375 return ERR_PTR(-EAGAIN);
376 } 376 }
377 377
378 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) 378 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
379 goto Enomem; 379 return ERR_PTR(-ENOMEM);
380 380
381 if (vma->vm_flags & VM_ACCOUNT) { 381 if (vma->vm_flags & VM_ACCOUNT) {
382 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; 382 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
383 if (security_vm_enough_memory_mm(mm, charged)) 383 if (security_vm_enough_memory_mm(mm, charged))
384 goto Efault; 384 return ERR_PTR(-ENOMEM);
385 *p = charged; 385 *p = charged;
386 } 386 }
387 387
388 return vma; 388 return vma;
389
390Efault: /* very odd choice for most of the cases, but... */
391 return ERR_PTR(-EFAULT);
392Einval:
393 return ERR_PTR(-EINVAL);
394Enomem:
395 return ERR_PTR(-ENOMEM);
396Eagain:
397 return ERR_PTR(-EAGAIN);
398} 389}
399 390
400static unsigned long mremap_to(unsigned long addr, unsigned long old_len, 391static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
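
vma_to_resize() now returns ERR_PTR(-errno) directly at each failure site instead of jumping to per-error labels. For readers unfamiliar with the convention, a userspace sketch of how ERR_PTR()/IS_ERR() encode an errno in a pointer value; the real helpers live in include/linux/err.h:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long error)      { return (void *)error; }
static inline long  ptr_err(const void *ptr) { return (long)ptr; }
static inline int   is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *vma_to_resize(int valid)
{
	if (!valid)
		return err_ptr(-EFAULT);	/* fail directly, no goto label */
	return (void *)0x1000;			/* stand-in for a real vma */
}

int main(void)
{
	void *vma = vma_to_resize(0);

	if (is_err(vma))
		printf("lookup failed: %ld\n", ptr_err(vma));
	return 0;
}
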
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 52628c819bf7..2b665da1b3c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem); 408static DECLARE_RWSEM(oom_sem);
409 409
410/** 410/**
411 * mark_tsk_oom_victim - marks the given taks as OOM victim. 411 * mark_tsk_oom_victim - marks the given task as OOM victim.
412 * @tsk: task to mark 412 * @tsk: task to mark
413 * 413 *
414 * Has to be called with oom_sem taken for read and never after 414 * Has to be called with oom_sem taken for read and never after
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0372411f38fc..5daf5568b9e1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2228,7 +2228,8 @@ int set_page_dirty(struct page *page)
2228 * it will confuse readahead and make it restart the size rampup 2228 * it will confuse readahead and make it restart the size rampup
2229 * process. But it's a trivial problem. 2229 * process. But it's a trivial problem.
2230 */ 2230 */
2231 ClearPageReclaim(page); 2231 if (PageReclaim(page))
2232 ClearPageReclaim(page);
2232#ifdef CONFIG_BLOCK 2233#ifdef CONFIG_BLOCK
2233 if (!spd) 2234 if (!spd)
2234 spd = __set_page_dirty_buffers; 2235 spd = __set_page_dirty_buffers;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1b849500640c..ebffa0e4a9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1371,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1371 int to_drain, batch; 1371 int to_drain, batch;
1372 1372
1373 local_irq_save(flags); 1373 local_irq_save(flags);
1374 batch = ACCESS_ONCE(pcp->batch); 1374 batch = READ_ONCE(pcp->batch);
1375 to_drain = min(pcp->count, batch); 1375 to_drain = min(pcp->count, batch);
1376 if (to_drain > 0) { 1376 if (to_drain > 0) {
1377 free_pcppages_bulk(zone, to_drain, pcp); 1377 free_pcppages_bulk(zone, to_drain, pcp);
@@ -1570,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold)
1570 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1570 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1571 pcp->count++; 1571 pcp->count++;
1572 if (pcp->count >= pcp->high) { 1572 if (pcp->count >= pcp->high) {
1573 unsigned long batch = ACCESS_ONCE(pcp->batch); 1573 unsigned long batch = READ_ONCE(pcp->batch);
1574 free_pcppages_bulk(zone, batch, pcp); 1574 free_pcppages_bulk(zone, batch, pcp);
1575 pcp->count -= batch; 1575 pcp->count -= batch;
1576 } 1576 }
@@ -6207,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6207 mask <<= (BITS_PER_LONG - bitidx - 1); 6207 mask <<= (BITS_PER_LONG - bitidx - 1);
6208 flags <<= (BITS_PER_LONG - bitidx - 1); 6208 flags <<= (BITS_PER_LONG - bitidx - 1);
6209 6209
6210 word = ACCESS_ONCE(bitmap[word_bitidx]); 6210 word = READ_ONCE(bitmap[word_bitidx]);
6211 for (;;) { 6211 for (;;) {
6212 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6212 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6213 if (word == old_word) 6213 if (word == old_word)
diff --git a/mm/rmap.c b/mm/rmap.c
index c161a14b6a8f..24dd3f9fee27 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -456,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
456 unsigned long anon_mapping; 456 unsigned long anon_mapping;
457 457
458 rcu_read_lock(); 458 rcu_read_lock();
459 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 459 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
461 goto out; 461 goto out;
462 if (!page_mapped(page)) 462 if (!page_mapped(page))
@@ -500,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
500 unsigned long anon_mapping; 500 unsigned long anon_mapping;
501 501
502 rcu_read_lock(); 502 rcu_read_lock();
503 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 503 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
504 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 504 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
505 goto out; 505 goto out;
506 if (!page_mapped(page)) 506 if (!page_mapped(page))
507 goto out; 507 goto out;
508 508
509 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 509 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
510 root_anon_vma = ACCESS_ONCE(anon_vma->root); 510 root_anon_vma = READ_ONCE(anon_vma->root);
511 if (down_read_trylock(&root_anon_vma->rwsem)) { 511 if (down_read_trylock(&root_anon_vma->rwsem)) {
512 /* 512 /*
513 * If the page is still mapped, then this anon_vma is still 513 * If the page is still mapped, then this anon_vma is still
diff --git a/mm/slub.c b/mm/slub.c
index 0fdd6c1e1f82..54c0876b43d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4277 int node; 4277 int node;
4278 struct page *page; 4278 struct page *page;
4279 4279
4280 page = ACCESS_ONCE(c->page); 4280 page = READ_ONCE(c->page);
4281 if (!page) 4281 if (!page)
4282 continue; 4282 continue;
4283 4283
@@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4292 total += x; 4292 total += x;
4293 nodes[node] += x; 4293 nodes[node] += x;
4294 4294
4295 page = ACCESS_ONCE(c->partial); 4295 page = READ_ONCE(c->partial);
4296 if (page) { 4296 if (page) {
4297 node = page_to_nid(page); 4297 node = page_to_nid(page);
4298 if (flags & SO_TOTAL) 4298 if (flags & SO_TOTAL)
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..a7251a8ed532 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
34 35
35#include "internal.h" 36#include "internal.h"
36 37
@@ -42,7 +43,7 @@ int page_cluster;
42 43
43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 44static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 46static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
46 47
47/* 48/*
48 * This path almost never happens for VM activity - pages are normally 49 * This path almost never happens for VM activity - pages are normally
@@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page)
75{ 76{
76 compound_page_dtor *dtor; 77 compound_page_dtor *dtor;
77 78
78 __page_cache_release(page); 79 /*
80 * __page_cache_release() is supposed to be called for thp, not for
81 * hugetlb. This is because hugetlb page does never have PageLRU set
82 * (it's never listed to any LRU lists) and no memcg routines should
83 * be called for hugetlb (it has a separate hugetlb_cgroup.)
84 */
85 if (!PageHuge(page))
86 __page_cache_release(page);
79 dtor = get_compound_page_dtor(page); 87 dtor = get_compound_page_dtor(page);
80 (*dtor)(page); 88 (*dtor)(page);
81} 89}
@@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
743 * be write it out by flusher threads as this is much more effective 751 * be write it out by flusher threads as this is much more effective
744 * than the single-page writeout from reclaim. 752 * than the single-page writeout from reclaim.
745 */ 753 */
746static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, 754static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
747 void *arg) 755 void *arg)
748{ 756{
749 int lru, file; 757 int lru, file;
@@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu)
811 local_irq_restore(flags); 819 local_irq_restore(flags);
812 } 820 }
813 821
814 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 822 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
815 if (pagevec_count(pvec)) 823 if (pagevec_count(pvec))
816 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 824 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
817 825
818 activate_page_drain(cpu); 826 activate_page_drain(cpu);
819} 827}
820 828
821/** 829/**
822 * deactivate_page - forcefully deactivate a page 830 * deactivate_file_page - forcefully deactivate a file page
823 * @page: page to deactivate 831 * @page: page to deactivate
824 * 832 *
825 * This function hints the VM that @page is a good reclaim candidate, 833 * This function hints the VM that @page is a good reclaim candidate,
826 * for example if its invalidation fails due to the page being dirty 834 * for example if its invalidation fails due to the page being dirty
827 * or under writeback. 835 * or under writeback.
828 */ 836 */
829void deactivate_page(struct page *page) 837void deactivate_file_page(struct page *page)
830{ 838{
831 /* 839 /*
832 * In a workload with many unevictable page such as mprotect, unevictable 840 * In a workload with many unevictable page such as mprotect,
833 * page deactivation for accelerating reclaim is pointless. 841 * unevictable page deactivation for accelerating reclaim is pointless.
834 */ 842 */
835 if (PageUnevictable(page)) 843 if (PageUnevictable(page))
836 return; 844 return;
837 845
838 if (likely(get_page_unless_zero(page))) { 846 if (likely(get_page_unless_zero(page))) {
839 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 847 struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
840 848
841 if (!pagevec_add(pvec, page)) 849 if (!pagevec_add(pvec, page))
842 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 850 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
843 put_cpu_var(lru_deactivate_pvecs); 851 put_cpu_var(lru_deactivate_file_pvecs);
844 } 852 }
845} 853}
846 854
@@ -872,7 +880,7 @@ void lru_add_drain_all(void)
872 880
873 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 881 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
874 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 882 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
875 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 883 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
876 need_activate_page_drain(cpu)) { 884 need_activate_page_drain(cpu)) {
877 INIT_WORK(work, lru_add_drain_per_cpu); 885 INIT_WORK(work, lru_add_drain_per_cpu);
878 schedule_work_on(cpu, work); 886 schedule_work_on(cpu, work);
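
deactivate_file_page() keeps batching pages through a per-CPU pagevec and only moves them on the LRU once the vector is full. A userspace sketch of that batching pattern, with an arbitrary batch size and an invented flush routine:

#include <stdio.h>

#define PAGEVEC_SIZE 14			/* small fixed batch; exact size unimportant here */

struct pagevec {
	unsigned int nr;
	void *pages[PAGEVEC_SIZE];
};

/* returns remaining space; 0 means "now full, flush it" */
static int pagevec_add(struct pagevec *pvec, void *page)
{
	pvec->pages[pvec->nr++] = page;
	return PAGEVEC_SIZE - pvec->nr;
}

static void flush(struct pagevec *pvec)
{
	printf("moving %u pages in one pass\n", pvec->nr);
	pvec->nr = 0;
}

int main(void)
{
	static struct pagevec pvec;	/* one per CPU in the kernel */
	int dummy[100];

	for (int i = 0; i < 100; i++)
		if (!pagevec_add(&pvec, &dummy[i]))
			flush(&pvec);
	if (pvec.nr)
		flush(&pvec);
	return 0;
}
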
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 405923f77334..8bc8e66138da 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
390 unsigned int pages, max_pages, last_ra; 390 unsigned int pages, max_pages, last_ra;
391 static atomic_t last_readahead_pages; 391 static atomic_t last_readahead_pages;
392 392
393 max_pages = 1 << ACCESS_ONCE(page_cluster); 393 max_pages = 1 << READ_ONCE(page_cluster);
394 if (max_pages <= 1) 394 if (max_pages <= 1)
395 return 1; 395 return 1;
396 396
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63f55ccb9b26..a7e72103f23b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1312 else 1312 else
1313 continue; 1313 continue;
1314 } 1314 }
1315 count = ACCESS_ONCE(si->swap_map[i]); 1315 count = READ_ONCE(si->swap_map[i]);
1316 if (count && swap_count(count) != SWAP_MAP_BAD) 1316 if (count && swap_count(count) != SWAP_MAP_BAD)
1317 break; 1317 break;
1318 } 1318 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 7a9d8a3cb143..66af9031fae8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -490,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
490 * of interest and try to speed up its reclaim. 490 * of interest and try to speed up its reclaim.
491 */ 491 */
492 if (!ret) 492 if (!ret)
493 deactivate_page(page); 493 deactivate_file_page(page);
494 count += ret; 494 count += ret;
495 } 495 }
496 pagevec_remove_exceptionals(&pvec); 496 pagevec_remove_exceptionals(&pvec);
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..68ff8a5361e7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -325,9 +325,37 @@ void kvfree(const void *addr)
325} 325}
326EXPORT_SYMBOL(kvfree); 326EXPORT_SYMBOL(kvfree);
327 327
328static inline void *__page_rmapping(struct page *page)
329{
330 unsigned long mapping;
331
332 mapping = (unsigned long)page->mapping;
333 mapping &= ~PAGE_MAPPING_FLAGS;
334
335 return (void *)mapping;
336}
337
338/* Neutral page->mapping pointer to address_space or anon_vma or other */
339void *page_rmapping(struct page *page)
340{
341 page = compound_head(page);
342 return __page_rmapping(page);
343}
344
345struct anon_vma *page_anon_vma(struct page *page)
346{
347 unsigned long mapping;
348
349 page = compound_head(page);
350 mapping = (unsigned long)page->mapping;
351 if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
352 return NULL;
353 return __page_rmapping(page);
354}
355
328struct address_space *page_mapping(struct page *page) 356struct address_space *page_mapping(struct page *page)
329{ 357{
330 struct address_space *mapping = page->mapping; 358 unsigned long mapping;
331 359
332 /* This happens if someone calls flush_dcache_page on slab page */ 360 /* This happens if someone calls flush_dcache_page on slab page */
333 if (unlikely(PageSlab(page))) 361 if (unlikely(PageSlab(page)))
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page)
337 swp_entry_t entry; 365 swp_entry_t entry;
338 366
339 entry.val = page_private(page); 367 entry.val = page_private(page);
340 mapping = swap_address_space(entry); 368 return swap_address_space(entry);
341 } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) 369 }
342 mapping = NULL; 370
343 return mapping; 371 mapping = (unsigned long)page->mapping;
372 if (mapping & PAGE_MAPPING_FLAGS)
373 return NULL;
374 return page->mapping;
344} 375}
345 376
346int overcommit_ratio_handler(struct ctl_table *table, int write, 377int overcommit_ratio_handler(struct ctl_table *table, int write,
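
The new page_rmapping()/page_anon_vma() helpers decode page->mapping as a tagged pointer: the low bits carry flags and must be masked off before the value is used as a pointer. A sketch of that layout; the flag values here mirror PAGE_MAPPING_ANON and PAGE_MAPPING_KSM:

#include <stdio.h>

#define PAGE_MAPPING_ANON	1UL
#define PAGE_MAPPING_KSM	2UL
#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)

static void *page_rmapping(void *mapping_field)
{
	unsigned long mapping = (unsigned long)mapping_field;

	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);	/* strip the tag bits */
}

int main(void)
{
	static long anon_vma;		/* aligned stand-in for a struct anon_vma */
	void *tagged = (void *)((unsigned long)&anon_vma | PAGE_MAPPING_ANON);

	printf("decoded ok: %d\n", page_rmapping(tagged) == &anon_vma);
	return 0;
}
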
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a5bbdd3b5d67..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -765,7 +765,7 @@ struct vmap_block {
765 spinlock_t lock; 765 spinlock_t lock;
766 struct vmap_area *va; 766 struct vmap_area *va;
767 unsigned long free, dirty; 767 unsigned long free, dirty;
768 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 768 unsigned long dirty_min, dirty_max; /*< dirty range */
769 struct list_head free_list; 769 struct list_head free_list;
770 struct rcu_head rcu_head; 770 struct rcu_head rcu_head;
771 struct list_head purge; 771 struct list_head purge;
@@ -796,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
796 return addr; 796 return addr;
797} 797}
798 798
799static struct vmap_block *new_vmap_block(gfp_t gfp_mask) 799static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
800{
801 unsigned long addr;
802
803 addr = va_start + (pages_off << PAGE_SHIFT);
804 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
805 return (void *)addr;
806}
807
808/**
809 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
810 * block. Of course pages number can't exceed VMAP_BBMAP_BITS
811 * @order: how many 2^order pages should be occupied in newly allocated block
812 * @gfp_mask: flags for the page level allocator
813 *
814 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
815 */
816static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
800{ 817{
801 struct vmap_block_queue *vbq; 818 struct vmap_block_queue *vbq;
802 struct vmap_block *vb; 819 struct vmap_block *vb;
803 struct vmap_area *va; 820 struct vmap_area *va;
804 unsigned long vb_idx; 821 unsigned long vb_idx;
805 int node, err; 822 int node, err;
823 void *vaddr;
806 824
807 node = numa_node_id(); 825 node = numa_node_id();
808 826
@@ -826,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
826 return ERR_PTR(err); 844 return ERR_PTR(err);
827 } 845 }
828 846
847 vaddr = vmap_block_vaddr(va->va_start, 0);
829 spin_lock_init(&vb->lock); 848 spin_lock_init(&vb->lock);
830 vb->va = va; 849 vb->va = va;
831 vb->free = VMAP_BBMAP_BITS; 850 /* At least something should be left free */
851 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
852 vb->free = VMAP_BBMAP_BITS - (1UL << order);
832 vb->dirty = 0; 853 vb->dirty = 0;
833 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 854 vb->dirty_min = VMAP_BBMAP_BITS;
855 vb->dirty_max = 0;
834 INIT_LIST_HEAD(&vb->free_list); 856 INIT_LIST_HEAD(&vb->free_list);
835 857
836 vb_idx = addr_to_vb_idx(va->va_start); 858 vb_idx = addr_to_vb_idx(va->va_start);
@@ -842,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
842 864
843 vbq = &get_cpu_var(vmap_block_queue); 865 vbq = &get_cpu_var(vmap_block_queue);
844 spin_lock(&vbq->lock); 866 spin_lock(&vbq->lock);
845 list_add_rcu(&vb->free_list, &vbq->free); 867 list_add_tail_rcu(&vb->free_list, &vbq->free);
846 spin_unlock(&vbq->lock); 868 spin_unlock(&vbq->lock);
847 put_cpu_var(vmap_block_queue); 869 put_cpu_var(vmap_block_queue);
848 870
849 return vb; 871 return vaddr;
850} 872}
851 873
852static void free_vmap_block(struct vmap_block *vb) 874static void free_vmap_block(struct vmap_block *vb)
@@ -881,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
881 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 903 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
882 vb->free = 0; /* prevent further allocs after releasing lock */ 904 vb->free = 0; /* prevent further allocs after releasing lock */
883 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 905 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
884 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 906 vb->dirty_min = 0;
907 vb->dirty_max = VMAP_BBMAP_BITS;
885 spin_lock(&vbq->lock); 908 spin_lock(&vbq->lock);
886 list_del_rcu(&vb->free_list); 909 list_del_rcu(&vb->free_list);
887 spin_unlock(&vbq->lock); 910 spin_unlock(&vbq->lock);
@@ -910,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
910{ 933{
911 struct vmap_block_queue *vbq; 934 struct vmap_block_queue *vbq;
912 struct vmap_block *vb; 935 struct vmap_block *vb;
913 unsigned long addr = 0; 936 void *vaddr = NULL;
914 unsigned int order; 937 unsigned int order;
915 938
916 BUG_ON(size & ~PAGE_MASK); 939 BUG_ON(size & ~PAGE_MASK);
@@ -925,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
925 } 948 }
926 order = get_order(size); 949 order = get_order(size);
927 950
928again:
929 rcu_read_lock(); 951 rcu_read_lock();
930 vbq = &get_cpu_var(vmap_block_queue); 952 vbq = &get_cpu_var(vmap_block_queue);
931 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 953 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
932 int i; 954 unsigned long pages_off;
933 955
934 spin_lock(&vb->lock); 956 spin_lock(&vb->lock);
935 if (vb->free < 1UL << order) 957 if (vb->free < (1UL << order)) {
936 goto next; 958 spin_unlock(&vb->lock);
959 continue;
960 }
937 961
938 i = VMAP_BBMAP_BITS - vb->free; 962 pages_off = VMAP_BBMAP_BITS - vb->free;
939 addr = vb->va->va_start + (i << PAGE_SHIFT); 963 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
940 BUG_ON(addr_to_vb_idx(addr) !=
941 addr_to_vb_idx(vb->va->va_start));
942 vb->free -= 1UL << order; 964 vb->free -= 1UL << order;
943 if (vb->free == 0) { 965 if (vb->free == 0) {
944 spin_lock(&vbq->lock); 966 spin_lock(&vbq->lock);
945 list_del_rcu(&vb->free_list); 967 list_del_rcu(&vb->free_list);
946 spin_unlock(&vbq->lock); 968 spin_unlock(&vbq->lock);
947 } 969 }
970
948 spin_unlock(&vb->lock); 971 spin_unlock(&vb->lock);
949 break; 972 break;
950next:
951 spin_unlock(&vb->lock);
952 } 973 }
953 974
954 put_cpu_var(vmap_block_queue); 975 put_cpu_var(vmap_block_queue);
955 rcu_read_unlock(); 976 rcu_read_unlock();
956 977
957 if (!addr) { 978 /* Allocate new block if nothing was found */
958 vb = new_vmap_block(gfp_mask); 979 if (!vaddr)
959 if (IS_ERR(vb)) 980 vaddr = new_vmap_block(order, gfp_mask);
960 return vb;
961 goto again;
962 }
963 981
964 return (void *)addr; 982 return vaddr;
965} 983}
966 984
967static void vb_free(const void *addr, unsigned long size) 985static void vb_free(const void *addr, unsigned long size)
@@ -979,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
979 order = get_order(size); 997 order = get_order(size);
980 998
981 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); 999 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1000 offset >>= PAGE_SHIFT;
982 1001
983 vb_idx = addr_to_vb_idx((unsigned long)addr); 1002 vb_idx = addr_to_vb_idx((unsigned long)addr);
984 rcu_read_lock(); 1003 rcu_read_lock();
@@ -989,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
989 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); 1008 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
990 1009
991 spin_lock(&vb->lock); 1010 spin_lock(&vb->lock);
992 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 1011
1012 /* Expand dirty range */
1013 vb->dirty_min = min(vb->dirty_min, offset);
1014 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
993 1015
994 vb->dirty += 1UL << order; 1016 vb->dirty += 1UL << order;
995 if (vb->dirty == VMAP_BBMAP_BITS) { 1017 if (vb->dirty == VMAP_BBMAP_BITS) {
@@ -1028,25 +1050,18 @@ void vm_unmap_aliases(void)
1028 1050
1029 rcu_read_lock(); 1051 rcu_read_lock();
1030 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1052 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1031 int i, j;
1032
1033 spin_lock(&vb->lock); 1053 spin_lock(&vb->lock);
1034 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 1054 if (vb->dirty) {
1035 if (i < VMAP_BBMAP_BITS) { 1055 unsigned long va_start = vb->va->va_start;
1036 unsigned long s, e; 1056 unsigned long s, e;
1037 1057
1038 j = find_last_bit(vb->dirty_map, 1058 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1039 VMAP_BBMAP_BITS); 1059 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1040 j = j + 1; /* need exclusive index */
1041 1060
1042 s = vb->va->va_start + (i << PAGE_SHIFT); 1061 start = min(s, start);
1043 e = vb->va->va_start + (j << PAGE_SHIFT); 1062 end = max(e, end);
1044 flush = 1;
1045 1063
1046 if (s < start) 1064 flush = 1;
1047 start = s;
1048 if (e > end)
1049 end = e;
1050 } 1065 }
1051 spin_unlock(&vb->lock); 1066 spin_unlock(&vb->lock);
1052 } 1067 }
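
The vmap_block rework replaces the per-page dirty bitmap with a [dirty_min, dirty_max) range, since the free path only needs a span to flush rather than the exact set of dirty pages. A sketch of the range-tracking idea, with an arbitrary stand-in value for VMAP_BBMAP_BITS:

#include <stdio.h>

#define VMAP_BBMAP_BITS 1024UL		/* value arbitrary for the example */

struct vmap_block_sketch {
	unsigned long dirty_min;	/* inclusive */
	unsigned long dirty_max;	/* exclusive */
};

static void block_init(struct vmap_block_sketch *vb)
{
	vb->dirty_min = VMAP_BBMAP_BITS;	/* empty range: min above max */
	vb->dirty_max = 0;
}

static void mark_dirty(struct vmap_block_sketch *vb,
		       unsigned long off, unsigned long npages)
{
	if (off < vb->dirty_min)
		vb->dirty_min = off;
	if (off + npages > vb->dirty_max)
		vb->dirty_max = off + npages;
}

int main(void)
{
	struct vmap_block_sketch vb;

	block_init(&vb);
	mark_dirty(&vb, 10, 4);		/* two frees far apart ... */
	mark_dirty(&vb, 100, 1);
	printf("flush pages [%lu, %lu)\n", vb.dirty_min, vb.dirty_max);	/* ... one flush span */
	return 0;
}
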
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..08bd7a3d464a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
12 */ 12 */
13 13
14/* 14/*
15 * This allocator is designed for use with zram. Thus, the allocator is
16 * supposed to work well under low memory conditions. In particular, it
17 * never attempts higher order page allocation which is very likely to
18 * fail under memory pressure. On the other hand, if we just use single
19 * (0-order) pages, it would suffer from very high fragmentation --
20 * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
21 * This was one of the major issues with its predecessor (xvmalloc).
22 *
23 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
24 * and links them together using various 'struct page' fields. These linked
25 * pages act as a single higher-order page i.e. an object can span 0-order
26 * page boundaries. The code refers to these linked pages as a single entity
27 * called zspage.
28 *
29 * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
30 * since this satisfies the requirements of all its current users (in the
31 * worst case, page is incompressible and is thus stored "as-is" i.e. in
32 * uncompressed form). For allocation requests larger than this size, failure
33 * is returned (see zs_malloc).
34 *
35 * Additionally, zs_malloc() does not return a dereferenceable pointer.
36 * Instead, it returns an opaque handle (unsigned long) which encodes actual
37 * location of the allocated object. The reason for this indirection is that
38 * zsmalloc does not keep zspages permanently mapped since that would cause
39 * issues on 32-bit systems where the VA region for kernel space mappings
40 * is very small. So, before using the allocating memory, the object has to
41 * be mapped using zs_map_object() to get a usable pointer and subsequently
42 * unmapped using zs_unmap_object().
43 *
44 * Following is how we use various fields and flags of underlying 15 * Following is how we use various fields and flags of underlying
45 * struct page(s) to form a zspage. 16 * struct page(s) to form a zspage.
46 * 17 *
@@ -57,6 +28,8 @@
57 * 28 *
58 * page->private (union with page->first_page): refers to the 29 * page->private (union with page->first_page): refers to the
59 * component page after the first page 30 * component page after the first page
31 * If the page is first_page for huge object, it stores handle.
32 * Look at size_class->huge.
60 * page->freelist: points to the first free object in zspage. 33 * page->freelist: points to the first free object in zspage.
61 * Free objects are linked together using in-place 34 * Free objects are linked together using in-place
62 * metadata. 35 * metadata.
@@ -78,6 +51,7 @@
78 51
79#include <linux/module.h> 52#include <linux/module.h>
80#include <linux/kernel.h> 53#include <linux/kernel.h>
54#include <linux/sched.h>
81#include <linux/bitops.h> 55#include <linux/bitops.h>
82#include <linux/errno.h> 56#include <linux/errno.h>
83#include <linux/highmem.h> 57#include <linux/highmem.h>
@@ -110,6 +84,8 @@
110#define ZS_MAX_ZSPAGE_ORDER 2 84#define ZS_MAX_ZSPAGE_ORDER 2
111#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) 85#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
112 86
87#define ZS_HANDLE_SIZE (sizeof(unsigned long))
88
113/* 89/*
114 * Object location (<PFN>, <obj_idx>) is encoded as 90 * Object location (<PFN>, <obj_idx>) is encoded as
115 * as single (unsigned long) handle value. 91 * as single (unsigned long) handle value.
@@ -133,13 +109,33 @@
133#endif 109#endif
134#endif 110#endif
135#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 111#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
136#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) 112
113/*
114 * Memory for allocating for handle keeps object position by
115 * encoding <page, obj_idx> and the encoded value has a room
116 * in least bit(ie, look at obj_to_location).
117 * We use the bit to synchronize between object access by
118 * user and migration.
119 */
120#define HANDLE_PIN_BIT 0
121
122/*
123 * Head in allocated object should have OBJ_ALLOCATED_TAG
124 * to identify the object was allocated or not.
125 * It's okay to add the status bit in the least bit because
126 * header keeps handle which is 4byte-aligned address so we
127 * have room for two bit at least.
128 */
129#define OBJ_ALLOCATED_TAG 1
130#define OBJ_TAG_BITS 1
131#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
137#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 132#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
138 133
139#define MAX(a, b) ((a) >= (b) ? (a) : (b)) 134#define MAX(a, b) ((a) >= (b) ? (a) : (b))
140/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ 135/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
141#define ZS_MIN_ALLOC_SIZE \ 136#define ZS_MIN_ALLOC_SIZE \
142 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) 137 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
138/* each chunk includes extra space to keep handle */
143#define ZS_MAX_ALLOC_SIZE PAGE_SIZE 139#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
144 140
145/* 141/*
@@ -172,6 +168,8 @@ enum fullness_group {
172enum zs_stat_type { 168enum zs_stat_type {
173 OBJ_ALLOCATED, 169 OBJ_ALLOCATED,
174 OBJ_USED, 170 OBJ_USED,
171 CLASS_ALMOST_FULL,
172 CLASS_ALMOST_EMPTY,
175 NR_ZS_STAT_TYPE, 173 NR_ZS_STAT_TYPE,
176}; 174};
177 175
@@ -216,6 +214,8 @@ struct size_class {
216 214
217 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 215 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
218 int pages_per_zspage; 216 int pages_per_zspage;
217 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
218 bool huge;
219 219
220#ifdef CONFIG_ZSMALLOC_STAT 220#ifdef CONFIG_ZSMALLOC_STAT
221 struct zs_size_stat stats; 221 struct zs_size_stat stats;
@@ -233,14 +233,24 @@ struct size_class {
233 * This must be power of 2 and less than or equal to ZS_ALIGN 233 * This must be power of 2 and less than or equal to ZS_ALIGN
234 */ 234 */
235struct link_free { 235struct link_free {
236 /* Handle of next free chunk (encodes <PFN, obj_idx>) */ 236 union {
237 void *next; 237 /*
238 * Position of next free chunk (encodes <PFN, obj_idx>)
239 * It is valid only for a non-allocated object
240 */
241 void *next;
242 /*
243 * Handle of allocated object.
244 */
245 unsigned long handle;
246 };
238}; 247};
239 248
240struct zs_pool { 249struct zs_pool {
241 char *name; 250 char *name;
242 251
243 struct size_class **size_class; 252 struct size_class **size_class;
253 struct kmem_cache *handle_cachep;
244 254
245 gfp_t flags; /* allocation flags used when growing pool */ 255 gfp_t flags; /* allocation flags used when growing pool */
246 atomic_long_t pages_allocated; 256 atomic_long_t pages_allocated;
@@ -267,8 +277,37 @@ struct mapping_area {
267#endif 277#endif
268 char *vm_addr; /* address of kmap_atomic()'ed pages */ 278 char *vm_addr; /* address of kmap_atomic()'ed pages */
269 enum zs_mapmode vm_mm; /* mapping mode */ 279 enum zs_mapmode vm_mm; /* mapping mode */
280 bool huge;
270}; 281};
271 282
283static int create_handle_cache(struct zs_pool *pool)
284{
285 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
286 0, 0, NULL);
287 return pool->handle_cachep ? 0 : 1;
288}
289
290static void destroy_handle_cache(struct zs_pool *pool)
291{
292 kmem_cache_destroy(pool->handle_cachep);
293}
294
295static unsigned long alloc_handle(struct zs_pool *pool)
296{
297 return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
298 pool->flags & ~__GFP_HIGHMEM);
299}
300
301static void free_handle(struct zs_pool *pool, unsigned long handle)
302{
303 kmem_cache_free(pool->handle_cachep, (void *)handle);
304}
305
306static void record_obj(unsigned long handle, unsigned long obj)
307{
308 *(unsigned long *)handle = obj;
309}
310
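The helpers above are the whole indirection layer: a handle is simply the address of a slab-allocated word, and record_obj() writes the encoded object location into that word. A minimal userspace sketch of the idea (malloc() stands in for kmem_cache_alloc(); the names mirror the helpers above but nothing here is kernel code):

#include <stdio.h>
#include <stdlib.h>

static unsigned long alloc_handle(void)
{
	/* stand-in for kmem_cache_alloc(pool->handle_cachep, ...) */
	return (unsigned long)malloc(sizeof(unsigned long));
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	*(unsigned long *)handle = obj;
}

int main(void)
{
	unsigned long handle = alloc_handle();

	record_obj(handle, 0xabc0UL);	/* object placed at encoded location */
	record_obj(handle, 0xdef0UL);	/* object moved; handle value unchanged */
	printf("handle %#lx -> obj %#lx\n", handle, *(unsigned long *)handle);
	free((void *)handle);
	return 0;
}

Because callers hold only the handle, compaction can relocate the underlying object and just rewrite the slot.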
272/* zpool driver */ 311/* zpool driver */
273 312
274#ifdef CONFIG_ZPOOL 313#ifdef CONFIG_ZPOOL
@@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = {
346MODULE_ALIAS("zpool-zsmalloc"); 385MODULE_ALIAS("zpool-zsmalloc");
347#endif /* CONFIG_ZPOOL */ 386#endif /* CONFIG_ZPOOL */
348 387
388static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
389{
390 return pages_per_zspage * PAGE_SIZE / size;
391}
392
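For a feel of the numbers, a quick worked example of get_maxobj_per_zspage() (PAGE_SIZE = 4096 assumed; the class sizes below are illustrative, not ones the allocator is guaranteed to create):

#include <stdio.h>

#define PAGE_SIZE 4096

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

int main(void)
{
	/* four pages of 3264-byte chunks: 16384 / 3264 = 5 objects */
	printf("%u\n", get_maxobj_per_zspage(3264, 4));
	/* one page holding a single ~3.5KB chunk: the class->huge case */
	printf("%u\n", get_maxobj_per_zspage(3600, 1));
	return 0;
}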
349/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 393/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
350static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 394static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
351 395
@@ -396,9 +440,182 @@ static int get_size_class_index(int size)
396 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, 440 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
397 ZS_SIZE_CLASS_DELTA); 441 ZS_SIZE_CLASS_DELTA);
398 442
399 return idx; 443 return min(zs_size_classes - 1, idx);
444}
445
446#ifdef CONFIG_ZSMALLOC_STAT
447
448static inline void zs_stat_inc(struct size_class *class,
449 enum zs_stat_type type, unsigned long cnt)
450{
451 class->stats.objs[type] += cnt;
452}
453
454static inline void zs_stat_dec(struct size_class *class,
455 enum zs_stat_type type, unsigned long cnt)
456{
457 class->stats.objs[type] -= cnt;
458}
459
460static inline unsigned long zs_stat_get(struct size_class *class,
461 enum zs_stat_type type)
462{
463 return class->stats.objs[type];
464}
465
466static int __init zs_stat_init(void)
467{
468 if (!debugfs_initialized())
469 return -ENODEV;
470
471 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
472 if (!zs_stat_root)
473 return -ENOMEM;
474
475 return 0;
476}
477
478static void __exit zs_stat_exit(void)
479{
480 debugfs_remove_recursive(zs_stat_root);
481}
482
483static int zs_stats_size_show(struct seq_file *s, void *v)
484{
485 int i;
486 struct zs_pool *pool = s->private;
487 struct size_class *class;
488 int objs_per_zspage;
489 unsigned long class_almost_full, class_almost_empty;
490 unsigned long obj_allocated, obj_used, pages_used;
491 unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
492 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
493
494 seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
495 "class", "size", "almost_full", "almost_empty",
496 "obj_allocated", "obj_used", "pages_used",
497 "pages_per_zspage");
498
499 for (i = 0; i < zs_size_classes; i++) {
500 class = pool->size_class[i];
501
502 if (class->index != i)
503 continue;
504
505 spin_lock(&class->lock);
506 class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
507 class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
508 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
509 obj_used = zs_stat_get(class, OBJ_USED);
510 spin_unlock(&class->lock);
511
512 objs_per_zspage = get_maxobj_per_zspage(class->size,
513 class->pages_per_zspage);
514 pages_used = obj_allocated / objs_per_zspage *
515 class->pages_per_zspage;
516
517 seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
518 i, class->size, class_almost_full, class_almost_empty,
519 obj_allocated, obj_used, pages_used,
520 class->pages_per_zspage);
521
522 total_class_almost_full += class_almost_full;
523 total_class_almost_empty += class_almost_empty;
524 total_objs += obj_allocated;
525 total_used_objs += obj_used;
526 total_pages += pages_used;
527 }
528
529 seq_puts(s, "\n");
530 seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
531 "Total", "", total_class_almost_full,
532 total_class_almost_empty, total_objs,
533 total_used_objs, total_pages);
534
535 return 0;
536}
537
538static int zs_stats_size_open(struct inode *inode, struct file *file)
539{
540 return single_open(file, zs_stats_size_show, inode->i_private);
541}
542
543static const struct file_operations zs_stat_size_ops = {
544 .open = zs_stats_size_open,
545 .read = seq_read,
546 .llseek = seq_lseek,
547 .release = single_release,
548};
549
550static int zs_pool_stat_create(char *name, struct zs_pool *pool)
551{
552 struct dentry *entry;
553
554 if (!zs_stat_root)
555 return -ENODEV;
556
557 entry = debugfs_create_dir(name, zs_stat_root);
558 if (!entry) {
559 pr_warn("debugfs dir <%s> creation failed\n", name);
560 return -ENOMEM;
561 }
562 pool->stat_dentry = entry;
563
564 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
565 pool->stat_dentry, pool, &zs_stat_size_ops);
566 if (!entry) {
567 pr_warn("%s: debugfs file entry <%s> creation failed\n",
568 name, "classes");
569 return -ENOMEM;
570 }
571
572 return 0;
573}
574
575static void zs_pool_stat_destroy(struct zs_pool *pool)
576{
577 debugfs_remove_recursive(pool->stat_dentry);
578}
579
580#else /* CONFIG_ZSMALLOC_STAT */
581
582static inline void zs_stat_inc(struct size_class *class,
583 enum zs_stat_type type, unsigned long cnt)
584{
585}
586
587static inline void zs_stat_dec(struct size_class *class,
588 enum zs_stat_type type, unsigned long cnt)
589{
590}
591
592static inline unsigned long zs_stat_get(struct size_class *class,
593 enum zs_stat_type type)
594{
595 return 0;
596}
597
598static int __init zs_stat_init(void)
599{
600 return 0;
601}
602
603static void __exit zs_stat_exit(void)
604{
605}
606
607static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
608{
609 return 0;
610}
611
612static inline void zs_pool_stat_destroy(struct zs_pool *pool)
613{
400} 614}
401 615
616#endif
617
618
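With CONFIG_ZSMALLOC_STAT enabled and debugfs mounted at /sys/kernel/debug (the usual location), the per-class counters land in a "classes" file under a directory named after the pool. A small userspace sketch that dumps it; the pool name "zram0" is only an example:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/zsmalloc/zram0/classes";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}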
402/* 619/*
403 * For each size class, zspages are divided into different groups 620 * For each size class, zspages are divided into different groups
404 * depending on how "full" they are. This was done so that we could 621 * depending on how "full" they are. This was done so that we could
@@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page)
419 fg = ZS_EMPTY; 636 fg = ZS_EMPTY;
420 else if (inuse == max_objects) 637 else if (inuse == max_objects)
421 fg = ZS_FULL; 638 fg = ZS_FULL;
422 else if (inuse <= max_objects / fullness_threshold_frac) 639 else if (inuse <= 3 * max_objects / fullness_threshold_frac)
423 fg = ZS_ALMOST_EMPTY; 640 fg = ZS_ALMOST_EMPTY;
424 else 641 else
425 fg = ZS_ALMOST_FULL; 642 fg = ZS_ALMOST_FULL;
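The threshold change above is what makes compaction worthwhile: assuming fullness_threshold_frac is 4 (its value in zsmalloc.c, defined outside this hunk), the ZS_ALMOST_EMPTY cut-off moves from 1/4 to 3/4 of max_objects, so far more partially used zspages become candidates for the migration source list. A tiny worked example:

#include <stdio.h>

int main(void)
{
	int max_objects = 8, fullness_threshold_frac = 4;

	/* old rule: inuse <= 8 / 4  -> almost-empty up to 2 objects in use */
	printf("old cut-off: %d\n", max_objects / fullness_threshold_frac);
	/* new rule: inuse <= 3 * 8 / 4 -> almost-empty up to 6 objects in use */
	printf("new cut-off: %d\n", 3 * max_objects / fullness_threshold_frac);
	return 0;
}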
@@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
448 list_add_tail(&page->lru, &(*head)->lru); 665 list_add_tail(&page->lru, &(*head)->lru);
449 666
450 *head = page; 667 *head = page;
668 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
669 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
451} 670}
452 671
453/* 672/*
@@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
473 struct page, lru); 692 struct page, lru);
474 693
475 list_del_init(&page->lru); 694 list_del_init(&page->lru);
695 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
696 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
476} 697}
477 698
478/* 699/*
@@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
484 * page from the freelist of the old fullness group to that of the new 705 * page from the freelist of the old fullness group to that of the new
485 * fullness group. 706 * fullness group.
486 */ 707 */
487static enum fullness_group fix_fullness_group(struct zs_pool *pool, 708static enum fullness_group fix_fullness_group(struct size_class *class,
488 struct page *page) 709 struct page *page)
489{ 710{
490 int class_idx; 711 int class_idx;
491 struct size_class *class;
492 enum fullness_group currfg, newfg; 712 enum fullness_group currfg, newfg;
493 713
494 BUG_ON(!is_first_page(page)); 714 BUG_ON(!is_first_page(page));
@@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
498 if (newfg == currfg) 718 if (newfg == currfg)
499 goto out; 719 goto out;
500 720
501 class = pool->size_class[class_idx];
502 remove_zspage(page, class, currfg); 721 remove_zspage(page, class, currfg);
503 insert_zspage(page, class, newfg); 722 insert_zspage(page, class, newfg);
504 set_zspage_mapping(page, class_idx, newfg); 723 set_zspage_mapping(page, class_idx, newfg);
@@ -512,7 +731,8 @@ out:
512 * to form a zspage for each size class. This is important 731 * to form a zspage for each size class. This is important
513 * to reduce wastage due to unusable space left at end of 732 * to reduce wastage due to unusable space left at end of
514 * each zspage which is given as: 733 * each zspage which is given as:
515 * wastage = Zp - Zp % size_class 734 * wastage = Zp % class_size
735 * usage = Zp - wastage
516 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 736 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
517 * 737 *
518 * For example, for size class of 3/8 * PAGE_SIZE, we should 738 * For example, for size class of 3/8 * PAGE_SIZE, we should
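A quick worked example of the corrected wastage formula (PAGE_SIZE = 4096 assumed): for the 3/8 * PAGE_SIZE = 1536-byte class mentioned above, a one-page zspage wastes 4096 % 1536 = 1024 bytes, while a three-page zspage wastes 12288 % 1536 = 0, which is why three pages are grouped for that class.

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096, class_size = 1536;

	for (int k = 1; k <= 4; k++) {
		unsigned long zp = k * page_size;	/* zspage size */

		printf("k=%d wastage=%lu usage=%lu\n",
		       k, zp % class_size, zp - zp % class_size);
	}
	return 0;
}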
@@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page)
571 791
572/* 792/*
573 * Encode <page, obj_idx> as a single handle value. 793 * Encode <page, obj_idx> as a single handle value.
574 * On hardware platforms with physical memory starting at 0x0 the pfn 794 * We use the least bit of handle for tagging.
575 * could be 0 so we ensure that the handle will never be 0 by adjusting the
576 * encoded obj_idx value before encoding.
577 */ 795 */
578static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) 796static void *location_to_obj(struct page *page, unsigned long obj_idx)
579{ 797{
580 unsigned long handle; 798 unsigned long obj;
581 799
582 if (!page) { 800 if (!page) {
583 BUG_ON(obj_idx); 801 BUG_ON(obj_idx);
584 return NULL; 802 return NULL;
585 } 803 }
586 804
587 handle = page_to_pfn(page) << OBJ_INDEX_BITS; 805 obj = page_to_pfn(page) << OBJ_INDEX_BITS;
588 handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); 806 obj |= ((obj_idx) & OBJ_INDEX_MASK);
807 obj <<= OBJ_TAG_BITS;
589 808
590 return (void *)handle; 809 return (void *)obj;
591} 810}
592 811
593/* 812/*
594 * Decode <page, obj_idx> pair from the given object handle. We adjust the 813 * Decode <page, obj_idx> pair from the given object handle. We adjust the
595 * decoded obj_idx back to its original value since it was adjusted in 814 * decoded obj_idx back to its original value since it was adjusted in
596 * obj_location_to_handle(). 815 * location_to_obj().
597 */ 816 */
598static void obj_handle_to_location(unsigned long handle, struct page **page, 817static void obj_to_location(unsigned long obj, struct page **page,
599 unsigned long *obj_idx) 818 unsigned long *obj_idx)
600{ 819{
601 *page = pfn_to_page(handle >> OBJ_INDEX_BITS); 820 obj >>= OBJ_TAG_BITS;
602 *obj_idx = (handle & OBJ_INDEX_MASK) - 1; 821 *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
822 *obj_idx = (obj & OBJ_INDEX_MASK);
823}
824
825static unsigned long handle_to_obj(unsigned long handle)
826{
827 return *(unsigned long *)handle;
828}
829
830static unsigned long obj_to_head(struct size_class *class, struct page *page,
831 void *obj)
832{
833 if (class->huge) {
834 VM_BUG_ON(!is_first_page(page));
835 return *(unsigned long *)page_private(page);
836 } else
837 return *(unsigned long *)obj;
603} 838}
604 839
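A self-contained round trip of the encoding used by location_to_obj()/obj_to_location(), written as a userspace sketch that takes a raw pfn instead of a struct page and reuses the 64-bit layout assumed earlier; bit 0 of the encoded value stays clear so it can later be used as a tag bit.

#include <assert.h>
#include <stdio.h>

#define BITS_PER_LONG	64
#define _PFN_BITS	(46 - 12)	/* assumed MAX_PHYSMEM_BITS/PAGE_SHIFT */
#define OBJ_TAG_BITS	1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned long obj_idx)
{
	unsigned long obj = (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);

	return obj << OBJ_TAG_BITS;	/* bit 0 left free for tagging */
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
			    unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*pfn = obj >> OBJ_INDEX_BITS;
	*obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
	unsigned long pfn, idx;

	obj_to_location(location_to_obj(0x12345, 7), &pfn, &idx);
	assert(pfn == 0x12345 && idx == 7);
	printf("pfn=%#lx obj_idx=%lu\n", pfn, idx);
	return 0;
}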
605static unsigned long obj_idx_to_offset(struct page *page, 840static unsigned long obj_idx_to_offset(struct page *page,
@@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
613 return off + obj_idx * class_size; 848 return off + obj_idx * class_size;
614} 849}
615 850
851static inline int trypin_tag(unsigned long handle)
852{
853 unsigned long *ptr = (unsigned long *)handle;
854
855 return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
856}
857
858static void pin_tag(unsigned long handle)
859{
860 while (!trypin_tag(handle));
861}
862
863static void unpin_tag(unsigned long handle)
864{
865 unsigned long *ptr = (unsigned long *)handle;
866
867 clear_bit_unlock(HANDLE_PIN_BIT, ptr);
868}
869
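The pin bit is effectively a one-bit lock on the word stored in the handle slot; bit 0 is free there because the encoded obj was shifted left by OBJ_TAG_BITS. The kernel uses test_and_set_bit_lock()/clear_bit_unlock(); the following userspace approximation with C11 atomics only illustrates the protocol (memory-ordering details differ):

#include <stdatomic.h>
#include <stdio.h>

#define HANDLE_PIN_BIT 0

static int trypin_tag(atomic_ulong *slot)
{
	unsigned long old = atomic_fetch_or(slot, 1UL << HANDLE_PIN_BIT);

	return !(old & (1UL << HANDLE_PIN_BIT));	/* 1 if we took the bit */
}

static void pin_tag(atomic_ulong *slot)
{
	while (!trypin_tag(slot))
		;					/* spin until unpinned */
}

static void unpin_tag(atomic_ulong *slot)
{
	atomic_fetch_and(slot, ~(1UL << HANDLE_PIN_BIT));
}

int main(void)
{
	atomic_ulong slot = 0x2468;	/* encoded obj value, bit 0 clear */

	pin_tag(&slot);
	printf("pinned, slot=%#lx\n", (unsigned long)atomic_load(&slot));
	unpin_tag(&slot);
	printf("unpinned, slot=%#lx\n", (unsigned long)atomic_load(&slot));
	return 0;
}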
616static void reset_page(struct page *page) 870static void reset_page(struct page *page)
617{ 871{
618 clear_bit(PG_private, &page->flags); 872 clear_bit(PG_private, &page->flags);
@@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
674 link = (struct link_free *)vaddr + off / sizeof(*link); 928 link = (struct link_free *)vaddr + off / sizeof(*link);
675 929
676 while ((off += class->size) < PAGE_SIZE) { 930 while ((off += class->size) < PAGE_SIZE) {
677 link->next = obj_location_to_handle(page, i++); 931 link->next = location_to_obj(page, i++);
678 link += class->size / sizeof(*link); 932 link += class->size / sizeof(*link);
679 } 933 }
680 934
@@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
684 * page (if present) 938 * page (if present)
685 */ 939 */
686 next_page = get_next_page(page); 940 next_page = get_next_page(page);
687 link->next = obj_location_to_handle(next_page, 0); 941 link->next = location_to_obj(next_page, 0);
688 kunmap_atomic(vaddr); 942 kunmap_atomic(vaddr);
689 page = next_page; 943 page = next_page;
690 off %= PAGE_SIZE; 944 off %= PAGE_SIZE;
@@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
738 992
739 init_zspage(first_page, class); 993 init_zspage(first_page, class);
740 994
741 first_page->freelist = obj_location_to_handle(first_page, 0); 995 first_page->freelist = location_to_obj(first_page, 0);
742 /* Maximum number of objects we can store in this zspage */ 996 /* Maximum number of objects we can store in this zspage */
743 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; 997 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
744 998
@@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area,
860{ 1114{
861 int sizes[2]; 1115 int sizes[2];
862 void *addr; 1116 void *addr;
863 char *buf = area->vm_buf; 1117 char *buf;
864 1118
865 /* no write fastpath */ 1119 /* no write fastpath */
866 if (area->vm_mm == ZS_MM_RO) 1120 if (area->vm_mm == ZS_MM_RO)
867 goto out; 1121 goto out;
868 1122
1123 buf = area->vm_buf;
1124 if (!area->huge) {
1125 buf = buf + ZS_HANDLE_SIZE;
1126 size -= ZS_HANDLE_SIZE;
1127 off += ZS_HANDLE_SIZE;
1128 }
1129
869 sizes[0] = PAGE_SIZE - off; 1130 sizes[0] = PAGE_SIZE - off;
870 sizes[1] = size - sizes[0]; 1131 sizes[1] = size - sizes[0];
871 1132
@@ -952,11 +1213,6 @@ static void init_zs_size_classes(void)
952 zs_size_classes = nr; 1213 zs_size_classes = nr;
953} 1214}
954 1215
955static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
956{
957 return pages_per_zspage * PAGE_SIZE / size;
958}
959
960static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) 1216static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
961{ 1217{
962 if (prev->pages_per_zspage != pages_per_zspage) 1218 if (prev->pages_per_zspage != pages_per_zspage)
@@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
969 return true; 1225 return true;
970} 1226}
971 1227
972#ifdef CONFIG_ZSMALLOC_STAT 1228static bool zspage_full(struct page *page)
973
974static inline void zs_stat_inc(struct size_class *class,
975 enum zs_stat_type type, unsigned long cnt)
976{
977 class->stats.objs[type] += cnt;
978}
979
980static inline void zs_stat_dec(struct size_class *class,
981 enum zs_stat_type type, unsigned long cnt)
982{
983 class->stats.objs[type] -= cnt;
984}
985
986static inline unsigned long zs_stat_get(struct size_class *class,
987 enum zs_stat_type type)
988{
989 return class->stats.objs[type];
990}
991
992static int __init zs_stat_init(void)
993{
994 if (!debugfs_initialized())
995 return -ENODEV;
996
997 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
998 if (!zs_stat_root)
999 return -ENOMEM;
1000
1001 return 0;
1002}
1003
1004static void __exit zs_stat_exit(void)
1005{
1006 debugfs_remove_recursive(zs_stat_root);
1007}
1008
1009static int zs_stats_size_show(struct seq_file *s, void *v)
1010{ 1229{
1011 int i; 1230 BUG_ON(!is_first_page(page));
1012 struct zs_pool *pool = s->private;
1013 struct size_class *class;
1014 int objs_per_zspage;
1015 unsigned long obj_allocated, obj_used, pages_used;
1016 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
1017
1018 seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
1019 "obj_allocated", "obj_used", "pages_used");
1020
1021 for (i = 0; i < zs_size_classes; i++) {
1022 class = pool->size_class[i];
1023
1024 if (class->index != i)
1025 continue;
1026
1027 spin_lock(&class->lock);
1028 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
1029 obj_used = zs_stat_get(class, OBJ_USED);
1030 spin_unlock(&class->lock);
1031
1032 objs_per_zspage = get_maxobj_per_zspage(class->size,
1033 class->pages_per_zspage);
1034 pages_used = obj_allocated / objs_per_zspage *
1035 class->pages_per_zspage;
1036
1037 seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
1038 class->size, obj_allocated, obj_used, pages_used);
1039
1040 total_objs += obj_allocated;
1041 total_used_objs += obj_used;
1042 total_pages += pages_used;
1043 }
1044
1045 seq_puts(s, "\n");
1046 seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
1047 total_objs, total_used_objs, total_pages);
1048
1049 return 0;
1050}
1051
1052static int zs_stats_size_open(struct inode *inode, struct file *file)
1053{
1054 return single_open(file, zs_stats_size_show, inode->i_private);
1055}
1056
1057static const struct file_operations zs_stat_size_ops = {
1058 .open = zs_stats_size_open,
1059 .read = seq_read,
1060 .llseek = seq_lseek,
1061 .release = single_release,
1062};
1063
1064static int zs_pool_stat_create(char *name, struct zs_pool *pool)
1065{
1066 struct dentry *entry;
1067
1068 if (!zs_stat_root)
1069 return -ENODEV;
1070
1071 entry = debugfs_create_dir(name, zs_stat_root);
1072 if (!entry) {
1073 pr_warn("debugfs dir <%s> creation failed\n", name);
1074 return -ENOMEM;
1075 }
1076 pool->stat_dentry = entry;
1077
1078 entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
1079 pool->stat_dentry, pool, &zs_stat_size_ops);
1080 if (!entry) {
1081 pr_warn("%s: debugfs file entry <%s> creation failed\n",
1082 name, "obj_in_classes");
1083 return -ENOMEM;
1084 }
1085
1086 return 0;
1087}
1088
1089static void zs_pool_stat_destroy(struct zs_pool *pool)
1090{
1091 debugfs_remove_recursive(pool->stat_dentry);
1092}
1093
1094#else /* CONFIG_ZSMALLOC_STAT */
1095
1096static inline void zs_stat_inc(struct size_class *class,
1097 enum zs_stat_type type, unsigned long cnt)
1098{
1099}
1100
1101static inline void zs_stat_dec(struct size_class *class,
1102 enum zs_stat_type type, unsigned long cnt)
1103{
1104}
1105
1106static inline unsigned long zs_stat_get(struct size_class *class,
1107 enum zs_stat_type type)
1108{
1109 return 0;
1110}
1111
1112static int __init zs_stat_init(void)
1113{
1114 return 0;
1115}
1116
1117static void __exit zs_stat_exit(void)
1118{
1119}
1120
1121static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
1122{
1123 return 0;
1124}
1125 1231
1126static inline void zs_pool_stat_destroy(struct zs_pool *pool) 1232 return page->inuse == page->objects;
1127{
1128} 1233}
1129 1234
1130#endif
1131
1132unsigned long zs_get_total_pages(struct zs_pool *pool) 1235unsigned long zs_get_total_pages(struct zs_pool *pool)
1133{ 1236{
1134 return atomic_long_read(&pool->pages_allocated); 1237 return atomic_long_read(&pool->pages_allocated);
@@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1153 enum zs_mapmode mm) 1256 enum zs_mapmode mm)
1154{ 1257{
1155 struct page *page; 1258 struct page *page;
1156 unsigned long obj_idx, off; 1259 unsigned long obj, obj_idx, off;
1157 1260
1158 unsigned int class_idx; 1261 unsigned int class_idx;
1159 enum fullness_group fg; 1262 enum fullness_group fg;
1160 struct size_class *class; 1263 struct size_class *class;
1161 struct mapping_area *area; 1264 struct mapping_area *area;
1162 struct page *pages[2]; 1265 struct page *pages[2];
1266 void *ret;
1163 1267
1164 BUG_ON(!handle); 1268 BUG_ON(!handle);
1165 1269
@@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1170 */ 1274 */
1171 BUG_ON(in_interrupt()); 1275 BUG_ON(in_interrupt());
1172 1276
1173 obj_handle_to_location(handle, &page, &obj_idx); 1277 /* From now on, migration cannot move the object */
1278 pin_tag(handle);
1279
1280 obj = handle_to_obj(handle);
1281 obj_to_location(obj, &page, &obj_idx);
1174 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1282 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1175 class = pool->size_class[class_idx]; 1283 class = pool->size_class[class_idx];
1176 off = obj_idx_to_offset(page, obj_idx, class->size); 1284 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1180 if (off + class->size <= PAGE_SIZE) { 1288 if (off + class->size <= PAGE_SIZE) {
1181 /* this object is contained entirely within a page */ 1289 /* this object is contained entirely within a page */
1182 area->vm_addr = kmap_atomic(page); 1290 area->vm_addr = kmap_atomic(page);
1183 return area->vm_addr + off; 1291 ret = area->vm_addr + off;
1292 goto out;
1184 } 1293 }
1185 1294
1186 /* this object spans two pages */ 1295 /* this object spans two pages */
@@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1188 pages[1] = get_next_page(page); 1297 pages[1] = get_next_page(page);
1189 BUG_ON(!pages[1]); 1298 BUG_ON(!pages[1]);
1190 1299
1191 return __zs_map_object(area, pages, off, class->size); 1300 ret = __zs_map_object(area, pages, off, class->size);
1301out:
1302 if (!class->huge)
1303 ret += ZS_HANDLE_SIZE;
1304
1305 return ret;
1192} 1306}
1193EXPORT_SYMBOL_GPL(zs_map_object); 1307EXPORT_SYMBOL_GPL(zs_map_object);
1194 1308
1195void zs_unmap_object(struct zs_pool *pool, unsigned long handle) 1309void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1196{ 1310{
1197 struct page *page; 1311 struct page *page;
1198 unsigned long obj_idx, off; 1312 unsigned long obj, obj_idx, off;
1199 1313
1200 unsigned int class_idx; 1314 unsigned int class_idx;
1201 enum fullness_group fg; 1315 enum fullness_group fg;
@@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1204 1318
1205 BUG_ON(!handle); 1319 BUG_ON(!handle);
1206 1320
1207 obj_handle_to_location(handle, &page, &obj_idx); 1321 obj = handle_to_obj(handle);
1322 obj_to_location(obj, &page, &obj_idx);
1208 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1323 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1209 class = pool->size_class[class_idx]; 1324 class = pool->size_class[class_idx];
1210 off = obj_idx_to_offset(page, obj_idx, class->size); 1325 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1222 __zs_unmap_object(area, pages, off, class->size); 1337 __zs_unmap_object(area, pages, off, class->size);
1223 } 1338 }
1224 put_cpu_var(zs_map_area); 1339 put_cpu_var(zs_map_area);
1340 unpin_tag(handle);
1225} 1341}
1226EXPORT_SYMBOL_GPL(zs_unmap_object); 1342EXPORT_SYMBOL_GPL(zs_unmap_object);
1227 1343
1344static unsigned long obj_malloc(struct page *first_page,
1345 struct size_class *class, unsigned long handle)
1346{
1347 unsigned long obj;
1348 struct link_free *link;
1349
1350 struct page *m_page;
1351 unsigned long m_objidx, m_offset;
1352 void *vaddr;
1353
1354 handle |= OBJ_ALLOCATED_TAG;
1355 obj = (unsigned long)first_page->freelist;
1356 obj_to_location(obj, &m_page, &m_objidx);
1357 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1358
1359 vaddr = kmap_atomic(m_page);
1360 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1361 first_page->freelist = link->next;
1362 if (!class->huge)
1363 /* record handle in the header of allocated chunk */
1364 link->handle = handle;
1365 else
1366 /* record handle in first_page->private */
1367 set_page_private(first_page, handle);
1368 kunmap_atomic(vaddr);
1369 first_page->inuse++;
1370 zs_stat_inc(class, OBJ_USED, 1);
1371
1372 return obj;
1373}
1374
1375
1228/** 1376/**
1229 * zs_malloc - Allocate block of given size from pool. 1377 * zs_malloc - Allocate block of given size from pool.
1230 * @pool: pool to allocate from 1378 * @pool: pool to allocate from
@@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
1236 */ 1384 */
1237unsigned long zs_malloc(struct zs_pool *pool, size_t size) 1385unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1238{ 1386{
1239 unsigned long obj; 1387 unsigned long handle, obj;
1240 struct link_free *link;
1241 struct size_class *class; 1388 struct size_class *class;
1242 void *vaddr; 1389 struct page *first_page;
1243
1244 struct page *first_page, *m_page;
1245 unsigned long m_objidx, m_offset;
1246 1390
1247 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1391 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1248 return 0; 1392 return 0;
1249 1393
1394 handle = alloc_handle(pool);
1395 if (!handle)
1396 return 0;
1397
1398 /* extra space in chunk to keep the handle */
1399 size += ZS_HANDLE_SIZE;
1250 class = pool->size_class[get_size_class_index(size)]; 1400 class = pool->size_class[get_size_class_index(size)];
1251 1401
1252 spin_lock(&class->lock); 1402 spin_lock(&class->lock);
@@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1255 if (!first_page) { 1405 if (!first_page) {
1256 spin_unlock(&class->lock); 1406 spin_unlock(&class->lock);
1257 first_page = alloc_zspage(class, pool->flags); 1407 first_page = alloc_zspage(class, pool->flags);
1258 if (unlikely(!first_page)) 1408 if (unlikely(!first_page)) {
1409 free_handle(pool, handle);
1259 return 0; 1410 return 0;
1411 }
1260 1412
1261 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1413 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1262 atomic_long_add(class->pages_per_zspage, 1414 atomic_long_add(class->pages_per_zspage,
@@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1267 class->size, class->pages_per_zspage)); 1419 class->size, class->pages_per_zspage));
1268 } 1420 }
1269 1421
1270 obj = (unsigned long)first_page->freelist; 1422 obj = obj_malloc(first_page, class, handle);
1271 obj_handle_to_location(obj, &m_page, &m_objidx);
1272 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1273
1274 vaddr = kmap_atomic(m_page);
1275 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1276 first_page->freelist = link->next;
1277 memset(link, POISON_INUSE, sizeof(*link));
1278 kunmap_atomic(vaddr);
1279
1280 first_page->inuse++;
1281 zs_stat_inc(class, OBJ_USED, 1);
1282 /* Now move the zspage to another fullness group, if required */ 1423 /* Now move the zspage to another fullness group, if required */
1283 fix_fullness_group(pool, first_page); 1424 fix_fullness_group(class, first_page);
1425 record_obj(handle, obj);
1284 spin_unlock(&class->lock); 1426 spin_unlock(&class->lock);
1285 1427
1286 return obj; 1428 return handle;
1287} 1429}
1288EXPORT_SYMBOL_GPL(zs_malloc); 1430EXPORT_SYMBOL_GPL(zs_malloc);
1289 1431
1290void zs_free(struct zs_pool *pool, unsigned long obj) 1432static void obj_free(struct zs_pool *pool, struct size_class *class,
1433 unsigned long obj)
1291{ 1434{
1292 struct link_free *link; 1435 struct link_free *link;
1293 struct page *first_page, *f_page; 1436 struct page *first_page, *f_page;
1294 unsigned long f_objidx, f_offset; 1437 unsigned long f_objidx, f_offset;
1295 void *vaddr; 1438 void *vaddr;
1296
1297 int class_idx; 1439 int class_idx;
1298 struct size_class *class;
1299 enum fullness_group fullness; 1440 enum fullness_group fullness;
1300 1441
1301 if (unlikely(!obj)) 1442 BUG_ON(!obj);
1302 return;
1303 1443
1304 obj_handle_to_location(obj, &f_page, &f_objidx); 1444 obj &= ~OBJ_ALLOCATED_TAG;
1445 obj_to_location(obj, &f_page, &f_objidx);
1305 first_page = get_first_page(f_page); 1446 first_page = get_first_page(f_page);
1306 1447
1307 get_zspage_mapping(first_page, &class_idx, &fullness); 1448 get_zspage_mapping(first_page, &class_idx, &fullness);
1308 class = pool->size_class[class_idx];
1309 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1449 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1310 1450
1311 spin_lock(&class->lock); 1451 vaddr = kmap_atomic(f_page);
1312 1452
1313 /* Insert this object in containing zspage's freelist */ 1453 /* Insert this object in containing zspage's freelist */
1314 vaddr = kmap_atomic(f_page);
1315 link = (struct link_free *)(vaddr + f_offset); 1454 link = (struct link_free *)(vaddr + f_offset);
1316 link->next = first_page->freelist; 1455 link->next = first_page->freelist;
1456 if (class->huge)
1457 set_page_private(first_page, 0);
1317 kunmap_atomic(vaddr); 1458 kunmap_atomic(vaddr);
1318 first_page->freelist = (void *)obj; 1459 first_page->freelist = (void *)obj;
1319
1320 first_page->inuse--; 1460 first_page->inuse--;
1321 fullness = fix_fullness_group(pool, first_page);
1322
1323 zs_stat_dec(class, OBJ_USED, 1); 1461 zs_stat_dec(class, OBJ_USED, 1);
1324 if (fullness == ZS_EMPTY) 1462}
1463
1464void zs_free(struct zs_pool *pool, unsigned long handle)
1465{
1466 struct page *first_page, *f_page;
1467 unsigned long obj, f_objidx;
1468 int class_idx;
1469 struct size_class *class;
1470 enum fullness_group fullness;
1471
1472 if (unlikely(!handle))
1473 return;
1474
1475 pin_tag(handle);
1476 obj = handle_to_obj(handle);
1477 obj_to_location(obj, &f_page, &f_objidx);
1478 first_page = get_first_page(f_page);
1479
1480 get_zspage_mapping(first_page, &class_idx, &fullness);
1481 class = pool->size_class[class_idx];
1482
1483 spin_lock(&class->lock);
1484 obj_free(pool, class, obj);
1485 fullness = fix_fullness_group(class, first_page);
1486 if (fullness == ZS_EMPTY) {
1325 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1487 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1326 class->size, class->pages_per_zspage)); 1488 class->size, class->pages_per_zspage));
1327 1489 atomic_long_sub(class->pages_per_zspage,
1490 &pool->pages_allocated);
1491 free_zspage(first_page);
1492 }
1328 spin_unlock(&class->lock); 1493 spin_unlock(&class->lock);
1494 unpin_tag(handle);
1495
1496 free_handle(pool, handle);
1497}
1498EXPORT_SYMBOL_GPL(zs_free);
1499
1500static void zs_object_copy(unsigned long src, unsigned long dst,
1501 struct size_class *class)
1502{
1503 struct page *s_page, *d_page;
1504 unsigned long s_objidx, d_objidx;
1505 unsigned long s_off, d_off;
1506 void *s_addr, *d_addr;
1507 int s_size, d_size, size;
1508 int written = 0;
1509
1510 s_size = d_size = class->size;
1511
1512 obj_to_location(src, &s_page, &s_objidx);
1513 obj_to_location(dst, &d_page, &d_objidx);
1514
1515 s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
1516 d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
1517
1518 if (s_off + class->size > PAGE_SIZE)
1519 s_size = PAGE_SIZE - s_off;
1520
1521 if (d_off + class->size > PAGE_SIZE)
1522 d_size = PAGE_SIZE - d_off;
1523
1524 s_addr = kmap_atomic(s_page);
1525 d_addr = kmap_atomic(d_page);
1526
1527 while (1) {
1528 size = min(s_size, d_size);
1529 memcpy(d_addr + d_off, s_addr + s_off, size);
1530 written += size;
1531
1532 if (written == class->size)
1533 break;
1534
1535 s_off += size;
1536 s_size -= size;
1537 d_off += size;
1538 d_size -= size;
1539
1540 if (s_off >= PAGE_SIZE) {
1541 kunmap_atomic(d_addr);
1542 kunmap_atomic(s_addr);
1543 s_page = get_next_page(s_page);
1544 BUG_ON(!s_page);
1545 s_addr = kmap_atomic(s_page);
1546 d_addr = kmap_atomic(d_page);
1547 s_size = class->size - written;
1548 s_off = 0;
1549 }
1550
1551 if (d_off >= PAGE_SIZE) {
1552 kunmap_atomic(d_addr);
1553 d_page = get_next_page(d_page);
1554 BUG_ON(!d_page);
1555 d_addr = kmap_atomic(d_page);
1556 d_size = class->size - written;
1557 d_off = 0;
1558 }
1559 }
1560
1561 kunmap_atomic(d_addr);
1562 kunmap_atomic(s_addr);
1563}
1564
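Because neither the source nor the destination object is guaranteed to sit inside a single page, zs_object_copy() walks both sides in page-bounded chunks. A userspace model of the same loop, with two fixed 4096-byte "pages" per side standing in for the kmap'd zspage pages (the sizes and offsets are arbitrary test values):

#include <assert.h>
#include <string.h>

#define PAGE_SIZE 4096

static void copy_obj(char src[2][PAGE_SIZE], unsigned long s_off,
		     char dst[2][PAGE_SIZE], unsigned long d_off,
		     int class_size)
{
	int s_page = 0, d_page = 0, written = 0;
	int s_size = class_size, d_size = class_size;

	if (s_off + class_size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;
	if (d_off + class_size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	while (1) {
		int size = s_size < d_size ? s_size : d_size;

		memcpy(&dst[d_page][d_off], &src[s_page][s_off], size);
		written += size;
		if (written == class_size)
			break;

		s_off += size; s_size -= size;
		d_off += size; d_size -= size;

		if (s_off >= PAGE_SIZE) {	/* cross to next source page */
			s_page++;
			s_size = class_size - written;
			s_off = 0;
		}
		if (d_off >= PAGE_SIZE) {	/* cross to next destination page */
			d_page++;
			d_size = class_size - written;
			d_off = 0;
		}
	}
}

int main(void)
{
	static char src[2][PAGE_SIZE], dst[2][PAGE_SIZE];
	int class_size = 1536;
	unsigned long s_off = PAGE_SIZE - 100, d_off = PAGE_SIZE - 700;

	memset(&src[0][s_off], 0x5a, 100);
	memset(&src[1][0], 0x5a, class_size - 100);
	copy_obj(src, s_off, dst, d_off, class_size);
	assert(dst[0][d_off] == 0x5a && dst[1][class_size - 700 - 1] == 0x5a);
	return 0;
}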
1565/*
1566 * Find an allocated object in the zspage, scanning from the given index,
1567 * and return its handle.
1568 */
1569static unsigned long find_alloced_obj(struct page *page, int index,
1570 struct size_class *class)
1571{
1572 unsigned long head;
1573 int offset = 0;
1574 unsigned long handle = 0;
1575 void *addr = kmap_atomic(page);
1576
1577 if (!is_first_page(page))
1578 offset = page->index;
1579 offset += class->size * index;
1580
1581 while (offset < PAGE_SIZE) {
1582 head = obj_to_head(class, page, addr + offset);
1583 if (head & OBJ_ALLOCATED_TAG) {
1584 handle = head & ~OBJ_ALLOCATED_TAG;
1585 if (trypin_tag(handle))
1586 break;
1587 handle = 0;
1588 }
1589
1590 offset += class->size;
1591 index++;
1592 }
1593
1594 kunmap_atomic(addr);
1595 return handle;
1596}
1597
1598struct zs_compact_control {
1599 /* Source page for migration which could be a subpage of zspage. */
1600 struct page *s_page;
1601 /* Destination page for migration, which should be the first page
1602 * of a zspage. */
1603 struct page *d_page;
1604 /* Starting object index within @s_page from which to scan for live
1605 * objects in the subpage. */
1606 int index;
1607 /* how many objects have been migrated */
1608 int nr_migrated;
1609};
1610
1611static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1612 struct zs_compact_control *cc)
1613{
1614 unsigned long used_obj, free_obj;
1615 unsigned long handle;
1616 struct page *s_page = cc->s_page;
1617 struct page *d_page = cc->d_page;
1618 unsigned long index = cc->index;
1619 int nr_migrated = 0;
1620 int ret = 0;
1621
1622 while (1) {
1623 handle = find_alloced_obj(s_page, index, class);
1624 if (!handle) {
1625 s_page = get_next_page(s_page);
1626 if (!s_page)
1627 break;
1628 index = 0;
1629 continue;
1630 }
1631
1632 /* Stop if there is no more space */
1633 if (zspage_full(d_page)) {
1634 unpin_tag(handle);
1635 ret = -ENOMEM;
1636 break;
1637 }
1638
1639 used_obj = handle_to_obj(handle);
1640 free_obj = obj_malloc(d_page, class, handle);
1641 zs_object_copy(used_obj, free_obj, class);
1642 index++;
1643 record_obj(handle, free_obj);
1644 unpin_tag(handle);
1645 obj_free(pool, class, used_obj);
1646 nr_migrated++;
1647 }
1648
1649 /* Remember last position in this iteration */
1650 cc->s_page = s_page;
1651 cc->index = index;
1652 cc->nr_migrated = nr_migrated;
1653
1654 return ret;
1655}
1656
1657static struct page *alloc_target_page(struct size_class *class)
1658{
1659 int i;
1660 struct page *page;
1661
1662 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
1663 page = class->fullness_list[i];
1664 if (page) {
1665 remove_zspage(page, class, i);
1666 break;
1667 }
1668 }
1669
1670 return page;
1671}
1672
1673static void putback_zspage(struct zs_pool *pool, struct size_class *class,
1674 struct page *first_page)
1675{
1676 enum fullness_group fullness;
1677
1678 BUG_ON(!is_first_page(first_page));
1679
1680 fullness = get_fullness_group(first_page);
1681 insert_zspage(first_page, class, fullness);
1682 set_zspage_mapping(first_page, class->index, fullness);
1329 1683
1330 if (fullness == ZS_EMPTY) { 1684 if (fullness == ZS_EMPTY) {
1685 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1686 class->size, class->pages_per_zspage));
1331 atomic_long_sub(class->pages_per_zspage, 1687 atomic_long_sub(class->pages_per_zspage,
1332 &pool->pages_allocated); 1688 &pool->pages_allocated);
1689
1333 free_zspage(first_page); 1690 free_zspage(first_page);
1334 } 1691 }
1335} 1692}
1336EXPORT_SYMBOL_GPL(zs_free); 1693
1694static struct page *isolate_source_page(struct size_class *class)
1695{
1696 struct page *page;
1697
1698 page = class->fullness_list[ZS_ALMOST_EMPTY];
1699 if (page)
1700 remove_zspage(page, class, ZS_ALMOST_EMPTY);
1701
1702 return page;
1703}
1704
1705static unsigned long __zs_compact(struct zs_pool *pool,
1706 struct size_class *class)
1707{
1708 int nr_to_migrate;
1709 struct zs_compact_control cc;
1710 struct page *src_page;
1711 struct page *dst_page = NULL;
1712 unsigned long nr_total_migrated = 0;
1713
1714 spin_lock(&class->lock);
1715 while ((src_page = isolate_source_page(class))) {
1716
1717 BUG_ON(!is_first_page(src_page));
1718
1719 /* The goal is to migrate all live objects in source page */
1720 nr_to_migrate = src_page->inuse;
1721 cc.index = 0;
1722 cc.s_page = src_page;
1723
1724 while ((dst_page = alloc_target_page(class))) {
1725 cc.d_page = dst_page;
1726 /*
1727 * If there is no more space in dst_page, try to
1728 * allocate another zspage.
1729 */
1730 if (!migrate_zspage(pool, class, &cc))
1731 break;
1732
1733 putback_zspage(pool, class, dst_page);
1734 nr_total_migrated += cc.nr_migrated;
1735 nr_to_migrate -= cc.nr_migrated;
1736 }
1737
1738 /* Stop if we couldn't find slot */
1739 if (dst_page == NULL)
1740 break;
1741
1742 putback_zspage(pool, class, dst_page);
1743 putback_zspage(pool, class, src_page);
1744 spin_unlock(&class->lock);
1745 nr_total_migrated += cc.nr_migrated;
1746 cond_resched();
1747 spin_lock(&class->lock);
1748 }
1749
1750 if (src_page)
1751 putback_zspage(pool, class, src_page);
1752
1753 spin_unlock(&class->lock);
1754
1755 return nr_total_migrated;
1756}
1757
1758unsigned long zs_compact(struct zs_pool *pool)
1759{
1760 int i;
1761 unsigned long nr_migrated = 0;
1762 struct size_class *class;
1763
1764 for (i = zs_size_classes - 1; i >= 0; i--) {
1765 class = pool->size_class[i];
1766 if (!class)
1767 continue;
1768 if (class->index != i)
1769 continue;
1770 nr_migrated += __zs_compact(pool, class);
1771 }
1772
1773 return nr_migrated;
1774}
1775EXPORT_SYMBOL_GPL(zs_compact);
1337 1776
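Tying the pieces together, here is a hedged kernel-side usage sketch of the API as it stands after this patch, modeled loosely on how a zram-like caller might drive it (the function body and GFP flags are illustrative; ZS_MM_WO is the write-only mapping mode from include/linux/zsmalloc.h):

#include <linux/zsmalloc.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/errno.h>

static int zsmalloc_demo(void)
{
	struct zs_pool *pool;
	unsigned long handle;
	char *dst;

	pool = zs_create_pool("demo", GFP_NOIO | __GFP_HIGHMEM);
	if (!pool)
		return -ENOMEM;

	handle = zs_malloc(pool, 128);		/* opaque handle, not a pointer */
	if (handle) {
		dst = zs_map_object(pool, handle, ZS_MM_WO);
		memset(dst, 0xaa, 128);		/* object may span two pages */
		zs_unmap_object(pool, handle);
		zs_free(pool, handle);
	}

	zs_compact(pool);	/* returns the number of migrated objects */
	zs_destroy_pool(pool);
	return 0;
}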
1338/** 1777/**
1339 * zs_create_pool - Creates an allocation pool to work from. 1778 * zs_create_pool - Creates an allocation pool to work from.
@@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1355 if (!pool) 1794 if (!pool)
1356 return NULL; 1795 return NULL;
1357 1796
1358 pool->name = kstrdup(name, GFP_KERNEL);
1359 if (!pool->name) {
1360 kfree(pool);
1361 return NULL;
1362 }
1363
1364 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1797 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1365 GFP_KERNEL); 1798 GFP_KERNEL);
1366 if (!pool->size_class) { 1799 if (!pool->size_class) {
1367 kfree(pool->name);
1368 kfree(pool); 1800 kfree(pool);
1369 return NULL; 1801 return NULL;
1370 } 1802 }
1371 1803
1804 pool->name = kstrdup(name, GFP_KERNEL);
1805 if (!pool->name)
1806 goto err;
1807
1808 if (create_handle_cache(pool))
1809 goto err;
1810
1372 /* 1811 /*
1373 * Iterate reversly, because, size of size_class that we want to use 1812 * Iterate reversly, because, size of size_class that we want to use
1374 * for merging should be larger or equal to current size. 1813 * for merging should be larger or equal to current size.
@@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1406 class->size = size; 1845 class->size = size;
1407 class->index = i; 1846 class->index = i;
1408 class->pages_per_zspage = pages_per_zspage; 1847 class->pages_per_zspage = pages_per_zspage;
1848 if (pages_per_zspage == 1 &&
1849 get_maxobj_per_zspage(size, pages_per_zspage) == 1)
1850 class->huge = true;
1409 spin_lock_init(&class->lock); 1851 spin_lock_init(&class->lock);
1410 pool->size_class[i] = class; 1852 pool->size_class[i] = class;
1411 1853
@@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1450 kfree(class); 1892 kfree(class);
1451 } 1893 }
1452 1894
1895 destroy_handle_cache(pool);
1453 kfree(pool->size_class); 1896 kfree(pool->size_class);
1454 kfree(pool->name); 1897 kfree(pool->name);
1455 kfree(pool); 1898 kfree(pool);