Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig            5
-rw-r--r--   mm/cma.c             15
-rw-r--r--   mm/gup.c              2
-rw-r--r--   mm/memcontrol.c      22
-rw-r--r--   mm/memory_hotplug.c  64
-rw-r--r--   mm/nommu.c            4
-rw-r--r--   mm/page_alloc.c       4
-rw-r--r--   mm/shmem.c            4
-rw-r--r--   mm/slab_common.c      3
-rw-r--r--   mm/util.c            75
-rw-r--r--   mm/vmscan.c          44
-rw-r--r--   mm/z3fold.c          29
12 files changed, 206 insertions, 65 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 495d7368ced8..56cec636a1fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING
 	  See Documentation/admin-guide/mm/idle_page_tracking.rst for
 	  more details.

-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
 	bool

 config ZONE_DEVICE
@@ -658,7 +657,7 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_ZONE_DEVICE
+	depends on ARCH_HAS_PTE_DEVMAP
 	select XARRAY_MULTI

 	help
diff --git a/mm/cma.c b/mm/cma.c
index 3340ef34c154..7fe0b8356775 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -278,6 +278,12 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	 */
 	alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
 			  max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+	if (fixed && base & (alignment - 1)) {
+		ret = -EINVAL;
+		pr_err("Region at %pa must be aligned to %pa bytes\n",
+			&base, &alignment);
+		goto err;
+	}
 	base = ALIGN(base, alignment);
 	size = ALIGN(size, alignment);
 	limit &= ~(alignment - 1);
@@ -308,6 +314,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	if (limit == 0 || limit > memblock_end)
 		limit = memblock_end;

+	if (base + size > limit) {
+		ret = -EINVAL;
+		pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+			&size, &base, &limit);
+		goto err;
+	}
+
 	/* Reserve memory */
 	if (fixed) {
 		if (memblock_is_region_reserved(base, size) ||
@@ -494,7 +507,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
  * @pages: Allocated pages.
  * @count: Number of allocated pages.
  *
- * This function releases memory allocated by alloc_cma().
+ * This function releases memory allocated by cma_alloc().
  * It returns false when provided pages do not belong to contiguous area and
  * true otherwise.
  */
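
The two new checks above reject a misaligned fixed base address and a region that cannot fit below the address limit before any memory is reserved. A standalone sketch of the same validation logic (plain userspace C for illustration only; check_region() and its return values are hypothetical, not part of this patch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t phys_addr_t;   /* stand-in for the kernel type */

    /* Mirror the two sanity checks added to cma_declare_contiguous(). */
    static int check_region(phys_addr_t base, phys_addr_t size,
                            phys_addr_t limit, phys_addr_t alignment, bool fixed)
    {
        if (fixed && (base & (alignment - 1)))
            return -1;              /* caller demanded this exact, misaligned base */
        if (base + size > limit)
            return -1;              /* region would extend past the allowed limit */
        return 0;
    }

    int main(void)
    {
        /* A 4 MiB region at an unaligned fixed base fails the first check. */
        printf("%d\n", check_region(0x100001, 4 << 20, 1ULL << 32,
                                    1 << 22, true));
        return 0;
    }
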
diff --git a/mm/gup.c b/mm/gup.c
index 8bbaa5523116..98f13ab37bac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1895,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
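
Generic code can now key device-memory PTE support off the Kconfig symbol rather than an architecture-defined macro. A minimal, hypothetical illustration of that guard (pte_is_devmap() is invented for this sketch; pte_devmap() is the existing arch helper):

    #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
    static inline bool pte_is_devmap(pte_t pte)
    {
        return pte_devmap(pte);     /* arch provides pte_devmap() when it selects the option */
    }
    #else
    static inline bool pte_is_devmap(pte_t pte)
    {
        return false;               /* no device-memory PTEs in this configuration */
    }
    #endif
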
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 249671873aa9..cdbb7a84cb6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,12 +695,15 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;

-	__this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmstats[idx]);
 		x = 0;
@@ -749,13 +752,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	/* Update memcg */
 	__mod_memcg_state(memcg, idx, val);

-	/* Update lruvec */
-	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup_per_node *pi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(pn->lruvec_stat_local->count[idx], x);
 		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
@@ -777,12 +782,15 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;

-	__this_cpu_add(memcg->vmstats_local->events[idx], count);
-
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->events[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmevents[idx]);
 		x = 0;
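
The pattern above defers the local counter update until the per-CPU delta crosses the batch threshold, so the local and hierarchical counters are flushed together and stay in sync. A self-contained userspace sketch of that idea (struct group, BATCH, and mod_state() are illustrative stand-ins, not kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    #define BATCH 32                    /* stand-in for MEMCG_CHARGE_BATCH */

    struct group {
        struct group *parent;
        long local;                     /* analogue of vmstats_local */
        long hierarchical;              /* analogue of vmstats */
        long pending;                   /* per-CPU delta not yet flushed */
    };

    static void mod_state(struct group *g, long val)
    {
        long x = g->pending + val;

        if (labs(x) > BATCH) {
            /* Flush local and hierarchical counters in the same place. */
            g->local += x;
            for (struct group *p = g; p; p = p->parent)
                p->hierarchical += x;
            x = 0;
        }
        g->pending = x;
    }

    int main(void)
    {
        struct group root = { 0 };
        struct group child = { .parent = &root };

        for (int i = 0; i < 100; i++)
            mod_state(&child, 1);
        printf("local=%ld hier=%ld root=%ld pending=%ld\n",
               child.local, child.hierarchical, root.hierarchical, child.pending);
        return 0;
    }
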
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6166ba5a15f3..4ebe696138e8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1734,9 +1734,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
 			&beginpa, &endpa);
-	}

-	return ret;
+		return -EBUSY;
+	}
+	return 0;
 }

 static int check_cpu_on_node(pg_data_t *pgdat)
@@ -1819,19 +1820,9 @@ static void __release_memory_resource(resource_size_t start,
 	}
 }

-/**
- * remove_memory
- * @nid: the node ID
- * @start: physical address of the region to remove
- * @size: size of the region to remove
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call, as required by
- * try_offline_node().
- */
-void __ref __remove_memory(int nid, u64 start, u64 size)
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
-	int ret;
+	int rc = 0;

 	BUG_ON(check_hotplug_memory_range(start, size));

@@ -1839,13 +1830,13 @@ void __ref __remove_memory(int nid, u64 start, u64 size)

 	/*
 	 * All memory blocks must be offlined before removing memory. Check
-	 * whether all memory blocks in question are offline and trigger a BUG()
+	 * whether all memory blocks in question are offline and return error
 	 * if this is not the case.
 	 */
-	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
+	rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
 		check_memblock_offlined_cb);
-	if (ret)
-		BUG();
+	if (rc)
+		goto done;

 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
@@ -1857,14 +1848,45 @@ void __ref __remove_memory(int nid, u64 start, u64 size)

 	try_offline_node(nid);

+done:
 	mem_hotplug_done();
+	return rc;
 }

-void remove_memory(int nid, u64 start, u64 size)
+/**
+ * remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __remove_memory(int nid, u64 start, u64 size)
+{
+
+	/*
+	 * trigger BUG() if some memory is not offlined prior to calling this
+	 * function
+	 */
+	if (try_remove_memory(nid, start, size))
+		BUG();
+}
+
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY if
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
 {
+	int rc;
+
 	lock_device_hotplug();
-	__remove_memory(nid, start, size);
+	rc = try_remove_memory(nid, start, size);
 	unlock_device_hotplug();
+
+	return rc;
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
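
Because remove_memory() now reports failure instead of triggering BUG(), callers can retry or propagate the error when a block is still online. A hedged sketch of such a caller (example_unplug() and its logging are hypothetical; remove_memory() and its -EBUSY return come from the change above):

    /* Hypothetical caller: attempt to unplug a region and pass -EBUSY upward. */
    static int example_unplug(int nid, u64 start, u64 size)
    {
        int rc;

        rc = remove_memory(nid, start, size);   /* takes the device hotplug lock itself */
        if (rc == -EBUSY)
            pr_info("memory [%#llx-%#llx] still online, deferring unplug\n",
                    (unsigned long long)start,
                    (unsigned long long)(start + size - 1));
        return rc;
    }
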
diff --git a/mm/nommu.c b/mm/nommu.c
index eb3e2e558da1..fed1b6e9c89b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1261,7 +1261,9 @@ unsigned long do_mmap(struct file *file,
 	add_nommu_region(region);

 	/* clear anonymous mappings that don't ask for uninitialized data */
-	if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+	if (!vma->vm_file &&
+	    (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
+	     !(flags & MAP_UNINITIALIZED)))
 		memset((void *)region->vm_start, 0,
 		       region->vm_end - region->vm_start);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fd7f45a04eb..e515bfcf7f28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4102,7 +4102,6 @@ static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 					const struct alloc_context *ac)
 {
-	struct reclaim_state reclaim_state;
 	int progress;
 	unsigned int noreclaim_flag;
 	unsigned long pflags;
@@ -4114,13 +4113,10 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	psi_memstall_enter(&pflags);
 	fs_reclaim_acquire(gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
-	reclaim_state.reclaimed_slab = 0;
-	current->reclaim_state = &reclaim_state;

 	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
 								ac->nodemask);

-	current->reclaim_state = NULL;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(gfp_mask);
 	psi_memstall_leave(&pflags);
diff --git a/mm/shmem.c b/mm/shmem.c
index f4dce9c8670d..99497cb32e71 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -400,7 +400,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,

 static int shmem_huge __read_mostly;

-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
 static int shmem_parse_huge(const char *str)
 {
 	if (!strcmp(str, "never"))
@@ -417,7 +417,9 @@ static int shmem_parse_huge(const char *str)
 		return SHMEM_HUGE_FORCE;
 	return -EINVAL;
 }
+#endif

+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 static const char *shmem_format_huge(int huge)
 {
 	switch (huge) {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6c49dbb3769e..807490fe217a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1028,7 +1028,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
 }

 struct kmem_cache *
-kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
+{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
 EXPORT_SYMBOL(kmalloc_caches);

 /*
diff --git a/mm/util.c b/mm/util.c
index 68575a315dc5..e6351a80f248 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
@@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 }
 #endif

+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim)
+{
+	unsigned long locked_vm, limit;
+	int ret = 0;
+
+	lockdep_assert_held_write(&mm->mmap_sem);
+
+	locked_vm = mm->locked_vm;
+	if (inc) {
+		if (!bypass_rlim) {
+			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+			if (locked_vm + pages > limit)
+				ret = -ENOMEM;
+		}
+		if (!ret)
+			mm->locked_vm = locked_vm + pages;
+	} else {
+		WARN_ON_ONCE(pages > locked_vm);
+		mm->locked_vm = locked_vm - pages;
+	}
+
+	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+		 ret ? " - exceeded" : "");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0       on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+	int ret;
+
+	if (pages == 0 || !mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	ret = __account_locked_vm(mm, pages, inc, current,
+				  capable(CAP_IPC_LOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long pgoff)
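
Callers that pin user memory can now do the RLIMIT_MEMLOCK bookkeeping through one helper instead of open-coding it. A hedged sketch of a driver-side user (example_pin() and do_the_actual_pinning() are hypothetical placeholders; account_locked_vm() is the helper added above):

    /* Hypothetical driver path that pins npages of user memory. */
    static int example_pin(struct mm_struct *mm, unsigned long npages)
    {
        int ret;

        ret = account_locked_vm(mm, npages, true);  /* charge against RLIMIT_MEMLOCK */
        if (ret)
            return ret;                             /* -ENOMEM: limit would be exceeded */

        ret = do_the_actual_pinning();              /* placeholder for the real work */
        if (ret)
            account_locked_vm(mm, npages, false);   /* undo the accounting on failure */
        return ret;
    }
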
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8e3dcd527b8..44df66a98f2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,9 @@ struct scan_control {
 		unsigned int file_taken;
 		unsigned int taken;
 	} nr;
+
+	/* for recording the reclaimed slab by now */
+	struct reclaim_state reclaim_state;
 };

 #ifdef ARCH_HAS_PREFETCH
@@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 }
 #endif /* CONFIG_MEMCG_KMEM */

+static void set_task_reclaim_state(struct task_struct *task,
+				   struct reclaim_state *rs)
+{
+	/* Check for an overwrite */
+	WARN_ON_ONCE(rs && task->reclaim_state);
+
+	/* Check for the nulling of an already-nulled member */
+	WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+	task->reclaim_state = rs;
+}
+
 #ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
@@ -3191,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
 		return 1;

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);

 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+	set_task_reclaim_state(current, NULL);

 	return nr_reclaimed;
 }
@@ -3218,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	};
 	unsigned long lru_pages;

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

@@ -3235,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,

 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

+	set_task_reclaim_state(current, NULL);
 	*nr_scanned = sc.nr_scanned;
+
 	return sc.nr_reclaimed;
 }

@@ -3262,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_shrinkslab = 1,
 	};

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
 	 * take care of from where we get pages. So the node where we start the
@@ -3282,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 	psi_memstall_leave(&pflags);

 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+	set_task_reclaim_state(current, NULL);

 	return nr_reclaimed;
 }
@@ -3483,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		.may_unmap = 1,
 	};

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	psi_memstall_enter(&pflags);
 	__fs_reclaim_acquire();

@@ -3664,6 +3687,8 @@ out:
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
+	set_task_reclaim_state(current, NULL);
+
 	/*
 	 * Return the order kswapd stopped reclaiming at as
 	 * prepare_kswapd_sleep() takes it into account. If another caller
@@ -3787,15 +3812,10 @@ static int kswapd(void *p)
 	unsigned int classzone_idx = MAX_NR_ZONES - 1;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
-
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
-	};
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
-	current->reclaim_state = &reclaim_state;

 	/*
 	 * Tell the memory management that we're a "memory allocator",
@@ -3857,7 +3877,6 @@ kswapd_try_sleep:
 	}

 	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
-	current->reclaim_state = NULL;

 	return 0;
 }
@@ -3922,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
  */
 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 {
-	struct reclaim_state reclaim_state;
 	struct scan_control sc = {
 		.nr_to_reclaim = nr_to_reclaim,
 		.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3934,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.hibernation_mode = 1,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
-	struct task_struct *p = current;
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;

 	fs_reclaim_acquire(sc.gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);

 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

-	p->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);

@@ -4110,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	/* Minimum pages needed in order to stay on node */
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
-	struct reclaim_state reclaim_state;
 	unsigned int noreclaim_flag;
 	struct scan_control sc = {
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4135,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	p->flags |= PF_SWAPWRITE;
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
+	set_task_reclaim_state(p, &sc.reclaim_state);

 	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
 		/*
@@ -4148,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}

-	p->reclaim_state = NULL;
+	set_task_reclaim_state(p, NULL);
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
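
With reclaim_state embedded in scan_control, every reclaim entry point follows the same discipline: install the pointer on the task before reclaim, clear it afterwards, and let set_task_reclaim_state() warn on mismatched pairs. A minimal sketch of that pattern (example_reclaim() is hypothetical; the two helper calls mirror the ones added above):

    /* Hypothetical reclaim entry point showing the set/clear pairing. */
    static unsigned long example_reclaim(struct zonelist *zonelist,
                                         struct scan_control *sc)
    {
        unsigned long nr_reclaimed;

        set_task_reclaim_state(current, &sc->reclaim_state);  /* WARNs if already set */
        nr_reclaimed = do_try_to_free_pages(zonelist, sc);    /* reclaim runs with sc->reclaim_state active */
        set_task_reclaim_state(current, NULL);                /* WARNs on a double clear */

        return nr_reclaimed;
    }
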
diff --git a/mm/z3fold.c b/mm/z3fold.c
index dfcd69d08c1e..6c72b18d8b9c 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -101,6 +101,7 @@ struct z3fold_buddy_slots {
  * @refcount:	reference count for the z3fold page
  * @work:	work_struct for page layout optimization
  * @slots:	pointer to the structure holding buddy slots
+ * @pool:	pointer to the containing pool
  * @cpu:	CPU which this page "belongs" to
  * @first_chunks:	the size of the first buddy in chunks, 0 if free
  * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
@@ -114,6 +115,7 @@ struct z3fold_header {
 	struct kref refcount;
 	struct work_struct work;
 	struct z3fold_buddy_slots *slots;
+	struct z3fold_pool *pool;
 	short cpu;
 	unsigned short first_chunks;
 	unsigned short middle_chunks;
@@ -193,8 +195,10 @@ static void compact_page_work(struct work_struct *w);
 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
 							gfp_t gfp)
 {
-	struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
-			gfp);
+	struct z3fold_buddy_slots *slots;
+
+	slots = kmem_cache_alloc(pool->c_handle,
+			(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));

 	if (slots) {
 		memset(slots->slot, 0, sizeof(slots->slot));
@@ -320,6 +324,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
 	zhdr->start_middle = 0;
 	zhdr->cpu = -1;
 	zhdr->slots = slots;
+	zhdr->pool = pool;
 	INIT_LIST_HEAD(&zhdr->buddy);
 	INIT_WORK(&zhdr->work, compact_page_work);
 	return zhdr;
@@ -426,7 +431,7 @@ static enum buddy handle_to_buddy(unsigned long handle)

 static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
 {
-	return slots_to_pool(zhdr->slots);
+	return zhdr->pool;
 }

 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
@@ -850,7 +855,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
 	enum buddy bud;
 	bool can_sleep = gfpflags_allow_blocking(gfp);

-	if (!size || (gfp & __GFP_HIGHMEM))
+	if (!size)
 		return -EINVAL;

 	if (size > PAGE_SIZE)
@@ -1345,24 +1350,29 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
 	zhdr = page_address(page);
 	pool = zhdr_to_pool(zhdr);

-	if (!trylock_page(page))
-		return -EAGAIN;
-
 	if (!z3fold_page_trylock(zhdr)) {
-		unlock_page(page);
 		return -EAGAIN;
 	}
 	if (zhdr->mapped_count != 0) {
 		z3fold_page_unlock(zhdr);
-		unlock_page(page);
 		return -EBUSY;
 	}
+	if (work_pending(&zhdr->work)) {
+		z3fold_page_unlock(zhdr);
+		return -EAGAIN;
+	}
 	new_zhdr = page_address(newpage);
 	memcpy(new_zhdr, zhdr, PAGE_SIZE);
 	newpage->private = page->private;
 	page->private = 0;
 	z3fold_page_unlock(zhdr);
 	spin_lock_init(&new_zhdr->page_lock);
+	INIT_WORK(&new_zhdr->work, compact_page_work);
+	/*
+	 * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
+	 * so we only have to reinitialize it.
+	 */
+	INIT_LIST_HEAD(&new_zhdr->buddy);
 	new_mapping = page_mapping(page);
 	__ClearPageMovable(page);
 	ClearPagePrivate(page);
@@ -1386,7 +1396,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
 	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);

 	page_mapcount_reset(page);
-	unlock_page(page);
 	put_page(page);
 	return 0;
 }