Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig            5
-rw-r--r--   mm/cma.c             15
-rw-r--r--   mm/gup.c              2
-rw-r--r--   mm/memcontrol.c      22
-rw-r--r--   mm/memory_hotplug.c  64
-rw-r--r--   mm/nommu.c            4
-rw-r--r--   mm/page_alloc.c       4
-rw-r--r--   mm/shmem.c            4
-rw-r--r--   mm/slab_common.c      3
-rw-r--r--   mm/util.c            75
-rw-r--r--   mm/vmscan.c          44
-rw-r--r--   mm/z3fold.c          29
12 files changed, 206 insertions, 65 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 495d7368ced8..56cec636a1fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING
 	  See Documentation/admin-guide/mm/idle_page_tracking.rst for
 	  more details.

-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
 	bool

 config ZONE_DEVICE
@@ -658,7 +657,7 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_ZONE_DEVICE
+	depends on ARCH_HAS_PTE_DEVMAP
 	select XARRAY_MULTI

 	help
diff --git a/mm/cma.c b/mm/cma.c
index 3340ef34c154..7fe0b8356775 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -278,6 +278,12 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	 */
 	alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
 			  max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+	if (fixed && base & (alignment - 1)) {
+		ret = -EINVAL;
+		pr_err("Region at %pa must be aligned to %pa bytes\n",
+			&base, &alignment);
+		goto err;
+	}
 	base = ALIGN(base, alignment);
 	size = ALIGN(size, alignment);
 	limit &= ~(alignment - 1);
@@ -308,6 +314,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	if (limit == 0 || limit > memblock_end)
 		limit = memblock_end;

+	if (base + size > limit) {
+		ret = -EINVAL;
+		pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+			&size, &base, &limit);
+		goto err;
+	}
+
 	/* Reserve memory */
 	if (fixed) {
 		if (memblock_is_region_reserved(base, size) ||
@@ -494,7 +507,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
  * @pages: Allocated pages.
  * @count: Number of allocated pages.
  *
- * This function releases memory allocated by alloc_cma().
+ * This function releases memory allocated by cma_alloc().
  * It returns false when provided pages do not belong to contiguous area and
  * true otherwise.
  */
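
The two new checks above reject a misaligned fixed base address and a region that cannot fit below the address limit before any memory is reserved. A standalone sketch of the same validation logic (plain userspace C for illustration only; check_region() and its return values are hypothetical, not part of this patch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t phys_addr_t;   /* stand-in for the kernel type */

    /* Mirror the two sanity checks added to cma_declare_contiguous(). */
    static int check_region(phys_addr_t base, phys_addr_t size,
                            phys_addr_t limit, phys_addr_t alignment, bool fixed)
    {
        if (fixed && (base & (alignment - 1)))
            return -1;              /* caller demanded this exact, misaligned base */
        if (base + size > limit)
            return -1;              /* region would extend past the allowed limit */
        return 0;
    }

    int main(void)
    {
        /* A 4 MiB region at an unaligned fixed base fails the first check. */
        printf("%d\n", check_region(0x100001, 4 << 20, 1ULL << 32,
                                    1 << 22, true));
        return 0;
    }
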
diff --git a/mm/gup.c b/mm/gup.c
index 8bbaa5523116..98f13ab37bac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1895,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
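
Generic code can now key device-memory PTE support off the Kconfig symbol rather than an architecture-defined macro. A minimal, hypothetical illustration of that guard (pte_is_devmap() is invented for this sketch; pte_devmap() is the existing arch helper):

    #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
    static inline bool pte_is_devmap(pte_t pte)
    {
        return pte_devmap(pte);     /* arch provides pte_devmap() when it selects the option */
    }
    #else
    static inline bool pte_is_devmap(pte_t pte)
    {
        return false;               /* no device-memory PTEs in this configuration */
    }
    #endif
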
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 249671873aa9..cdbb7a84cb6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,12 +695,15 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;

-	__this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmstats[idx]);
 		x = 0;
@@ -749,13 +752,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	/* Update memcg */
 	__mod_memcg_state(memcg, idx, val);

-	/* Update lruvec */
-	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup_per_node *pi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(pn->lruvec_stat_local->count[idx], x);
 		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
@@ -777,12 +782,15 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;

-	__this_cpu_add(memcg->vmstats_local->events[idx], count);
-
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->events[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmevents[idx]);
 		x = 0;
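
The pattern above defers the local counter update until the per-CPU delta crosses the batch threshold, so the local and hierarchical counters are flushed together and stay in sync. A self-contained userspace sketch of that idea (struct group, BATCH, and mod_state() are illustrative stand-ins, not kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    #define BATCH 32                    /* stand-in for MEMCG_CHARGE_BATCH */

    struct group {
        struct group *parent;
        long local;                     /* analogue of vmstats_local */
        long hierarchical;              /* analogue of vmstats */
        long pending;                   /* per-CPU delta not yet flushed */
    };

    static void mod_state(struct group *g, long val)
    {
        long x = g->pending + val;

        if (labs(x) > BATCH) {
            /* Flush local and hierarchical counters in the same place. */
            g->local += x;
            for (struct group *p = g; p; p = p->parent)
                p->hierarchical += x;
            x = 0;
        }
        g->pending = x;
    }

    int main(void)
    {
        struct group root = { 0 };
        struct group child = { .parent = &root };

        for (int i = 0; i < 100; i++)
            mod_state(&child, 1);
        printf("local=%ld hier=%ld root=%ld pending=%ld\n",
               child.local, child.hierarchical, root.hierarchical, child.pending);
        return 0;
    }
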
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6166ba5a15f3..4ebe696138e8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1734,9 +1734,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
 			&beginpa, &endpa);
-	}

-	return ret;
+		return -EBUSY;
+	}
+	return 0;
 }

 static int check_cpu_on_node(pg_data_t *pgdat)
@@ -1819,19 +1820,9 @@ static void __release_memory_resource(resource_size_t start,
 	}
 }

-/**
- * remove_memory
- * @nid: the node ID
- * @start: physical address of the region to remove
- * @size: size of the region to remove
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call, as required by
- * try_offline_node().
- */
-void __ref __remove_memory(int nid, u64 start, u64 size)
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
-	int ret;
+	int rc = 0;

 	BUG_ON(check_hotplug_memory_range(start, size));

@@ -1839,13 +1830,13 @@ void __ref __remove_memory(int nid, u64 start, u64 size)

 	/*
 	 * All memory blocks must be offlined before removing memory. Check
-	 * whether all memory blocks in question are offline and trigger a BUG()
+	 * whether all memory blocks in question are offline and return error
 	 * if this is not the case.
 	 */
-	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
+	rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
 		check_memblock_offlined_cb);
-	if (ret)
-		BUG();
+	if (rc)
+		goto done;

 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
@@ -1857,14 +1848,45 @@ void __ref __remove_memory(int nid, u64 start, u64 size)

 	try_offline_node(nid);

+done:
 	mem_hotplug_done();
+	return rc;
 }

-void remove_memory(int nid, u64 start, u64 size)
+/**
+ * remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __remove_memory(int nid, u64 start, u64 size)
+{
+
+	/*
+	 * trigger BUG() if some memory is not offlined prior to calling this
+	 * function
+	 */
+	if (try_remove_memory(nid, start, size))
+		BUG();
+}
+
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY if
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
 {
+	int rc;
+
 	lock_device_hotplug();
-	__remove_memory(nid, start, size);
+	rc = try_remove_memory(nid, start, size);
 	unlock_device_hotplug();
+
+	return rc;
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
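
Because remove_memory() now reports failure instead of triggering BUG(), callers can retry or propagate the error when a block is still online. A hedged sketch of such a caller (example_unplug() and its logging are hypothetical; remove_memory() and its -EBUSY return come from the change above):

    /* Hypothetical caller: attempt to unplug a region and pass -EBUSY upward. */
    static int example_unplug(int nid, u64 start, u64 size)
    {
        int rc;

        rc = remove_memory(nid, start, size);   /* takes the device hotplug lock itself */
        if (rc == -EBUSY)
            pr_info("memory [%#llx-%#llx] still online, deferring unplug\n",
                    (unsigned long long)start,
                    (unsigned long long)(start + size - 1));
        return rc;
    }
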
diff --git a/mm/nommu.c b/mm/nommu.c
index eb3e2e558da1..fed1b6e9c89b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1261,7 +1261,9 @@ unsigned long do_mmap(struct file *file,
 	add_nommu_region(region);

 	/* clear anonymous mappings that don't ask for uninitialized data */
-	if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+	if (!vma->vm_file &&
+	    (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
+	     !(flags & MAP_UNINITIALIZED)))
 		memset((void *)region->vm_start, 0,
 		       region->vm_end - region->vm_start);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fd7f45a04eb..e515bfcf7f28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4102,7 +4102,6 @@ static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 					const struct alloc_context *ac)
 {
-	struct reclaim_state reclaim_state;
 	int progress;
 	unsigned int noreclaim_flag;
 	unsigned long pflags;
@@ -4114,13 +4113,10 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	psi_memstall_enter(&pflags);
 	fs_reclaim_acquire(gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
-	reclaim_state.reclaimed_slab = 0;
-	current->reclaim_state = &reclaim_state;

 	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
 								ac->nodemask);

-	current->reclaim_state = NULL;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(gfp_mask);
 	psi_memstall_leave(&pflags);
diff --git a/mm/shmem.c b/mm/shmem.c
index f4dce9c8670d..99497cb32e71 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -400,7 +400,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,

 static int shmem_huge __read_mostly;

-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
 static int shmem_parse_huge(const char *str)
 {
 	if (!strcmp(str, "never"))
@@ -417,7 +417,9 @@ static int shmem_parse_huge(const char *str)
 		return SHMEM_HUGE_FORCE;
 	return -EINVAL;
 }
+#endif

+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 static const char *shmem_format_huge(int huge)
 {
 	switch (huge) {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6c49dbb3769e..807490fe217a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1028,7 +1028,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
 }

 struct kmem_cache *
-kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
+{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
 EXPORT_SYMBOL(kmalloc_caches);

 /*
diff --git a/mm/util.c b/mm/util.c
index 68575a315dc5..e6351a80f248 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
@@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 }
 #endif

+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim)
+{
+	unsigned long locked_vm, limit;
+	int ret = 0;
+
+	lockdep_assert_held_write(&mm->mmap_sem);
+
+	locked_vm = mm->locked_vm;
+	if (inc) {
+		if (!bypass_rlim) {
+			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+			if (locked_vm + pages > limit)
+				ret = -ENOMEM;
+		}
+		if (!ret)
+			mm->locked_vm = locked_vm + pages;
+	} else {
+		WARN_ON_ONCE(pages > locked_vm);
+		mm->locked_vm = locked_vm - pages;
+	}
+
+	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+		 ret ? " - exceeded" : "");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0       on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+	int ret;
+
+	if (pages == 0 || !mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	ret = __account_locked_vm(mm, pages, inc, current,
+				  capable(CAP_IPC_LOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long pgoff)
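
Callers that pin user memory can now do the RLIMIT_MEMLOCK bookkeeping through one helper instead of open-coding it. A hedged sketch of a driver-side user (example_pin() and do_the_actual_pinning() are hypothetical placeholders; account_locked_vm() is the helper added above):

    /* Hypothetical driver path that pins npages of user memory. */
    static int example_pin(struct mm_struct *mm, unsigned long npages)
    {
        int ret;

        ret = account_locked_vm(mm, npages, true);  /* charge against RLIMIT_MEMLOCK */
        if (ret)
            return ret;                             /* -ENOMEM: limit would be exceeded */

        ret = do_the_actual_pinning();              /* placeholder for the real work */
        if (ret)
            account_locked_vm(mm, npages, false);   /* undo the accounting on failure */
        return ret;
    }
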
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8e3dcd527b8..44df66a98f2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,9 @@ struct scan_control {
 		unsigned int file_taken;
 		unsigned int taken;
 	} nr;
+
+	/* for recording the reclaimed slab by now */
+	struct reclaim_state reclaim_state;
 };

 #ifdef ARCH_HAS_PREFETCH
@@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 }
 #endif /* CONFIG_MEMCG_KMEM */

+static void set_task_reclaim_state(struct task_struct *task,
+				   struct reclaim_state *rs)
+{
+	/* Check for an overwrite */
+	WARN_ON_ONCE(rs && task->reclaim_state);
+
+	/* Check for the nulling of an already-nulled member */
+	WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+	task->reclaim_state = rs;
+}
+
 #ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
@@ -3191,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
 		return 1;

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);

 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+	set_task_reclaim_state(current, NULL);

 	return nr_reclaimed;
 }
@@ -3218,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	};
 	unsigned long lru_pages;

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

@@ -3235,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,

 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

+	set_task_reclaim_state(current, NULL);
 	*nr_scanned = sc.nr_scanned;
+
 	return sc.nr_reclaimed;
 }

@@ -3262,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_shrinkslab = 1,
 	};

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
 	 * take care of from where we get pages. So the node where we start the
@@ -3282,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 	psi_memstall_leave(&pflags);

 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+	set_task_reclaim_state(current, NULL);

 	return nr_reclaimed;
 }
@@ -3483,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		.may_unmap = 1,
 	};

+	set_task_reclaim_state(current, &sc.reclaim_state);
 	psi_memstall_enter(&pflags);
 	__fs_reclaim_acquire();

@@ -3664,6 +3687,8 @@ out:
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
+	set_task_reclaim_state(current, NULL);
+
 	/*
 	 * Return the order kswapd stopped reclaiming at as
 	 * prepare_kswapd_sleep() takes it into account. If another caller
@@ -3787,15 +3812,10 @@ static int kswapd(void *p)
 	unsigned int classzone_idx = MAX_NR_ZONES - 1;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
-
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
-	};
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
-	current->reclaim_state = &reclaim_state;

 	/*
 	 * Tell the memory management that we're a "memory allocator",
@@ -3857,7 +3877,6 @@ kswapd_try_sleep:
 	}

 	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
-	current->reclaim_state = NULL;

 	return 0;
 }
@@ -3922,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
  */
 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 {
-	struct reclaim_state reclaim_state;
 	struct scan_control sc = {
 		.nr_to_reclaim = nr_to_reclaim,
 		.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3934,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.hibernation_mode = 1,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
-	struct task_struct *p = current;
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;

 	fs_reclaim_acquire(sc.gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);

 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

-	p->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);

@@ -4110,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	/* Minimum pages needed in order to stay on node */
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
-	struct reclaim_state reclaim_state;
 	unsigned int noreclaim_flag;
 	struct scan_control sc = {
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4135,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	p->flags |= PF_SWAPWRITE;
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
+	set_task_reclaim_state(p, &sc.reclaim_state);

 	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
 		/*
@@ -4148,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}

-	p->reclaim_state = NULL;
+	set_task_reclaim_state(p, NULL);
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
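
With reclaim_state embedded in scan_control, every reclaim entry point follows the same discipline: install the pointer on the task before reclaim, clear it afterwards, and let set_task_reclaim_state() warn on mismatched pairs. A minimal sketch of that pattern (example_reclaim() is hypothetical; the two helper calls mirror the ones added above):

    /* Hypothetical reclaim entry point showing the set/clear pairing. */
    static unsigned long example_reclaim(struct zonelist *zonelist,
                                         struct scan_control *sc)
    {
        unsigned long nr_reclaimed;

        set_task_reclaim_state(current, &sc->reclaim_state);  /* WARNs if already set */
        nr_reclaimed = do_try_to_free_pages(zonelist, sc);    /* reclaim runs with sc->reclaim_state active */
        set_task_reclaim_state(current, NULL);                /* WARNs on a double clear */

        return nr_reclaimed;
    }
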
diff --git a/mm/z3fold.c b/mm/z3fold.c
index dfcd69d08c1e..6c72b18d8b9c 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -101,6 +101,7 @@ struct z3fold_buddy_slots {
  * @refcount:	reference count for the z3fold page
  * @work:	work_struct for page layout optimization
  * @slots:	pointer to the structure holding buddy slots
+ * @pool:	pointer to the containing pool
  * @cpu:	CPU which this page "belongs" to
  * @first_chunks:	the size of the first buddy in chunks, 0 if free
  * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
@@ -114,6 +115,7 @@ struct z3fold_header {
 	struct kref refcount;
 	struct work_struct work;
 	struct z3fold_buddy_slots *slots;
+	struct z3fold_pool *pool;
 	short cpu;
 	unsigned short first_chunks;
 	unsigned short middle_chunks;
@@ -193,8 +195,10 @@ static void compact_page_work(struct work_struct *w);
 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
 							gfp_t gfp)
 {
-	struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
-			gfp);
+	struct z3fold_buddy_slots *slots;
+
+	slots = kmem_cache_alloc(pool->c_handle,
+			(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));

 	if (slots) {
 		memset(slots->slot, 0, sizeof(slots->slot));
@@ -320,6 +324,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
 	zhdr->start_middle = 0;
 	zhdr->cpu = -1;
 	zhdr->slots = slots;
+	zhdr->pool = pool;
 	INIT_LIST_HEAD(&zhdr->buddy);
 	INIT_WORK(&zhdr->work, compact_page_work);
 	return zhdr;
@@ -426,7 +431,7 @@ static enum buddy handle_to_buddy(unsigned long handle)

 static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
 {
-	return slots_to_pool(zhdr->slots);
+	return zhdr->pool;
 }

 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
@@ -850,7 +855,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
 	enum buddy bud;
 	bool can_sleep = gfpflags_allow_blocking(gfp);

-	if (!size || (gfp & __GFP_HIGHMEM))
+	if (!size)
 		return -EINVAL;

 	if (size > PAGE_SIZE)
@@ -1345,24 +1350,29 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
 	zhdr = page_address(page);
 	pool = zhdr_to_pool(zhdr);

-	if (!trylock_page(page))
-		return -EAGAIN;
-
 	if (!z3fold_page_trylock(zhdr)) {
-		unlock_page(page);
 		return -EAGAIN;
 	}
 	if (zhdr->mapped_count != 0) {
 		z3fold_page_unlock(zhdr);
-		unlock_page(page);
 		return -EBUSY;
 	}
+	if (work_pending(&zhdr->work)) {
+		z3fold_page_unlock(zhdr);
+		return -EAGAIN;
+	}
 	new_zhdr = page_address(newpage);
 	memcpy(new_zhdr, zhdr, PAGE_SIZE);
 	newpage->private = page->private;
 	page->private = 0;
 	z3fold_page_unlock(zhdr);
 	spin_lock_init(&new_zhdr->page_lock);
+	INIT_WORK(&new_zhdr->work, compact_page_work);
+	/*
+	 * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
+	 * so we only have to reinitialize it.
+	 */
+	INIT_LIST_HEAD(&new_zhdr->buddy);
 	new_mapping = page_mapping(page);
 	__ClearPageMovable(page);
 	ClearPagePrivate(page);
@@ -1386,7 +1396,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
 	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);

 	page_mapcount_reset(page);
-	unlock_page(page);
 	put_page(page);
 	return 0;
 }