author     Linus Torvalds <torvalds@linux-foundation.org>  2019-03-06 13:31:36 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-03-06 13:31:36 -0500
commit     8dcd175bc3d50b78413c56d5b17d4bddd77412ef (patch)
tree       2c2fb25759b43f2e73830f07ef3b444d76825280
parent     afe6fe7036c6efdcb46cabc64bec9b6e4a005210 (diff)
parent     fff04900ea79915939ef6a3aad78fca6511a3034 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:

 - a few misc things
 - ocfs2 updates
 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (159 commits)
  tools/testing/selftests/proc/proc-self-syscall.c: remove duplicate include
  proc: more robust bulk read test
  proc: test /proc/*/maps, smaps, smaps_rollup, statm
  proc: use seq_puts() everywhere
  proc: read kernel cpu stat pointer once
  proc: remove unused argument in proc_pid_lookup()
  fs/proc/thread_self.c: code cleanup for proc_setup_thread_self()
  fs/proc/self.c: code cleanup for proc_setup_self()
  proc: return exit code 4 for skipped tests
  mm,mremap: bail out earlier in mremap_to under map pressure
  mm/sparse: fix a bad comparison
  mm/memory.c: do_fault: avoid usage of stale vm_area_struct
  writeback: fix inode cgroup switching comment
  mm/huge_memory.c: fix "orig_pud" set but not used
  mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC
  mm/memcontrol.c: fix bad line in comment
  mm/cma.c: cma_declare_contiguous: correct err handling
  mm/page_ext.c: fix an imbalance with kmemleak
  mm/compaction: pass pgdat to too_many_isolated() instead of zone
  mm: remove zone_lru_lock() function, access ->lru_lock directly
  ...
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 16
-rw-r--r--  Documentation/admin-guide/mm/pagemap.rst | 9
-rw-r--r--  Documentation/cgroup-v1/memcg_test.txt | 4
-rw-r--r--  Documentation/cgroup-v1/memory.txt | 4
-rw-r--r--  MAINTAINERS | 8
-rw-r--r--  arch/alpha/include/asm/topology.h | 3
-rw-r--r--  arch/arm64/Kconfig | 4
-rw-r--r--  arch/arm64/include/asm/hugetlb.h | 5
-rw-r--r--  arch/arm64/include/asm/memory.h | 4
-rw-r--r--  arch/arm64/kernel/machine_kexec.c | 3
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 20
-rw-r--r--  arch/arm64/mm/init.c | 27
-rw-r--r--  arch/arm64/mm/numa.c | 2
-rw-r--r--  arch/ia64/kernel/numa.c | 2
-rw-r--r--  arch/ia64/kernel/perfmon.c | 59
-rw-r--r--  arch/ia64/mm/discontig.c | 6
-rw-r--r--  arch/m68k/mm/memory.c | 2
-rw-r--r--  arch/powerpc/include/asm/book3s/64/hugetlb.h | 12
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h | 18
-rw-r--r--  arch/powerpc/include/asm/book3s/64/radix.h | 4
-rw-r--r--  arch/powerpc/include/asm/pci-bridge.h | 3
-rw-r--r--  arch/powerpc/kernel/paca.c | 3
-rw-r--r--  arch/powerpc/kernel/pci-common.c | 3
-rw-r--r--  arch/powerpc/kernel/vdso.c | 2
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c | 25
-rw-r--r--  arch/powerpc/mm/hugetlbpage-radix.c | 17
-rw-r--r--  arch/powerpc/mm/mmu_context_iommu.c | 145
-rw-r--r--  arch/powerpc/mm/numa.c | 16
-rw-r--r--  arch/powerpc/mm/pgtable-book3s64.c | 25
-rw-r--r--  arch/powerpc/mm/pgtable-radix.c | 18
-rw-r--r--  arch/powerpc/platforms/powernv/memtrace.c | 5
-rw-r--r--  arch/riscv/kernel/vdso.c | 1
-rw-r--r--  arch/s390/include/asm/pgtable.h | 5
-rw-r--r--  arch/s390/kernel/vdso.c | 2
-rw-r--r--  arch/s390/mm/pgtable.c | 8
-rw-r--r--  arch/sh/kernel/syscalls/syscalltbl.sh | 4
-rw-r--r--  arch/sh/kernel/syscalls_32.S | 2
-rw-r--r--  arch/sparc/kernel/pci_fire.c | 3
-rw-r--r--  arch/sparc/kernel/pci_schizo.c | 3
-rw-r--r--  arch/sparc/kernel/psycho_common.c | 3
-rw-r--r--  arch/sparc/kernel/sbus.c | 3
-rw-r--r--  arch/sparc/mm/init_64.c | 6
-rw-r--r--  arch/x86/include/asm/paravirt.h | 13
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 5
-rw-r--r--  arch/x86/include/asm/pci.h | 3
-rw-r--r--  arch/x86/include/asm/uaccess.h | 24
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 7
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 3
-rw-r--r--  arch/x86/lib/usercopy_32.c | 8
-rw-r--r--  arch/x86/mm/numa.c | 4
-rw-r--r--  arch/x86/xen/mmu.h | 4
-rw-r--r--  arch/x86/xen/mmu_pv.c | 8
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 5
-rw-r--r--  drivers/char/agp/efficeon-agp.c | 2
-rw-r--r--  drivers/dma/dmaengine.c | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_utils.h | 6
-rw-r--r--  drivers/hv/hv_balloon.c | 21
-rw-r--r--  drivers/infiniband/hw/hfi1/affinity.c | 3
-rw-r--r--  drivers/infiniband/hw/hfi1/init.c | 3
-rw-r--r--  drivers/iommu/dmar.c | 5
-rw-r--r--  drivers/iommu/intel-iommu.c | 3
-rw-r--r--  drivers/misc/sgi-xp/xpc_uv.c | 3
-rw-r--r--  drivers/misc/vmw_balloon.c | 32
-rw-r--r--  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5
-rw-r--r--  drivers/xen/balloon.c | 18
-rw-r--r--  fs/file.c | 1
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/inode.c | 8
-rw-r--r--  fs/kernfs/file.c | 31
-rw-r--r--  fs/ocfs2/alloc.c | 159
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 14
-rw-r--r--  fs/ocfs2/dlmglue.c | 5
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r--  fs/ocfs2/slot_map.c | 8
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/pipe.c | 3
-rw-r--r--  fs/proc/array.c | 16
-rw-r--r--  fs/proc/base.c | 4
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/page.c | 4
-rw-r--r--  fs/proc/root.c | 2
-rw-r--r--  fs/proc/self.c | 16
-rw-r--r--  fs/proc/stat.c | 60
-rw-r--r--  fs/proc/task_mmu.c | 8
-rw-r--r--  fs/proc/task_nommu.c | 2
-rw-r--r--  fs/proc/thread_self.c | 16
-rw-r--r--  include/asm-generic/pgtable.h | 18
-rw-r--r--  include/linux/backing-dev.h | 2
-rw-r--r--  include/linux/balloon_compaction.h | 34
-rw-r--r--  include/linux/cgroup-defs.h | 4
-rw-r--r--  include/linux/compaction.h | 7
-rw-r--r--  include/linux/device.h | 2
-rw-r--r--  include/linux/frontswap.h | 7
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/gfp.h | 30
-rw-r--r--  include/linux/hugetlb.h | 70
-rw-r--r--  include/linux/kasan-checks.h | 2
-rw-r--r--  include/linux/kernfs.h | 6
-rw-r--r--  include/linux/ksm.h | 7
-rw-r--r--  include/linux/list.h | 11
-rw-r--r--  include/linux/memcontrol.h | 47
-rw-r--r--  include/linux/memory_hotplug.h | 2
-rw-r--r--  include/linux/mm.h | 3
-rw-r--r--  include/linux/mm_types.h | 2
-rw-r--r--  include/linux/mmzone.h | 8
-rw-r--r--  include/linux/nodemask.h | 8
-rw-r--r--  include/linux/page-flags.h | 44
-rw-r--r--  include/linux/pagemap.h | 31
-rw-r--r--  include/linux/poison.h | 2
-rw-r--r--  include/linux/sched.h | 5
-rw-r--r--  include/linux/sched/mm.h | 48
-rw-r--r--  include/linux/shmem_fs.h | 3
-rw-r--r--  include/linux/slub_def.h | 12
-rw-r--r--  include/linux/swap.h | 4
-rw-r--r--  include/uapi/linux/fcntl.h | 1
-rw-r--r--  include/uapi/linux/kernel-page-flags.h | 2
-rw-r--r--  init/init_task.c | 3
-rw-r--r--  kernel/cgroup/cgroup.c | 12
-rw-r--r--  kernel/crash_core.c | 2
-rw-r--r--  kernel/kthread.c | 3
-rw-r--r--  kernel/power/snapshot.c | 17
-rw-r--r--  kernel/sched/core.c | 3
-rw-r--r--  kernel/sched/fair.c | 15
-rw-r--r--  kernel/sysctl.c | 2
-rw-r--r--  lib/Kconfig.debug | 31
-rw-r--r--  lib/Kconfig.kasan | 10
-rw-r--r--  lib/Makefile | 1
-rw-r--r--  lib/cpumask.c | 3
-rw-r--r--  lib/test_kasan.c | 24
-rw-r--r--  lib/test_vmalloc.c | 551
-rw-r--r--  mm/Kconfig.debug | 17
-rw-r--r--  mm/cma.c | 4
-rw-r--r--  mm/cma_debug.c | 11
-rw-r--r--  mm/compaction.c | 1039
-rw-r--r--  mm/dmapool.c | 13
-rw-r--r--  mm/failslab.c | 14
-rw-r--r--  mm/filemap.c | 93
-rw-r--r--  mm/gup.c | 200
-rw-r--r--  mm/gup_benchmark.c | 8
-rw-r--r--  mm/huge_memory.c | 37
-rw-r--r--  mm/hugetlb.c | 17
-rw-r--r--  mm/internal.h | 24
-rw-r--r--  mm/kasan/common.c | 2
-rw-r--r--  mm/kasan/generic.c | 19
-rw-r--r--  mm/kasan/generic_report.c | 3
-rw-r--r--  mm/kasan/init.c | 6
-rw-r--r--  mm/kasan/kasan.h | 3
-rw-r--r--  mm/khugepaged.c | 2
-rw-r--r--  mm/ksm.c | 77
-rw-r--r--  mm/list_lru.c | 3
-rw-r--r--  mm/memblock.c | 3
-rw-r--r--  mm/memcontrol.c | 150
-rw-r--r--  mm/memfd.c | 3
-rw-r--r--  mm/memory-failure.c | 14
-rw-r--r--  mm/memory.c | 72
-rw-r--r--  mm/memory_hotplug.c | 55
-rw-r--r--  mm/mempolicy.c | 4
-rw-r--r--  mm/mempool.c | 8
-rw-r--r--  mm/migrate.c | 14
-rw-r--r--  mm/mlock.c | 14
-rw-r--r--  mm/mmap.c | 15
-rw-r--r--  mm/mprotect.c | 6
-rw-r--r--  mm/mremap.c | 17
-rw-r--r--  mm/oom_kill.c | 81
-rw-r--r--  mm/page-writeback.c | 24
-rw-r--r--  mm/page_alloc.c | 160
-rw-r--r--  mm/page_ext.c | 3
-rw-r--r--  mm/page_idle.c | 8
-rw-r--r--  mm/page_owner.c | 8
-rw-r--r--  mm/page_poison.c | 4
-rw-r--r--  mm/readahead.c | 2
-rw-r--r--  mm/rmap.c | 2
-rw-r--r--  mm/shmem.c | 741
-rw-r--r--  mm/slab.c | 34
-rw-r--r--  mm/slab.h | 4
-rw-r--r--  mm/slab_common.c | 12
-rw-r--r--  mm/slub.c | 16
-rw-r--r--  mm/sparse.c | 2
-rw-r--r--  mm/swap.c | 16
-rw-r--r--  mm/swap_state.c | 23
-rw-r--r--  mm/swapfile.c | 487
-rw-r--r--  mm/truncate.c | 6
-rw-r--r--  mm/util.c | 37
-rw-r--r--  mm/vmalloc.c | 459
-rw-r--r--  mm/vmscan.c | 88
-rw-r--r--  mm/vmstat.c | 15
-rw-r--r--  mm/workingset.c | 5
-rw-r--r--  net/core/pktgen.c | 3
-rw-r--r--  net/qrtr/qrtr.c | 3
-rw-r--r--  scripts/Makefile.kasan | 5
-rwxr-xr-x  scripts/decode_stacktrace.sh | 9
-rw-r--r--  scripts/gcc-plugins/Kconfig | 4
-rw-r--r--  tools/include/linux/numa.h | 16
-rw-r--r--  tools/perf/bench/numa.c | 7
-rw-r--r--  tools/testing/selftests/Makefile | 1
-rw-r--r--  tools/testing/selftests/memfd/memfd_test.c | 74
-rw-r--r--  tools/testing/selftests/proc/.gitignore | 1
-rw-r--r--  tools/testing/selftests/proc/Makefile | 1
-rw-r--r--  tools/testing/selftests/proc/proc-loadavg-001.c | 2
-rw-r--r--  tools/testing/selftests/proc/proc-pid-vm.c | 406
-rw-r--r--  tools/testing/selftests/proc/proc-self-map-files-002.c | 2
-rw-r--r--  tools/testing/selftests/proc/proc-self-syscall.c | 3
-rw-r--r--  tools/testing/selftests/proc/proc-self-wchan.c | 2
-rw-r--r--  tools/testing/selftests/proc/read.c | 14
-rw-r--r--  tools/testing/selftests/tmpfs/.gitignore | 1
-rw-r--r--  tools/testing/selftests/tmpfs/Makefile | 7
-rw-r--r--  tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c | 67
-rwxr-xr-x  tools/testing/selftests/vm/run_vmtests | 16
-rw-r--r--  tools/testing/selftests/vm/test_vmalloc.sh | 176
-rw-r--r--  tools/vm/page-types.c | 2
-rw-r--r--  tools/vm/slabinfo.c | 35
213 files changed, 4918 insertions, 2315 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7bf3f129c68b..53d3288c328b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1189,6 +1189,10 @@ PAGE_SIZE multiple when read back.
 		Amount of cached filesystem data that was modified and
 		is currently being written back to disk
 
+	  anon_thp
+		Amount of memory used in anonymous mappings backed by
+		transparent hugepages
+
 	  inactive_anon, active_anon, inactive_file, active_file, unevictable
 		Amount of memory, swap-backed and filesystem-backed,
 		on the internal memory management lists used by the
@@ -1248,6 +1252,18 @@ PAGE_SIZE multiple when read back.
 
 		Amount of reclaimed lazyfree pages
 
+	  thp_fault_alloc
+
+		Number of transparent hugepages which were allocated to satisfy
+		a page fault, including COW faults. This counter is not present
+		when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
+	  thp_collapse_alloc
+
+		Number of transparent hugepages which were allocated to allow
+		collapsing an existing range of pages. This counter is not
+		present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
   memory.swap.current
 	A read-only single value file which exists on non-root
 	cgroups.
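
The two hunks above add new per-cgroup THP counters to memory.stat. As a minimal userspace sketch, not part of the patch (the cgroup path is an assumption; adjust for your mount point), reading the new fields back looks like this:

/* read_thp_stats.c - illustrative only */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/mygroup/memory.stat"; /* assumed path */
	char key[64];
	unsigned long long val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* memory.stat is a flat "key value" list; pick out the new counters. */
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, "anon_thp") ||
		    !strcmp(key, "thp_fault_alloc") ||
		    !strcmp(key, "thp_collapse_alloc"))
			printf("%s = %llu\n", key, val);
	}
	fclose(f);
	return 0;
}

Note that anon_thp is reported in bytes, while the thp_* entries are event counts, per the documentation text above.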
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 3f7bade2c231..340a5aee9b80 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -75,9 +75,10 @@ number of times a page is mapped.
 20. NOPAGE
 21. KSM
 22. THP
-23. BALLOON
+23. OFFLINE
 24. ZERO_PAGE
 25. IDLE
+26. PGTABLE
 
 * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
   memory cgroup each page is charged to, indexed by PFN. Only available when
@@ -118,8 +119,8 @@ Short descriptions to the page flags
     identical memory pages dynamically shared between one or more processes
 22 - THP
     contiguous pages which construct transparent hugepages
-23 - BALLOON
-    balloon compaction page
+23 - OFFLINE
+    page is logically offline
 24 - ZERO_PAGE
     zero page for pfn_zero or huge_zero page
 25 - IDLE
@@ -128,6 +129,8 @@ Short descriptions to the page flags
     Note that this flag may be stale in case the page was accessed via
     a PTE. To make sure the flag is up-to-date one has to read
     ``/sys/kernel/mm/page_idle/bitmap`` first.
+26 - PGTABLE
+    page is in use as a page table
 
 IO related page flags
 ---------------------
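
The pagemap documentation change renumbers bit 23 from BALLOON to OFFLINE and adds the PGTABLE bit 26. A hedged sketch (not from the patch) of checking those two bits for a PFN via /proc/kpageflags, using the bit positions listed above; it requires root and the PFN argument is just an example input:

/* kpageflags_check.c - illustrative only */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define KPF_OFFLINE 23	/* bit numbers per the table above */
#define KPF_PGTABLE 26

int main(int argc, char **argv)
{
	uint64_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
	uint64_t flags;
	int fd = open("/proc/kpageflags", O_RDONLY);

	/* /proc/kpageflags holds one 64-bit flags word per PFN. */
	if (fd < 0 || pread(fd, &flags, sizeof(flags), pfn * 8) != sizeof(flags)) {
		perror("kpageflags");
		return 1;
	}
	printf("pfn %llu: offline=%d pgtable=%d\n",
	       (unsigned long long)pfn,
	       !!(flags & (1ULL << KPF_OFFLINE)),
	       !!(flags & (1ULL << KPF_PGTABLE)));
	close(fd);
	return 0;
}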
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
index 5c7f310f32bb..621e29ffb358 100644
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ b/Documentation/cgroup-v1/memcg_test.txt
@@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 
 8. LRU
     Each memcg has its own private LRU. Now, its handling is under global
-    VM's control (means that it's handled under global zone_lru_lock).
+    VM's control (means that it's handled under global pgdat->lru_lock).
     Almost all routines around memcg's LRU is called by global LRU's
-    list management functions under zone_lru_lock().
+    list management functions under pgdat->lru_lock.
 
     A special function is mem_cgroup_isolate_pages(). This scans
     memcg's private LRU and call __isolate_lru_page() to extract a page
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index 3682e99234c2..a347fc9293e5 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered.
 Other lock order is following:
 PG_locked.
 mm->page_table_lock
-    zone_lru_lock
+    pgdat->lru_lock
   lock_page_cgroup.
   In many cases, just lock_page_cgroup() is called.
   per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  zone_lru_lock, it has no lock of its own.
+  pgdat->lru_lock, it has no lock of its own.
 
 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
 
diff --git a/MAINTAINERS b/MAINTAINERS
index bd549618aea9..c7d3e51c7064 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9835,6 +9835,14 @@ F: kernel/sched/membarrier.c
 F:	include/uapi/linux/membarrier.h
 F:	arch/powerpc/include/asm/membarrier.h
 
+MEMBLOCK
+M:	Mike Rapoport <rppt@linux.ibm.com>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	include/linux/memblock.h
+F:	mm/memblock.c
+F:	Documentation/core-api/boot-time-mm.rst
+
 MEMORY MANAGEMENT
 L:	linux-mm@kvack.org
 W:	http://www.linux-mm.org
diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h
index e6e13a85796a..5a77a40567fa 100644
--- a/arch/alpha/include/asm/topology.h
+++ b/arch/alpha/include/asm/topology.h
@@ -4,6 +4,7 @@
4 4
5#include <linux/smp.h> 5#include <linux/smp.h>
6#include <linux/threads.h> 6#include <linux/threads.h>
7#include <linux/numa.h>
7#include <asm/machvec.h> 8#include <asm/machvec.h>
8 9
9#ifdef CONFIG_NUMA 10#ifdef CONFIG_NUMA
@@ -29,7 +30,7 @@ static const struct cpumask *cpumask_of_node(int node)
29{ 30{
30 int cpu; 31 int cpu;
31 32
32 if (node == -1) 33 if (node == NUMA_NO_NODE)
33 return cpu_all_mask; 34 return cpu_all_mask;
34 35
35 cpumask_clear(&node_to_cpumask_map[node]); 36 cpumask_clear(&node_to_cpumask_map[node]);
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4168d366127..cfbf307d6dc4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1467,6 +1467,10 @@ config SYSVIPC_COMPAT
1467 def_bool y 1467 def_bool y
1468 depends on COMPAT && SYSVIPC 1468 depends on COMPAT && SYSVIPC
1469 1469
1470config ARCH_ENABLE_HUGEPAGE_MIGRATION
1471 def_bool y
1472 depends on HUGETLB_PAGE && MIGRATION
1473
1470menu "Power management options" 1474menu "Power management options"
1471 1475
1472source "kernel/power/Kconfig" 1476source "kernel/power/Kconfig"
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index fb6609875455..c6a07a3b433e 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -20,6 +20,11 @@
20 20
21#include <asm/page.h> 21#include <asm/page.h>
22 22
23#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
24#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported
25extern bool arch_hugetlb_migration_supported(struct hstate *h);
26#endif
27
23#define __HAVE_ARCH_HUGE_PTEP_GET 28#define __HAVE_ARCH_HUGE_PTEP_GET
24static inline pte_t huge_ptep_get(pte_t *ptep) 29static inline pte_t huge_ptep_get(pte_t *ptep)
25{ 30{
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 0c656850eeea..b01ef0180a03 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -80,11 +80,7 @@
80 */ 80 */
81#ifdef CONFIG_KASAN 81#ifdef CONFIG_KASAN
82#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT)) 82#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT))
83#ifdef CONFIG_KASAN_EXTRA
84#define KASAN_THREAD_SHIFT 2
85#else
86#define KASAN_THREAD_SHIFT 1 83#define KASAN_THREAD_SHIFT 1
87#endif /* CONFIG_KASAN_EXTRA */
88#else 84#else
89#define KASAN_SHADOW_SIZE (0) 85#define KASAN_SHADOW_SIZE (0)
90#define KASAN_THREAD_SHIFT 0 86#define KASAN_THREAD_SHIFT 0
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index aa9c94113700..66b5d697d943 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -321,7 +321,7 @@ void crash_post_resume(void)
321 * but does not hold any data of loaded kernel image. 321 * but does not hold any data of loaded kernel image.
322 * 322 *
323 * Note that all the pages in crash dump kernel memory have been initially 323 * Note that all the pages in crash dump kernel memory have been initially
324 * marked as Reserved in kexec_reserve_crashkres_pages(). 324 * marked as Reserved as memory was allocated via memblock_reserve().
325 * 325 *
326 * In hibernation, the pages which are Reserved and yet "nosave" are excluded 326 * In hibernation, the pages which are Reserved and yet "nosave" are excluded
327 * from the hibernation iamge. crash_is_nosave() does thich check for crash 327 * from the hibernation iamge. crash_is_nosave() does thich check for crash
@@ -361,7 +361,6 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
361 361
362 for (addr = begin; addr < end; addr += PAGE_SIZE) { 362 for (addr = begin; addr < end; addr += PAGE_SIZE) {
363 page = phys_to_page(addr); 363 page = phys_to_page(addr);
364 ClearPageReserved(page);
365 free_reserved_page(page); 364 free_reserved_page(page);
366 } 365 }
367} 366}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 28cbc22d7e30..6b4a47b3adf4 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -27,6 +27,26 @@
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
29 29
30#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
31bool arch_hugetlb_migration_supported(struct hstate *h)
32{
33 size_t pagesize = huge_page_size(h);
34
35 switch (pagesize) {
36#ifdef CONFIG_ARM64_4K_PAGES
37 case PUD_SIZE:
38#endif
39 case PMD_SIZE:
40 case CONT_PMD_SIZE:
41 case CONT_PTE_SIZE:
42 return true;
43 }
44 pr_warn("%s: unrecognized huge page size 0x%lx\n",
45 __func__, pagesize);
46 return false;
47}
48#endif
49
30int pmd_huge(pmd_t pmd) 50int pmd_huge(pmd_t pmd)
31{ 51{
32 return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); 52 return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
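
The new arch_hugetlb_migration_supported() hook above lets arm64 veto migration for huge page sizes it cannot relocate. The generic side is not part of this hunk; purely as an illustration of the shape of the plumbing (an assumption about include/linux/hugetlb.h, not a quote of it), the override pattern the #define/extern pair hooks into looks roughly like:

/*
 * Illustrative sketch only: generic code supplies a permissive default
 * and defers to the architecture when it defines its own hook.
 */
#ifndef arch_hugetlb_migration_supported
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
	return true;	/* default: all huge page sizes are migratable */
}
#endif

static inline bool hugepage_migration_supported(struct hstate *h)
{
	/* Final decision belongs to the architecture. */
	return arch_hugetlb_migration_supported(h);
}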
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 7205a9085b4d..c38976b70069 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -118,35 +118,10 @@ static void __init reserve_crashkernel(void)
118 crashk_res.start = crash_base; 118 crashk_res.start = crash_base;
119 crashk_res.end = crash_base + crash_size - 1; 119 crashk_res.end = crash_base + crash_size - 1;
120} 120}
121
122static void __init kexec_reserve_crashkres_pages(void)
123{
124#ifdef CONFIG_HIBERNATION
125 phys_addr_t addr;
126 struct page *page;
127
128 if (!crashk_res.end)
129 return;
130
131 /*
132 * To reduce the size of hibernation image, all the pages are
133 * marked as Reserved initially.
134 */
135 for (addr = crashk_res.start; addr < (crashk_res.end + 1);
136 addr += PAGE_SIZE) {
137 page = phys_to_page(addr);
138 SetPageReserved(page);
139 }
140#endif
141}
142#else 121#else
143static void __init reserve_crashkernel(void) 122static void __init reserve_crashkernel(void)
144{ 123{
145} 124}
146
147static void __init kexec_reserve_crashkres_pages(void)
148{
149}
150#endif /* CONFIG_KEXEC_CORE */ 125#endif /* CONFIG_KEXEC_CORE */
151 126
152#ifdef CONFIG_CRASH_DUMP 127#ifdef CONFIG_CRASH_DUMP
@@ -586,8 +561,6 @@ void __init mem_init(void)
586 /* this will put all unused low memory onto the freelists */ 561 /* this will put all unused low memory onto the freelists */
587 memblock_free_all(); 562 memblock_free_all();
588 563
589 kexec_reserve_crashkres_pages();
590
591 mem_init_print_info(NULL); 564 mem_init_print_info(NULL);
592 565
593 /* 566 /*
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index ae34e3a1cef1..7a0a555b366a 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -120,7 +120,7 @@ static void __init setup_node_to_cpumask_map(void)
120 } 120 }
121 121
122 /* cpumask_of_node() will now work */ 122 /* cpumask_of_node() will now work */
123 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 123 pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
124} 124}
125 125
126/* 126/*
diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
index 92c376279c6d..1315da6c7aeb 100644
--- a/arch/ia64/kernel/numa.c
+++ b/arch/ia64/kernel/numa.c
@@ -74,7 +74,7 @@ void __init build_cpu_to_node_map(void)
74 cpumask_clear(&node_to_cpu_mask[node]); 74 cpumask_clear(&node_to_cpu_mask[node]);
75 75
76 for_each_possible_early_cpu(cpu) { 76 for_each_possible_early_cpu(cpu) {
77 node = -1; 77 node = NUMA_NO_NODE;
78 for (i = 0; i < NR_CPUS; ++i) 78 for (i = 0; i < NR_CPUS; ++i)
79 if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { 79 if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
80 node = node_cpuid[i].nid; 80 node = node_cpuid[i].nid;
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 46bff1661836..7a969f4c3534 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -583,17 +583,6 @@ pfm_put_task(struct task_struct *task)
583 if (task != current) put_task_struct(task); 583 if (task != current) put_task_struct(task);
584} 584}
585 585
586static inline void
587pfm_reserve_page(unsigned long a)
588{
589 SetPageReserved(vmalloc_to_page((void *)a));
590}
591static inline void
592pfm_unreserve_page(unsigned long a)
593{
594 ClearPageReserved(vmalloc_to_page((void*)a));
595}
596
597static inline unsigned long 586static inline unsigned long
598pfm_protect_ctx_ctxsw(pfm_context_t *x) 587pfm_protect_ctx_ctxsw(pfm_context_t *x)
599{ 588{
@@ -816,44 +805,6 @@ pfm_reset_msgq(pfm_context_t *ctx)
816 DPRINT(("ctx=%p msgq reset\n", ctx)); 805 DPRINT(("ctx=%p msgq reset\n", ctx));
817} 806}
818 807
819static void *
820pfm_rvmalloc(unsigned long size)
821{
822 void *mem;
823 unsigned long addr;
824
825 size = PAGE_ALIGN(size);
826 mem = vzalloc(size);
827 if (mem) {
828 //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
829 addr = (unsigned long)mem;
830 while (size > 0) {
831 pfm_reserve_page(addr);
832 addr+=PAGE_SIZE;
833 size-=PAGE_SIZE;
834 }
835 }
836 return mem;
837}
838
839static void
840pfm_rvfree(void *mem, unsigned long size)
841{
842 unsigned long addr;
843
844 if (mem) {
845 DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
846 addr = (unsigned long) mem;
847 while ((long) size > 0) {
848 pfm_unreserve_page(addr);
849 addr+=PAGE_SIZE;
850 size-=PAGE_SIZE;
851 }
852 vfree(mem);
853 }
854 return;
855}
856
857static pfm_context_t * 808static pfm_context_t *
858pfm_context_alloc(int ctx_flags) 809pfm_context_alloc(int ctx_flags)
859{ 810{
@@ -1498,7 +1449,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx)
1498 /* 1449 /*
1499 * free the buffer 1450 * free the buffer
1500 */ 1451 */
1501 pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); 1452 vfree(ctx->ctx_smpl_hdr);
1502 1453
1503 ctx->ctx_smpl_hdr = NULL; 1454 ctx->ctx_smpl_hdr = NULL;
1504 ctx->ctx_smpl_size = 0UL; 1455 ctx->ctx_smpl_size = 0UL;
@@ -2137,7 +2088,7 @@ doit:
2137 * All memory free operations (especially for vmalloc'ed memory) 2088 * All memory free operations (especially for vmalloc'ed memory)
2138 * MUST be done with interrupts ENABLED. 2089 * MUST be done with interrupts ENABLED.
2139 */ 2090 */
2140 if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); 2091 vfree(smpl_buf_addr);
2141 2092
2142 /* 2093 /*
2143 * return the memory used by the context 2094 * return the memory used by the context
@@ -2266,10 +2217,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
2266 2217
2267 /* 2218 /*
2268 * We do the easy to undo allocations first. 2219 * We do the easy to undo allocations first.
2269 *
2270 * pfm_rvmalloc(), clears the buffer, so there is no leak
2271 */ 2220 */
2272 smpl_buf = pfm_rvmalloc(size); 2221 smpl_buf = vzalloc(size);
2273 if (smpl_buf == NULL) { 2222 if (smpl_buf == NULL) {
2274 DPRINT(("Can't allocate sampling buffer\n")); 2223 DPRINT(("Can't allocate sampling buffer\n"));
2275 return -ENOMEM; 2224 return -ENOMEM;
@@ -2346,7 +2295,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
2346error: 2295error:
2347 vm_area_free(vma); 2296 vm_area_free(vma);
2348error_kmem: 2297error_kmem:
2349 pfm_rvfree(smpl_buf, size); 2298 vfree(smpl_buf);
2350 2299
2351 return -ENOMEM; 2300 return -ENOMEM;
2352} 2301}
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 8a965784340c..f9c36750c6a4 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -227,7 +227,7 @@ void __init setup_per_cpu_areas(void)
227 * CPUs are put into groups according to node. Walk cpu_map 227 * CPUs are put into groups according to node. Walk cpu_map
228 * and create new groups at node boundaries. 228 * and create new groups at node boundaries.
229 */ 229 */
230 prev_node = -1; 230 prev_node = NUMA_NO_NODE;
231 ai->nr_groups = 0; 231 ai->nr_groups = 0;
232 for (unit = 0; unit < nr_units; unit++) { 232 for (unit = 0; unit < nr_units; unit++) {
233 cpu = cpu_map[unit]; 233 cpu = cpu_map[unit];
@@ -435,7 +435,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
435{ 435{
436 void *ptr = NULL; 436 void *ptr = NULL;
437 u8 best = 0xff; 437 u8 best = 0xff;
438 int bestnode = -1, node, anynode = 0; 438 int bestnode = NUMA_NO_NODE, node, anynode = 0;
439 439
440 for_each_online_node(node) { 440 for_each_online_node(node) {
441 if (node_isset(node, memory_less_mask)) 441 if (node_isset(node, memory_less_mask))
@@ -447,7 +447,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
447 anynode = node; 447 anynode = node;
448 } 448 }
449 449
450 if (bestnode == -1) 450 if (bestnode == NUMA_NO_NODE)
451 bestnode = anynode; 451 bestnode = anynode;
452 452
453 ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE, 453 ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index b86a2e21693b..227c04fe60d2 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -51,7 +51,7 @@ void __init init_pointer_table(unsigned long ptable)
51 pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp)); 51 pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp));
52 52
53 /* unreserve the page so it's possible to free that page */ 53 /* unreserve the page so it's possible to free that page */
54 PD_PAGE(dp)->flags &= ~(1 << PG_reserved); 54 __ClearPageReserved(PD_PAGE(dp));
55 init_page_count(PD_PAGE(dp)); 55 init_page_count(PD_PAGE(dp));
56 56
57 return; 57 return;
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 5b0177733994..66c1e4f88d65 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -13,6 +13,10 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
13 unsigned long len, unsigned long pgoff, 13 unsigned long len, unsigned long pgoff,
14 unsigned long flags); 14 unsigned long flags);
15 15
16extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
17 unsigned long addr, pte_t *ptep,
18 pte_t old_pte, pte_t pte);
19
16static inline int hstate_get_psize(struct hstate *hstate) 20static inline int hstate_get_psize(struct hstate *hstate)
17{ 21{
18 unsigned long shift; 22 unsigned long shift;
@@ -42,4 +46,12 @@ static inline bool gigantic_page_supported(void)
42/* hugepd entry valid bit */ 46/* hugepd entry valid bit */
43#define HUGEPD_VAL_BITS (0x8000000000000000UL) 47#define HUGEPD_VAL_BITS (0x8000000000000000UL)
44 48
49#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
50extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
51 unsigned long addr, pte_t *ptep);
52
53#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
54extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
55 unsigned long addr, pte_t *ptep,
56 pte_t old_pte, pte_t new_pte);
45#endif 57#endif
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d8c8d7c9df15..868fcaf56f6b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1306,6 +1306,24 @@ static inline int pud_pfn(pud_t pud)
1306 BUILD_BUG(); 1306 BUILD_BUG();
1307 return 0; 1307 return 0;
1308} 1308}
1309#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
1310pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
1311void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
1312 pte_t *, pte_t, pte_t);
1313
1314/*
1315 * Returns true for a R -> RW upgrade of pte
1316 */
1317static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_val)
1318{
1319 if (!(old_val & _PAGE_READ))
1320 return false;
1321
1322 if ((!(old_val & _PAGE_WRITE)) && (new_val & _PAGE_WRITE))
1323 return true;
1324
1325 return false;
1326}
1309 1327
1310#endif /* __ASSEMBLY__ */ 1328#endif /* __ASSEMBLY__ */
1311#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ 1329#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
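
The declarations above belong to a series-wide change that passes the vma (rather than just the mm) to the modify_prot transaction hooks and hands the old PTE value to the commit side, so architectures can do per-VMA flushing (see the radix and s390 hunks below). A hedged usage sketch of the start/commit pattern, modeled on but not copied from the mm/mprotect.c caller (sketch_change_prot is a made-up name):

/*
 * The start/commit pair brackets a read-modify-write of one PTE so the
 * architecture can fend off concurrent hardware updates in between.
 */
static void sketch_change_prot(struct vm_area_struct *vma, unsigned long addr,
			       pte_t *ptep, pgprot_t newprot)
{
	pte_t oldpte, newpte;

	oldpte = ptep_modify_prot_start(vma, addr, ptep);	/* PTE parked as invalid */
	newpte = pte_modify(oldpte, newprot);			/* compute new protections */
	ptep_modify_prot_commit(vma, addr, ptep, oldpte, newpte); /* install and revalidate */
}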
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 7d1a3d1543fc..5ab134eeed20 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -127,6 +127,10 @@ extern void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep
127 pte_t entry, unsigned long address, 127 pte_t entry, unsigned long address,
128 int psize); 128 int psize);
129 129
130extern void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
131 unsigned long addr, pte_t *ptep,
132 pte_t old_pte, pte_t pte);
133
130static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, 134static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
131 unsigned long set) 135 unsigned long set)
132{ 136{
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index aee4fcc24990..77fc21278fa2 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -10,6 +10,7 @@
10#include <linux/pci.h> 10#include <linux/pci.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/ioport.h> 12#include <linux/ioport.h>
13#include <linux/numa.h>
13 14
14struct device_node; 15struct device_node;
15 16
@@ -265,7 +266,7 @@ extern int pcibios_map_io_space(struct pci_bus *bus);
265#ifdef CONFIG_NUMA 266#ifdef CONFIG_NUMA
266#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE)) 267#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE))
267#else 268#else
268#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1) 269#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE)
269#endif 270#endif
270 271
271#endif /* CONFIG_PPC64 */ 272#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 913bfca09c4f..b8480127793d 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -11,6 +11,7 @@
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/memblock.h> 12#include <linux/memblock.h>
13#include <linux/sched/task.h> 13#include <linux/sched/task.h>
14#include <linux/numa.h>
14 15
15#include <asm/lppaca.h> 16#include <asm/lppaca.h>
16#include <asm/paca.h> 17#include <asm/paca.h>
@@ -36,7 +37,7 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align,
36 * which will put its paca in the right place. 37 * which will put its paca in the right place.
37 */ 38 */
38 if (cpu == boot_cpuid) { 39 if (cpu == boot_cpuid) {
39 nid = -1; 40 nid = NUMA_NO_NODE;
40 memblock_set_bottom_up(true); 41 memblock_set_bottom_up(true);
41 } else { 42 } else {
42 nid = early_cpu_to_node(cpu); 43 nid = early_cpu_to_node(cpu);
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 88e4f69a09e5..4538e8ddde80 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -32,6 +32,7 @@
32#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <linux/vgaarb.h> 34#include <linux/vgaarb.h>
35#include <linux/numa.h>
35 36
36#include <asm/processor.h> 37#include <asm/processor.h>
37#include <asm/io.h> 38#include <asm/io.h>
@@ -132,7 +133,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
132 int nid = of_node_to_nid(dev); 133 int nid = of_node_to_nid(dev);
133 134
134 if (nid < 0 || !node_online(nid)) 135 if (nid < 0 || !node_online(nid))
135 nid = -1; 136 nid = NUMA_NO_NODE;
136 137
137 PHB_SET_NODE(phb, nid); 138 PHB_SET_NODE(phb, nid);
138 } 139 }
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 7725a9714736..a31b6234fcd7 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -798,7 +798,6 @@ static int __init vdso_init(void)
798 BUG_ON(vdso32_pagelist == NULL); 798 BUG_ON(vdso32_pagelist == NULL);
799 for (i = 0; i < vdso32_pages; i++) { 799 for (i = 0; i < vdso32_pages; i++) {
800 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); 800 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
801 ClearPageReserved(pg);
802 get_page(pg); 801 get_page(pg);
803 vdso32_pagelist[i] = pg; 802 vdso32_pagelist[i] = pg;
804 } 803 }
@@ -812,7 +811,6 @@ static int __init vdso_init(void)
812 BUG_ON(vdso64_pagelist == NULL); 811 BUG_ON(vdso64_pagelist == NULL);
813 for (i = 0; i < vdso64_pages; i++) { 812 for (i = 0; i < vdso64_pages; i++) {
814 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); 813 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
815 ClearPageReserved(pg);
816 get_page(pg); 814 get_page(pg);
817 vdso64_pagelist[i] = pg; 815 vdso64_pagelist[i] = pg;
818 } 816 }
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 2e6a8f9345d3..367ce3a4a503 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -121,3 +121,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
121 *ptep = __pte(new_pte & ~H_PAGE_BUSY); 121 *ptep = __pte(new_pte & ~H_PAGE_BUSY);
122 return 0; 122 return 0;
123} 123}
124
125pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
126 unsigned long addr, pte_t *ptep)
127{
128 unsigned long pte_val;
129 /*
130 * Clear the _PAGE_PRESENT so that no hardware parallel update is
131 * possible. Also keep the pte_present true so that we don't take
132 * wrong fault.
133 */
134 pte_val = pte_update(vma->vm_mm, addr, ptep,
135 _PAGE_PRESENT, _PAGE_INVALID, 1);
136
137 return __pte(pte_val);
138}
139
140void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
141 pte_t *ptep, pte_t old_pte, pte_t pte)
142{
143
144 if (radix_enabled())
145 return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
146 old_pte, pte);
147 set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
148}
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 2486bee0f93e..11d9ea28a816 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -90,3 +90,20 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
90 90
91 return vm_unmapped_area(&info); 91 return vm_unmapped_area(&info);
92} 92}
93
94void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
95 unsigned long addr, pte_t *ptep,
96 pte_t old_pte, pte_t pte)
97{
98 struct mm_struct *mm = vma->vm_mm;
99
100 /*
101 * To avoid NMMU hang while relaxing access we need to flush the tlb before
102 * we set the new value.
103 */
104 if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
105 (atomic_read(&mm->context.copros) > 0))
106 radix__flush_hugetlb_page(vma, addr);
107
108 set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
109}
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index a712a650a8b6..e7a9c4f6bfca 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -21,6 +21,7 @@
21#include <linux/sizes.h> 21#include <linux/sizes.h>
22#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
23#include <asm/pte-walk.h> 23#include <asm/pte-walk.h>
24#include <linux/mm_inline.h>
24 25
25static DEFINE_MUTEX(mem_list_mutex); 26static DEFINE_MUTEX(mem_list_mutex);
26 27
@@ -34,8 +35,18 @@ struct mm_iommu_table_group_mem_t {
34 atomic64_t mapped; 35 atomic64_t mapped;
35 unsigned int pageshift; 36 unsigned int pageshift;
36 u64 ua; /* userspace address */ 37 u64 ua; /* userspace address */
37 u64 entries; /* number of entries in hpas[] */ 38 u64 entries; /* number of entries in hpas/hpages[] */
38 u64 *hpas; /* vmalloc'ed */ 39 /*
40 * in mm_iommu_get we temporarily use this to store
41 * struct page address.
42 *
43 * We need to convert ua to hpa in real mode. Make it
44 * simpler by storing physical address.
45 */
46 union {
47 struct page **hpages; /* vmalloc'ed */
48 phys_addr_t *hpas;
49 };
39#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) 50#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
40 u64 dev_hpa; /* Device memory base address */ 51 u64 dev_hpa; /* Device memory base address */
41}; 52};
@@ -80,64 +91,13 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
80} 91}
81EXPORT_SYMBOL_GPL(mm_iommu_preregistered); 92EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
82 93
83/*
84 * Taken from alloc_migrate_target with changes to remove CMA allocations
85 */
86struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
87{
88 gfp_t gfp_mask = GFP_USER;
89 struct page *new_page;
90
91 if (PageCompound(page))
92 return NULL;
93
94 if (PageHighMem(page))
95 gfp_mask |= __GFP_HIGHMEM;
96
97 /*
98 * We don't want the allocation to force an OOM if possibe
99 */
100 new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
101 return new_page;
102}
103
104static int mm_iommu_move_page_from_cma(struct page *page)
105{
106 int ret = 0;
107 LIST_HEAD(cma_migrate_pages);
108
109 /* Ignore huge pages for now */
110 if (PageCompound(page))
111 return -EBUSY;
112
113 lru_add_drain();
114 ret = isolate_lru_page(page);
115 if (ret)
116 return ret;
117
118 list_add(&page->lru, &cma_migrate_pages);
119 put_page(page); /* Drop the gup reference */
120
121 ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
122 NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
123 if (ret) {
124 if (!list_empty(&cma_migrate_pages))
125 putback_movable_pages(&cma_migrate_pages);
126 }
127
128 return 0;
129}
130
131static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, 94static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
132 unsigned long entries, unsigned long dev_hpa, 95 unsigned long entries, unsigned long dev_hpa,
133 struct mm_iommu_table_group_mem_t **pmem) 96 struct mm_iommu_table_group_mem_t **pmem)
134{ 97{
135 struct mm_iommu_table_group_mem_t *mem; 98 struct mm_iommu_table_group_mem_t *mem;
136 long i, j, ret = 0, locked_entries = 0; 99 long i, ret, locked_entries = 0;
137 unsigned int pageshift; 100 unsigned int pageshift;
138 unsigned long flags;
139 unsigned long cur_ua;
140 struct page *page = NULL;
141 101
142 mutex_lock(&mem_list_mutex); 102 mutex_lock(&mem_list_mutex);
143 103
@@ -187,62 +147,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
187 goto unlock_exit; 147 goto unlock_exit;
188 } 148 }
189 149
150 down_read(&mm->mmap_sem);
151 ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
152 up_read(&mm->mmap_sem);
153 if (ret != entries) {
154 /* free the reference taken */
155 for (i = 0; i < ret; i++)
156 put_page(mem->hpages[i]);
157
158 vfree(mem->hpas);
159 kfree(mem);
160 ret = -EFAULT;
161 goto unlock_exit;
162 }
163
164 pageshift = PAGE_SHIFT;
190 for (i = 0; i < entries; ++i) { 165 for (i = 0; i < entries; ++i) {
191 cur_ua = ua + (i << PAGE_SHIFT); 166 struct page *page = mem->hpages[i];
192 if (1 != get_user_pages_fast(cur_ua, 167
193 1/* pages */, 1/* iswrite */, &page)) {
194 ret = -EFAULT;
195 for (j = 0; j < i; ++j)
196 put_page(pfn_to_page(mem->hpas[j] >>
197 PAGE_SHIFT));
198 vfree(mem->hpas);
199 kfree(mem);
200 goto unlock_exit;
201 }
202 /* 168 /*
203 * If we get a page from the CMA zone, since we are going to 169 * Allow to use larger than 64k IOMMU pages. Only do that
204 * be pinning these entries, we might as well move them out 170 * if we are backed by hugetlb.
205 * of the CMA zone if possible. NOTE: faulting in + migration
206 * can be expensive. Batching can be considered later
207 */ 171 */
208 if (is_migrate_cma_page(page)) { 172 if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
209 if (mm_iommu_move_page_from_cma(page))
210 goto populate;
211 if (1 != get_user_pages_fast(cur_ua,
212 1/* pages */, 1/* iswrite */,
213 &page)) {
214 ret = -EFAULT;
215 for (j = 0; j < i; ++j)
216 put_page(pfn_to_page(mem->hpas[j] >>
217 PAGE_SHIFT));
218 vfree(mem->hpas);
219 kfree(mem);
220 goto unlock_exit;
221 }
222 }
223populate:
224 pageshift = PAGE_SHIFT;
225 if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
226 pte_t *pte;
227 struct page *head = compound_head(page); 173 struct page *head = compound_head(page);
228 unsigned int compshift = compound_order(head); 174
229 unsigned int pteshift; 175 pageshift = compound_order(head) + PAGE_SHIFT;
230
231 local_irq_save(flags); /* disables as well */
232 pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
233
234 /* Double check it is still the same pinned page */
235 if (pte && pte_page(*pte) == head &&
236 pteshift == compshift + PAGE_SHIFT)
237 pageshift = max_t(unsigned int, pteshift,
238 PAGE_SHIFT);
239 local_irq_restore(flags);
240 } 176 }
241 mem->pageshift = min(mem->pageshift, pageshift); 177 mem->pageshift = min(mem->pageshift, pageshift);
178 /*
179 * We don't need struct page reference any more, switch
180 * to physical address.
181 */
242 mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; 182 mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
243 } 183 }
244 184
245good_exit: 185good_exit:
186 ret = 0;
246 atomic64_set(&mem->mapped, 1); 187 atomic64_set(&mem->mapped, 1);
247 mem->used = 1; 188 mem->used = 1;
248 mem->ua = ua; 189 mem->ua = ua;
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 87f0dd004295..df1e11ebbabb 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -84,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void)
84 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 84 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
85 85
86 /* cpumask_of_node() will now work */ 86 /* cpumask_of_node() will now work */
87 dbg("Node to cpumask map for %d nodes\n", nr_node_ids); 87 dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
88} 88}
89 89
90static int __init fake_numa_create_new_node(unsigned long end_pfn, 90static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -215,7 +215,7 @@ static void initialize_distance_lookup_table(int nid,
215 */ 215 */
216static int associativity_to_nid(const __be32 *associativity) 216static int associativity_to_nid(const __be32 *associativity)
217{ 217{
218 int nid = -1; 218 int nid = NUMA_NO_NODE;
219 219
220 if (min_common_depth == -1) 220 if (min_common_depth == -1)
221 goto out; 221 goto out;
@@ -225,7 +225,7 @@ static int associativity_to_nid(const __be32 *associativity)
225 225
226 /* POWER4 LPAR uses 0xffff as invalid node */ 226 /* POWER4 LPAR uses 0xffff as invalid node */
227 if (nid == 0xffff || nid >= MAX_NUMNODES) 227 if (nid == 0xffff || nid >= MAX_NUMNODES)
228 nid = -1; 228 nid = NUMA_NO_NODE;
229 229
230 if (nid > 0 && 230 if (nid > 0 &&
231 of_read_number(associativity, 1) >= distance_ref_points_depth) { 231 of_read_number(associativity, 1) >= distance_ref_points_depth) {
@@ -244,7 +244,7 @@ out:
244 */ 244 */
245static int of_node_to_nid_single(struct device_node *device) 245static int of_node_to_nid_single(struct device_node *device)
246{ 246{
247 int nid = -1; 247 int nid = NUMA_NO_NODE;
248 const __be32 *tmp; 248 const __be32 *tmp;
249 249
250 tmp = of_get_associativity(device); 250 tmp = of_get_associativity(device);
@@ -256,7 +256,7 @@ static int of_node_to_nid_single(struct device_node *device)
256/* Walk the device tree upwards, looking for an associativity id */ 256/* Walk the device tree upwards, looking for an associativity id */
257int of_node_to_nid(struct device_node *device) 257int of_node_to_nid(struct device_node *device)
258{ 258{
259 int nid = -1; 259 int nid = NUMA_NO_NODE;
260 260
261 of_node_get(device); 261 of_node_get(device);
262 while (device) { 262 while (device) {
@@ -454,7 +454,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
454 */ 454 */
455static int numa_setup_cpu(unsigned long lcpu) 455static int numa_setup_cpu(unsigned long lcpu)
456{ 456{
457 int nid = -1; 457 int nid = NUMA_NO_NODE;
458 struct device_node *cpu; 458 struct device_node *cpu;
459 459
460 /* 460 /*
@@ -930,7 +930,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
930{ 930{
931 struct drmem_lmb *lmb; 931 struct drmem_lmb *lmb;
932 unsigned long lmb_size; 932 unsigned long lmb_size;
933 int nid = -1; 933 int nid = NUMA_NO_NODE;
934 934
935 lmb_size = drmem_lmb_size(); 935 lmb_size = drmem_lmb_size();
936 936
@@ -960,7 +960,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
960static int hot_add_node_scn_to_nid(unsigned long scn_addr) 960static int hot_add_node_scn_to_nid(unsigned long scn_addr)
961{ 961{
962 struct device_node *memory; 962 struct device_node *memory;
963 int nid = -1; 963 int nid = NUMA_NO_NODE;
964 964
965 for_each_node_by_type(memory, "memory") { 965 for_each_node_by_type(memory, "memory") {
966 unsigned long start, size; 966 unsigned long start, size;
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index ecd31569a120..e7da590c7a78 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -401,6 +401,31 @@ void arch_report_meminfo(struct seq_file *m)
401} 401}
402#endif /* CONFIG_PROC_FS */ 402#endif /* CONFIG_PROC_FS */
403 403
404pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
405 pte_t *ptep)
406{
407 unsigned long pte_val;
408
409 /*
410 * Clear the _PAGE_PRESENT so that no hardware parallel update is
411 * possible. Also keep the pte_present true so that we don't take
412 * wrong fault.
413 */
414 pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
415
416 return __pte(pte_val);
417
418}
419
420void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
421 pte_t *ptep, pte_t old_pte, pte_t pte)
422{
423 if (radix_enabled())
424 return radix__ptep_modify_prot_commit(vma, addr,
425 ptep, old_pte, pte);
426 set_pte_at(vma->vm_mm, addr, ptep, pte);
427}
428
404/* 429/*
405 * For hash translation mode, we use the deposited table to store hash slot 430 * For hash translation mode, we use the deposited table to store hash slot
406 * information and they are stored at PTRS_PER_PMD offset from related pmd 431 * information and they are stored at PTRS_PER_PMD offset from related pmd
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 931156069a81..dced3cd241c2 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -1063,3 +1063,21 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1063 } 1063 }
1064 /* See ptesync comment in radix__set_pte_at */ 1064 /* See ptesync comment in radix__set_pte_at */
1065} 1065}
1066
1067void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1068 unsigned long addr, pte_t *ptep,
1069 pte_t old_pte, pte_t pte)
1070{
1071 struct mm_struct *mm = vma->vm_mm;
1072
1073 /*
1074 * To avoid NMMU hang while relaxing access we need to flush the tlb before
1075 * we set the new value. We need to do this only for radix, because hash
1076 * translation does flush when updating the linux pte.
1077 */
1078 if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1079 (atomic_read(&mm->context.copros) > 0))
1080 radix__flush_tlb_page(vma, addr);
1081
1082 set_pte_at(mm, addr, ptep, pte);
1083}
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 84d038ed3882..248a38ad25c7 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -20,6 +20,7 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/memory.h> 21#include <linux/memory.h>
22#include <linux/memory_hotplug.h> 22#include <linux/memory_hotplug.h>
23#include <linux/numa.h>
23#include <asm/machdep.h> 24#include <asm/machdep.h>
24#include <asm/debugfs.h> 25#include <asm/debugfs.h>
25 26
@@ -223,7 +224,7 @@ static int memtrace_online(void)
223 ent = &memtrace_array[i]; 224 ent = &memtrace_array[i];
224 225
225 /* We have onlined this chunk previously */ 226 /* We have onlined this chunk previously */
226 if (ent->nid == -1) 227 if (ent->nid == NUMA_NO_NODE)
227 continue; 228 continue;
228 229
229 /* Remove from io mappings */ 230 /* Remove from io mappings */
@@ -257,7 +258,7 @@ static int memtrace_online(void)
257 */ 258 */
258 debugfs_remove_recursive(ent->dir); 259 debugfs_remove_recursive(ent->dir);
259 pr_info("Added trace memory back to node %d\n", ent->nid); 260 pr_info("Added trace memory back to node %d\n", ent->nid);
260 ent->size = ent->start = ent->nid = -1; 261 ent->size = ent->start = ent->nid = NUMA_NO_NODE;
261 } 262 }
262 if (ret) 263 if (ret)
263 return ret; 264 return ret;
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index 582cb153eb24..0cd044122234 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -54,7 +54,6 @@ static int __init vdso_init(void)
54 struct page *pg; 54 struct page *pg;
55 55
56 pg = virt_to_page(vdso_start + (i << PAGE_SHIFT)); 56 pg = virt_to_page(vdso_start + (i << PAGE_SHIFT));
57 ClearPageReserved(pg);
58 vdso_pagelist[i] = pg; 57 vdso_pagelist[i] = pg;
59 } 58 }
60 vdso_pagelist[i] = virt_to_page(vdso_data); 59 vdso_pagelist[i] = virt_to_page(vdso_data);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 063732414dfb..76dc344edb8c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1069,8 +1069,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
1069} 1069}
1070 1070
1071#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION 1071#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
1072pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *); 1072pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
1073void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t); 1073void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
1074 pte_t *, pte_t, pte_t);
1074 1075
1075#define __HAVE_ARCH_PTEP_CLEAR_FLUSH 1076#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
1076static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, 1077static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 4ff354887db4..e7920a68a12e 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -291,7 +291,6 @@ static int __init vdso_init(void)
291 BUG_ON(vdso32_pagelist == NULL); 291 BUG_ON(vdso32_pagelist == NULL);
292 for (i = 0; i < vdso32_pages - 1; i++) { 292 for (i = 0; i < vdso32_pages - 1; i++) {
293 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); 293 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
294 ClearPageReserved(pg);
295 get_page(pg); 294 get_page(pg);
296 vdso32_pagelist[i] = pg; 295 vdso32_pagelist[i] = pg;
297 } 296 }
@@ -309,7 +308,6 @@ static int __init vdso_init(void)
309 BUG_ON(vdso64_pagelist == NULL); 308 BUG_ON(vdso64_pagelist == NULL);
310 for (i = 0; i < vdso64_pages - 1; i++) { 309 for (i = 0; i < vdso64_pages - 1; i++) {
311 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); 310 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
312 ClearPageReserved(pg);
313 get_page(pg); 311 get_page(pg);
314 vdso64_pagelist[i] = pg; 312 vdso64_pagelist[i] = pg;
315 } 313 }
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 689b66f29fc6..8485d6dc2754 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -301,12 +301,13 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
301} 301}
302EXPORT_SYMBOL(ptep_xchg_lazy); 302EXPORT_SYMBOL(ptep_xchg_lazy);
303 303
304pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, 304pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
305 pte_t *ptep) 305 pte_t *ptep)
306{ 306{
307 pgste_t pgste; 307 pgste_t pgste;
308 pte_t old; 308 pte_t old;
309 int nodat; 309 int nodat;
310 struct mm_struct *mm = vma->vm_mm;
310 311
311 preempt_disable(); 312 preempt_disable();
312 pgste = ptep_xchg_start(mm, addr, ptep); 313 pgste = ptep_xchg_start(mm, addr, ptep);
@@ -319,10 +320,11 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
319 return old; 320 return old;
320} 321}
321 322
322void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 323void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
323 pte_t *ptep, pte_t pte) 324 pte_t *ptep, pte_t old_pte, pte_t pte)
324{ 325{
325 pgste_t pgste; 326 pgste_t pgste;
327 struct mm_struct *mm = vma->vm_mm;
326 328
327 if (!MACHINE_HAS_NX) 329 if (!MACHINE_HAS_NX)
328 pte_val(pte) &= ~_PAGE_NOEXEC; 330 pte_val(pte) &= ~_PAGE_NOEXEC;
diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh
index 85d78d9309ad..904b8e6e625d 100644
--- a/arch/sh/kernel/syscalls/syscalltbl.sh
+++ b/arch/sh/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
13 t_entry="$3" 13 t_entry="$3"
14 14
15 while [ $t_nxt -lt $t_nr ]; do 15 while [ $t_nxt -lt $t_nr ]; do
16 printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}" 16 printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
17 t_nxt=$((t_nxt+1)) 17 t_nxt=$((t_nxt+1))
18 done 18 done
19 printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" 19 printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
20} 20}
21 21
22grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( 22grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 96e9c54a07f5..bd1a9c544767 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -10,7 +10,7 @@
10#include <linux/sys.h> 10#include <linux/sys.h>
11#include <linux/linkage.h> 11#include <linux/linkage.h>
12 12
13#define __SYSCALL(nr, entry, nargs) .long entry 13#define __SYSCALL(nr, entry) .long entry
14 .data 14 .data
15ENTRY(sys_call_table) 15ENTRY(sys_call_table)
16#include <asm/syscall_table.h> 16#include <asm/syscall_table.h>
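The two sh hunks belong together: the table generator stops emitting a third "number of arguments" column, and the assembly side defines __SYSCALL() with two parameters to match. Assuming the usual shape of the generated asm/syscall_table.h, the result looks roughly like this (entries illustrative):

/* generated header (illustrative entries) */
__SYSCALL(0,sys_restart_syscall)
__SYSCALL(1,sys_exit)
/* holes in the table are padded with sys_ni_syscall by the emit() loop */

/* syscalls_32.S turns every entry into one table slot */
#define __SYSCALL(nr, entry)	.long entry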
diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c
index be71ae086622..0ca08d455e80 100644
--- a/arch/sparc/kernel/pci_fire.c
+++ b/arch/sparc/kernel/pci_fire.c
@@ -11,6 +11,7 @@
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/irq.h> 12#include <linux/irq.h>
13#include <linux/of_device.h> 13#include <linux/of_device.h>
14#include <linux/numa.h>
14 15
15#include <asm/prom.h> 16#include <asm/prom.h>
16#include <asm/irq.h> 17#include <asm/irq.h>
@@ -416,7 +417,7 @@ static int pci_fire_pbm_init(struct pci_pbm_info *pbm,
416 struct device_node *dp = op->dev.of_node; 417 struct device_node *dp = op->dev.of_node;
417 int err; 418 int err;
418 419
419 pbm->numa_node = -1; 420 pbm->numa_node = NUMA_NO_NODE;
420 421
421 pbm->pci_ops = &sun4u_pci_ops; 422 pbm->pci_ops = &sun4u_pci_ops;
422 pbm->config_space_reg_bits = 12; 423 pbm->config_space_reg_bits = 12;
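This is the first of many hunks in the series (the other sparc PBMs plus the x86, IOMMU, InfiniBand, DMA-engine and network-driver changes below follow the same pattern) that replace a literal -1 node id with NUMA_NO_NODE from <linux/numa.h>. A minimal sketch of the idiom, with a hypothetical helper name:

#include <linux/numa.h>		/* NUMA_NO_NODE is defined as (-1) */
#include <linux/nodemask.h>

static int sketch_pick_node(int proposed_nid)
{
	/* fall back to "no node" when the proposed node is not online */
	return node_online(proposed_nid) ? proposed_nid : NUMA_NO_NODE;
}

The value does not change; the symbolic name just makes the "no NUMA affinity" case searchable and self-describing.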
diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c
index 934b97c72f7c..421aba00e6b0 100644
--- a/arch/sparc/kernel/pci_schizo.c
+++ b/arch/sparc/kernel/pci_schizo.c
@@ -12,6 +12,7 @@
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/of_device.h> 14#include <linux/of_device.h>
15#include <linux/numa.h>
15 16
16#include <asm/iommu.h> 17#include <asm/iommu.h>
17#include <asm/irq.h> 18#include <asm/irq.h>
@@ -1347,7 +1348,7 @@ static int schizo_pbm_init(struct pci_pbm_info *pbm,
1347 pbm->next = pci_pbm_root; 1348 pbm->next = pci_pbm_root;
1348 pci_pbm_root = pbm; 1349 pci_pbm_root = pbm;
1349 1350
1350 pbm->numa_node = -1; 1351 pbm->numa_node = NUMA_NO_NODE;
1351 1352
1352 pbm->pci_ops = &sun4u_pci_ops; 1353 pbm->pci_ops = &sun4u_pci_ops;
1353 pbm->config_space_reg_bits = 8; 1354 pbm->config_space_reg_bits = 8;
diff --git a/arch/sparc/kernel/psycho_common.c b/arch/sparc/kernel/psycho_common.c
index 81aa91e5c0e6..e90bcb6bad7f 100644
--- a/arch/sparc/kernel/psycho_common.c
+++ b/arch/sparc/kernel/psycho_common.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/numa.h>
8 9
9#include <asm/upa.h> 10#include <asm/upa.h>
10 11
@@ -454,7 +455,7 @@ void psycho_pbm_init_common(struct pci_pbm_info *pbm, struct platform_device *op
454 struct device_node *dp = op->dev.of_node; 455 struct device_node *dp = op->dev.of_node;
455 456
456 pbm->name = dp->full_name; 457 pbm->name = dp->full_name;
457 pbm->numa_node = -1; 458 pbm->numa_node = NUMA_NO_NODE;
458 pbm->chip_type = chip_type; 459 pbm->chip_type = chip_type;
459 pbm->chip_version = of_getintprop_default(dp, "version#", 0); 460 pbm->chip_version = of_getintprop_default(dp, "version#", 0);
460 pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0); 461 pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0);
diff --git a/arch/sparc/kernel/sbus.c b/arch/sparc/kernel/sbus.c
index 41c5deb581b8..32141e1006c4 100644
--- a/arch/sparc/kernel/sbus.c
+++ b/arch/sparc/kernel/sbus.c
@@ -15,6 +15,7 @@
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/of.h> 16#include <linux/of.h>
17#include <linux/of_device.h> 17#include <linux/of_device.h>
18#include <linux/numa.h>
18 19
19#include <asm/page.h> 20#include <asm/page.h>
20#include <asm/io.h> 21#include <asm/io.h>
@@ -561,7 +562,7 @@ static void __init sbus_iommu_init(struct platform_device *op)
561 562
562 op->dev.archdata.iommu = iommu; 563 op->dev.archdata.iommu = iommu;
563 op->dev.archdata.stc = strbuf; 564 op->dev.archdata.stc = strbuf;
564 op->dev.archdata.numa_node = -1; 565 op->dev.archdata.numa_node = NUMA_NO_NODE;
565 566
566 reg_base = regs + SYSIO_IOMMUREG_BASE; 567 reg_base = regs + SYSIO_IOMMUREG_BASE;
567 iommu->iommu_control = reg_base + IOMMU_CONTROL; 568 iommu->iommu_control = reg_base + IOMMU_CONTROL;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index b4221d3727d0..9e6bd868ba6f 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -976,13 +976,13 @@ static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid)
976{ 976{
977 int prev_nid, new_nid; 977 int prev_nid, new_nid;
978 978
979 prev_nid = -1; 979 prev_nid = NUMA_NO_NODE;
980 for ( ; start < end; start += PAGE_SIZE) { 980 for ( ; start < end; start += PAGE_SIZE) {
981 for (new_nid = 0; new_nid < num_node_masks; new_nid++) { 981 for (new_nid = 0; new_nid < num_node_masks; new_nid++) {
982 struct node_mem_mask *p = &node_masks[new_nid]; 982 struct node_mem_mask *p = &node_masks[new_nid];
983 983
984 if ((start & p->mask) == p->match) { 984 if ((start & p->mask) == p->match) {
985 if (prev_nid == -1) 985 if (prev_nid == NUMA_NO_NODE)
986 prev_nid = new_nid; 986 prev_nid = new_nid;
987 break; 987 break;
988 } 988 }
@@ -1208,7 +1208,7 @@ int of_node_to_nid(struct device_node *dp)
1208 md = mdesc_grab(); 1208 md = mdesc_grab();
1209 1209
1210 count = 0; 1210 count = 0;
1211 nid = -1; 1211 nid = NUMA_NO_NODE;
1212 mdesc_for_each_node_by_name(md, grp, "group") { 1212 mdesc_for_each_node_by_name(md, grp, "group") {
1213 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { 1213 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
1214 nid = count; 1214 nid = count;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a97f28d914d5..c25c38a05c1c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -422,25 +422,26 @@ static inline pgdval_t pgd_val(pgd_t pgd)
422} 422}
423 423
424#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION 424#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
425static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, 425static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
426 pte_t *ptep) 426 pte_t *ptep)
427{ 427{
428 pteval_t ret; 428 pteval_t ret;
429 429
430 ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep); 430 ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep);
431 431
432 return (pte_t) { .pte = ret }; 432 return (pte_t) { .pte = ret };
433} 433}
434 434
435static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 435static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
436 pte_t *ptep, pte_t pte) 436 pte_t *ptep, pte_t old_pte, pte_t pte)
437{ 437{
438
438 if (sizeof(pteval_t) > sizeof(long)) 439 if (sizeof(pteval_t) > sizeof(long))
439 /* 5 arg words */ 440 /* 5 arg words */
440 pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte); 441 pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte);
441 else 442 else
442 PVOP_VCALL4(mmu.ptep_modify_prot_commit, 443 PVOP_VCALL4(mmu.ptep_modify_prot_commit,
443 mm, addr, ptep, pte.pte); 444 vma, addr, ptep, pte.pte);
444} 445}
445 446
446static inline void set_pte(pte_t *ptep, pte_t pte) 447static inline void set_pte(pte_t *ptep, pte_t pte)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 488c59686a73..2474e434a6f7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -55,6 +55,7 @@ struct task_struct;
55struct cpumask; 55struct cpumask;
56struct flush_tlb_info; 56struct flush_tlb_info;
57struct mmu_gather; 57struct mmu_gather;
58struct vm_area_struct;
58 59
59/* 60/*
60 * Wrapper type for pointers to code which uses the non-standard 61 * Wrapper type for pointers to code which uses the non-standard
@@ -254,9 +255,9 @@ struct pv_mmu_ops {
254 pte_t *ptep, pte_t pteval); 255 pte_t *ptep, pte_t pteval);
255 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 256 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
256 257
257 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 258 pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr,
258 pte_t *ptep); 259 pte_t *ptep);
259 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, 260 void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr,
260 pte_t *ptep, pte_t pte); 261 pte_t *ptep, pte_t pte);
261 262
262 struct paravirt_callee_save pte_val; 263 struct paravirt_callee_save pte_val;
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 662963681ea6..e662f987dfa2 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/string.h> 8#include <linux/string.h>
9#include <linux/scatterlist.h> 9#include <linux/scatterlist.h>
10#include <linux/numa.h>
10#include <asm/io.h> 11#include <asm/io.h>
11#include <asm/pat.h> 12#include <asm/pat.h>
12#include <asm/x86_init.h> 13#include <asm/x86_init.h>
@@ -141,7 +142,7 @@ cpumask_of_pcibus(const struct pci_bus *bus)
141 int node; 142 int node;
142 143
143 node = __pcibus_to_node(bus); 144 node = __pcibus_to_node(bus);
144 return (node == -1) ? cpu_online_mask : 145 return (node == NUMA_NO_NODE) ? cpu_online_mask :
145 cpumask_of_node(node); 146 cpumask_of_node(node);
146} 147}
147#endif 148#endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5e49a0acb5ee..62004d22524a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -75,7 +75,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
75#endif 75#endif
76 76
77/** 77/**
78 * access_ok: - Checks if a user space pointer is valid 78 * access_ok - Checks if a user space pointer is valid
79 * @addr: User space pointer to start of block to check 79 * @addr: User space pointer to start of block to check
80 * @size: Size of block to check 80 * @size: Size of block to check
81 * 81 *
@@ -84,12 +84,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
84 * 84 *
85 * Checks if a pointer to a block of memory in user space is valid. 85 * Checks if a pointer to a block of memory in user space is valid.
86 * 86 *
87 * Returns true (nonzero) if the memory block may be valid, false (zero)
88 * if it is definitely invalid.
89 *
90 * Note that, depending on architecture, this function probably just 87 * Note that, depending on architecture, this function probably just
91 * checks that the pointer is in the user space range - after calling 88 * checks that the pointer is in the user space range - after calling
92 * this function, memory access functions may still return -EFAULT. 89 * this function, memory access functions may still return -EFAULT.
90 *
91 * Return: true (nonzero) if the memory block may be valid, false (zero)
92 * if it is definitely invalid.
93 */ 93 */
94#define access_ok(addr, size) \ 94#define access_ok(addr, size) \
95({ \ 95({ \
@@ -134,7 +134,7 @@ extern int __get_user_bad(void);
134__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) 134__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
135 135
136/** 136/**
137 * get_user: - Get a simple variable from user space. 137 * get_user - Get a simple variable from user space.
138 * @x: Variable to store result. 138 * @x: Variable to store result.
139 * @ptr: Source address, in user space. 139 * @ptr: Source address, in user space.
140 * 140 *
@@ -148,7 +148,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
148 * @ptr must have pointer-to-simple-variable type, and the result of 148 * @ptr must have pointer-to-simple-variable type, and the result of
149 * dereferencing @ptr must be assignable to @x without a cast. 149 * dereferencing @ptr must be assignable to @x without a cast.
150 * 150 *
151 * Returns zero on success, or -EFAULT on error. 151 * Return: zero on success, or -EFAULT on error.
152 * On error, the variable @x is set to zero. 152 * On error, the variable @x is set to zero.
153 */ 153 */
154/* 154/*
@@ -226,7 +226,7 @@ extern void __put_user_4(void);
226extern void __put_user_8(void); 226extern void __put_user_8(void);
227 227
228/** 228/**
229 * put_user: - Write a simple value into user space. 229 * put_user - Write a simple value into user space.
230 * @x: Value to copy to user space. 230 * @x: Value to copy to user space.
231 * @ptr: Destination address, in user space. 231 * @ptr: Destination address, in user space.
232 * 232 *
@@ -240,7 +240,7 @@ extern void __put_user_8(void);
240 * @ptr must have pointer-to-simple-variable type, and @x must be assignable 240 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
241 * to the result of dereferencing @ptr. 241 * to the result of dereferencing @ptr.
242 * 242 *
243 * Returns zero on success, or -EFAULT on error. 243 * Return: zero on success, or -EFAULT on error.
244 */ 244 */
245#define put_user(x, ptr) \ 245#define put_user(x, ptr) \
246({ \ 246({ \
@@ -502,7 +502,7 @@ struct __large_struct { unsigned long buf[100]; };
502} while (0) 502} while (0)
503 503
504/** 504/**
505 * __get_user: - Get a simple variable from user space, with less checking. 505 * __get_user - Get a simple variable from user space, with less checking.
506 * @x: Variable to store result. 506 * @x: Variable to store result.
507 * @ptr: Source address, in user space. 507 * @ptr: Source address, in user space.
508 * 508 *
@@ -519,7 +519,7 @@ struct __large_struct { unsigned long buf[100]; };
519 * Caller must check the pointer with access_ok() before calling this 519 * Caller must check the pointer with access_ok() before calling this
520 * function. 520 * function.
521 * 521 *
522 * Returns zero on success, or -EFAULT on error. 522 * Return: zero on success, or -EFAULT on error.
523 * On error, the variable @x is set to zero. 523 * On error, the variable @x is set to zero.
524 */ 524 */
525 525
@@ -527,7 +527,7 @@ struct __large_struct { unsigned long buf[100]; };
527 __get_user_nocheck((x), (ptr), sizeof(*(ptr))) 527 __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
528 528
529/** 529/**
530 * __put_user: - Write a simple value into user space, with less checking. 530 * __put_user - Write a simple value into user space, with less checking.
531 * @x: Value to copy to user space. 531 * @x: Value to copy to user space.
532 * @ptr: Destination address, in user space. 532 * @ptr: Destination address, in user space.
533 * 533 *
@@ -544,7 +544,7 @@ struct __large_struct { unsigned long buf[100]; };
544 * Caller must check the pointer with access_ok() before calling this 544 * Caller must check the pointer with access_ok() before calling this
545 * function. 545 * function.
546 * 546 *
547 * Returns zero on success, or -EFAULT on error. 547 * Return: zero on success, or -EFAULT on error.
548 */ 548 */
549 549
550#define __put_user(x, ptr) \ 550#define __put_user(x, ptr) \
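The uaccess.h hunks (and the usercopy_32.c ones below) are kernel-doc fixes: the trailing colon after the function name is dropped and the free-form "Returns ..." text becomes a structured "Return:" section that the documentation tooling can pick up. A hypothetical helper written in the updated style:

#include <linux/types.h>
#include <linux/uaccess.h>

/**
 * sketch_read_flag - Read one flag byte from user space (hypothetical helper).
 * @dst: Kernel destination for the flag.
 * @src: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * Return: zero on success, or -EFAULT on error.
 */
static int sketch_read_flag(u8 *dst, const u8 __user *src)
{
	return get_user(*dst, src);
}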
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index a555da094157..1e225528f0d7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -27,6 +27,7 @@
27#include <linux/crash_dump.h> 27#include <linux/crash_dump.h>
28#include <linux/reboot.h> 28#include <linux/reboot.h>
29#include <linux/memory.h> 29#include <linux/memory.h>
30#include <linux/numa.h>
30 31
31#include <asm/uv/uv_mmrs.h> 32#include <asm/uv/uv_mmrs.h>
32#include <asm/uv/uv_hub.h> 33#include <asm/uv/uv_hub.h>
@@ -1390,7 +1391,7 @@ static void __init build_socket_tables(void)
1390 } 1391 }
1391 1392
1392 /* Set socket -> node values: */ 1393 /* Set socket -> node values: */
1393 lnid = -1; 1394 lnid = NUMA_NO_NODE;
1394 for_each_present_cpu(cpu) { 1395 for_each_present_cpu(cpu) {
1395 int nid = cpu_to_node(cpu); 1396 int nid = cpu_to_node(cpu);
1396 int apicid, sockid; 1397 int apicid, sockid;
@@ -1521,7 +1522,7 @@ static void __init uv_system_init_hub(void)
1521 new_hub->pnode = 0xffff; 1522 new_hub->pnode = 0xffff;
1522 1523
1523 new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); 1524 new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
1524 new_hub->memory_nid = -1; 1525 new_hub->memory_nid = NUMA_NO_NODE;
1525 new_hub->nr_possible_cpus = 0; 1526 new_hub->nr_possible_cpus = 0;
1526 new_hub->nr_online_cpus = 0; 1527 new_hub->nr_online_cpus = 0;
1527 } 1528 }
@@ -1538,7 +1539,7 @@ static void __init uv_system_init_hub(void)
1538 1539
1539 uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); 1540 uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
1540 uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; 1541 uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
1541 if (uv_cpu_hub_info(cpu)->memory_nid == -1) 1542 if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
1542 uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); 1543 uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
1543 1544
1544 /* Init memoryless node: */ 1545 /* Init memoryless node: */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index e8796fcd7e5a..13af08827eef 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -171,7 +171,7 @@ void __init setup_per_cpu_areas(void)
171 unsigned long delta; 171 unsigned long delta;
172 int rc; 172 int rc;
173 173
174 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", 174 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n",
175 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 175 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
176 176
177 /* 177 /*
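The %d to %u switch here (and in arch/x86/mm/numa.c below) follows nr_node_ids changing from int to unsigned int elsewhere in this series; printing it with %d would now trip the printf-format checks. Assuming that declaration, the usage is simply:

#include <linux/printk.h>
#include <linux/nodemask.h>

static void sketch_report_nodes(void)
{
	/* nr_node_ids is unsigned int after this series, hence %u */
	pr_info("managing %u possible NUMA nodes\n", nr_node_ids);
}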
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ccd1f2a8e557..c91ff9f9fe8a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
56#include <linux/stackprotector.h> 56#include <linux/stackprotector.h>
57#include <linux/gfp.h> 57#include <linux/gfp.h>
58#include <linux/cpuidle.h> 58#include <linux/cpuidle.h>
59#include <linux/numa.h>
59 60
60#include <asm/acpi.h> 61#include <asm/acpi.h>
61#include <asm/desc.h> 62#include <asm/desc.h>
@@ -841,7 +842,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
841/* reduce the number of lines printed when booting a large cpu count system */ 842/* reduce the number of lines printed when booting a large cpu count system */
842static void announce_cpu(int cpu, int apicid) 843static void announce_cpu(int cpu, int apicid)
843{ 844{
844 static int current_node = -1; 845 static int current_node = NUMA_NO_NODE;
845 int node = early_cpu_to_node(cpu); 846 int node = early_cpu_to_node(cpu);
846 static int width, node_width; 847 static int width, node_width;
847 848
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index bfd94e7812fc..7d290777246d 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -54,13 +54,13 @@ do { \
54} while (0) 54} while (0)
55 55
56/** 56/**
57 * clear_user: - Zero a block of memory in user space. 57 * clear_user - Zero a block of memory in user space.
58 * @to: Destination address, in user space. 58 * @to: Destination address, in user space.
59 * @n: Number of bytes to zero. 59 * @n: Number of bytes to zero.
60 * 60 *
61 * Zero a block of memory in user space. 61 * Zero a block of memory in user space.
62 * 62 *
63 * Returns number of bytes that could not be cleared. 63 * Return: number of bytes that could not be cleared.
64 * On success, this will be zero. 64 * On success, this will be zero.
65 */ 65 */
66unsigned long 66unsigned long
@@ -74,14 +74,14 @@ clear_user(void __user *to, unsigned long n)
74EXPORT_SYMBOL(clear_user); 74EXPORT_SYMBOL(clear_user);
75 75
76/** 76/**
77 * __clear_user: - Zero a block of memory in user space, with less checking. 77 * __clear_user - Zero a block of memory in user space, with less checking.
78 * @to: Destination address, in user space. 78 * @to: Destination address, in user space.
79 * @n: Number of bytes to zero. 79 * @n: Number of bytes to zero.
80 * 80 *
81 * Zero a block of memory in user space. Caller must check 81 * Zero a block of memory in user space. Caller must check
82 * the specified block with access_ok() before calling this function. 82 * the specified block with access_ok() before calling this function.
83 * 83 *
84 * Returns number of bytes that could not be cleared. 84 * Return: number of bytes that could not be cleared.
85 * On success, this will be zero. 85 * On success, this will be zero.
86 */ 86 */
87unsigned long 87unsigned long
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..12c1b7a83ed7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -123,7 +123,7 @@ void __init setup_node_to_cpumask_map(void)
123 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 123 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
124 124
125 /* cpumask_of_node() will now work */ 125 /* cpumask_of_node() will now work */
126 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 126 pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
127} 127}
128 128
129static int __init numa_add_memblk_to(int nid, u64 start, u64 end, 129static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
@@ -866,7 +866,7 @@ const struct cpumask *cpumask_of_node(int node)
866{ 866{
867 if (node >= nr_node_ids) { 867 if (node >= nr_node_ids) {
868 printk(KERN_WARNING 868 printk(KERN_WARNING
869 "cpumask_of_node(%d): node > nr_node_ids(%d)\n", 869 "cpumask_of_node(%d): node > nr_node_ids(%u)\n",
870 node, nr_node_ids); 870 node, nr_node_ids);
871 dump_stack(); 871 dump_stack();
872 return cpu_none_mask; 872 return cpu_none_mask;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index a7e47cf7ec6c..6e4c6bd62203 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -17,8 +17,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
17 17
18void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 18void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
19 19
20pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 20pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep);
21void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 21void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
22 pte_t *ptep, pte_t pte); 22 pte_t *ptep, pte_t pte);
23 23
24unsigned long xen_read_cr2_direct(void); 24unsigned long xen_read_cr2_direct(void);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 0f4fe206dcc2..856a85814f00 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -306,20 +306,20 @@ static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
306 __xen_set_pte(ptep, pteval); 306 __xen_set_pte(ptep, pteval);
307} 307}
308 308
309pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 309pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
310 unsigned long addr, pte_t *ptep) 310 unsigned long addr, pte_t *ptep)
311{ 311{
312 /* Just return the pte as-is. We preserve the bits on commit */ 312 /* Just return the pte as-is. We preserve the bits on commit */
313 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); 313 trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep);
314 return *ptep; 314 return *ptep;
315} 315}
316 316
317void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 317void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
318 pte_t *ptep, pte_t pte) 318 pte_t *ptep, pte_t pte)
319{ 319{
320 struct mmu_update u; 320 struct mmu_update u;
321 321
322 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); 322 trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte);
323 xen_mc_batch(); 323 xen_mc_batch();
324 324
325 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 325 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 88e8440e75c3..2f3ee4d6af82 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -40,6 +40,7 @@
40#include <linux/export.h> 40#include <linux/export.h>
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/prefetch.h> 42#include <linux/prefetch.h>
43#include <linux/numa.h>
43#include "mtip32xx.h" 44#include "mtip32xx.h"
44 45
45#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) 46#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
@@ -4018,9 +4019,9 @@ static int get_least_used_cpu_on_node(int node)
4018/* Helper for selecting a node in round robin mode */ 4019/* Helper for selecting a node in round robin mode */
4019static inline int mtip_get_next_rr_node(void) 4020static inline int mtip_get_next_rr_node(void)
4020{ 4021{
4021 static int next_node = -1; 4022 static int next_node = NUMA_NO_NODE;
4022 4023
4023 if (next_node == -1) { 4024 if (next_node == NUMA_NO_NODE) {
4024 next_node = first_online_node; 4025 next_node = first_online_node;
4025 return next_node; 4026 return next_node;
4026 } 4027 }
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c
index 7f88490b5479..c53f0f9ef5b0 100644
--- a/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@ -163,7 +163,6 @@ static int efficeon_free_gatt_table(struct agp_bridge_data *bridge)
163 unsigned long page = efficeon_private.l1_table[index]; 163 unsigned long page = efficeon_private.l1_table[index];
164 if (page) { 164 if (page) {
165 efficeon_private.l1_table[index] = 0; 165 efficeon_private.l1_table[index] = 0;
166 ClearPageReserved(virt_to_page((char *)page));
167 free_page(page); 166 free_page(page);
168 freed++; 167 freed++;
169 } 168 }
@@ -219,7 +218,6 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge)
219 efficeon_free_gatt_table(agp_bridge); 218 efficeon_free_gatt_table(agp_bridge);
220 return -ENOMEM; 219 return -ENOMEM;
221 } 220 }
222 SetPageReserved(virt_to_page((char *)page));
223 221
224 for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk) 222 for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk)
225 clflush((char *)page+offset); 223 clflush((char *)page+offset);
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index f1a441ab395d..3a11b1092e80 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -63,6 +63,7 @@
63#include <linux/acpi_dma.h> 63#include <linux/acpi_dma.h>
64#include <linux/of_dma.h> 64#include <linux/of_dma.h>
65#include <linux/mempool.h> 65#include <linux/mempool.h>
66#include <linux/numa.h>
66 67
67static DEFINE_MUTEX(dma_list_mutex); 68static DEFINE_MUTEX(dma_list_mutex);
68static DEFINE_IDA(dma_ida); 69static DEFINE_IDA(dma_ida);
@@ -386,7 +387,8 @@ EXPORT_SYMBOL(dma_issue_pending_all);
386static bool dma_chan_is_local(struct dma_chan *chan, int cpu) 387static bool dma_chan_is_local(struct dma_chan *chan, int cpu)
387{ 388{
388 int node = dev_to_node(chan->device->dev); 389 int node = dev_to_node(chan->device->dev);
389 return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node)); 390 return node == NUMA_NO_NODE ||
391 cpumask_test_cpu(cpu, cpumask_of_node(node));
390} 392}
391 393
392/** 394/**
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 9726df37c4c4..540e20eb032c 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -123,12 +123,6 @@ static inline u64 ptr_to_u64(const void *ptr)
123 123
124#include <linux/list.h> 124#include <linux/list.h>
125 125
126static inline int list_is_first(const struct list_head *list,
127 const struct list_head *head)
128{
129 return head->next == list;
130}
131
132static inline void __list_del_many(struct list_head *head, 126static inline void __list_del_many(struct list_head *head,
133 struct list_head *first) 127 struct list_head *first)
134{ 128{
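The driver-local list_is_first() copy is removed here because an equivalent generic helper is available from <linux/list.h> as of this series, so i915 callers can use it directly. A trivial sketch with hypothetical names:

#include <linux/list.h>
#include <linux/types.h>

static bool sketch_is_oldest_request(const struct list_head *rq_link,
				     const struct list_head *timeline)
{
	/* true when rq_link is the first entry on the timeline list */
	return list_is_first(rq_link, timeline);
}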
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 7c6349a50ef1..dd475f3bcc8a 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -681,8 +681,13 @@ static struct notifier_block hv_memory_nb = {
681/* Check if the particular page is backed and can be onlined and online it. */ 681/* Check if the particular page is backed and can be onlined and online it. */
682static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) 682static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
683{ 683{
684 if (!has_pfn_is_backed(has, page_to_pfn(pg))) 684 if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
685 if (!PageOffline(pg))
686 __SetPageOffline(pg);
685 return; 687 return;
688 }
689 if (PageOffline(pg))
690 __ClearPageOffline(pg);
686 691
687 /* This frame is currently backed; online the page. */ 692 /* This frame is currently backed; online the page. */
688 __online_page_set_limits(pg); 693 __online_page_set_limits(pg);
@@ -771,7 +776,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
771 } 776 }
772} 777}
773 778
774static void hv_online_page(struct page *pg) 779static void hv_online_page(struct page *pg, unsigned int order)
775{ 780{
776 struct hv_hotadd_state *has; 781 struct hv_hotadd_state *has;
777 unsigned long flags; 782 unsigned long flags;
@@ -780,10 +785,11 @@ static void hv_online_page(struct page *pg)
780 spin_lock_irqsave(&dm_device.ha_lock, flags); 785 spin_lock_irqsave(&dm_device.ha_lock, flags);
781 list_for_each_entry(has, &dm_device.ha_region_list, list) { 786 list_for_each_entry(has, &dm_device.ha_region_list, list) {
782 /* The page belongs to a different HAS. */ 787 /* The page belongs to a different HAS. */
783 if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) 788 if ((pfn < has->start_pfn) ||
789 (pfn + (1UL << order) > has->end_pfn))
784 continue; 790 continue;
785 791
786 hv_page_online_one(has, pg); 792 hv_bring_pgs_online(has, pfn, 1UL << order);
787 break; 793 break;
788 } 794 }
789 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 795 spin_unlock_irqrestore(&dm_device.ha_lock, flags);
@@ -1201,6 +1207,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm,
1201 1207
1202 for (i = 0; i < num_pages; i++) { 1208 for (i = 0; i < num_pages; i++) {
1203 pg = pfn_to_page(i + start_frame); 1209 pg = pfn_to_page(i + start_frame);
1210 __ClearPageOffline(pg);
1204 __free_page(pg); 1211 __free_page(pg);
1205 dm->num_pages_ballooned--; 1212 dm->num_pages_ballooned--;
1206 } 1213 }
@@ -1213,7 +1220,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
1213 struct dm_balloon_response *bl_resp, 1220 struct dm_balloon_response *bl_resp,
1214 int alloc_unit) 1221 int alloc_unit)
1215{ 1222{
1216 unsigned int i = 0; 1223 unsigned int i, j;
1217 struct page *pg; 1224 struct page *pg;
1218 1225
1219 if (num_pages < alloc_unit) 1226 if (num_pages < alloc_unit)
@@ -1245,6 +1252,10 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
1245 if (alloc_unit != 1) 1252 if (alloc_unit != 1)
1246 split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); 1253 split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
1247 1254
1255 /* mark all pages offline */
1256 for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
1257 __SetPageOffline(pg + j);
1258
1248 bl_resp->range_count++; 1259 bl_resp->range_count++;
1249 bl_resp->range_array[i].finfo.start_page = 1260 bl_resp->range_array[i].finfo.start_page =
1250 page_to_pfn(pg); 1261 page_to_pfn(pg);
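Two related changes meet in this hunk: memory-hotplug online callbacks now receive a page order rather than a single page, and ballooned-out ranges are tagged with the new PageOffline marker so that dump tools can skip them. A minimal sketch of such a callback, with hypothetical names and the actual page-freeing step left as a comment:

#include <linux/mm.h>
#include <linux/page-flags.h>

static void sketch_online_pages(struct page *page, unsigned int order)
{
	unsigned long i, nr = 1UL << order;

	for (i = 0; i < nr; i++) {
		struct page *pg = page + i;

		if (PageOffline(pg))
			__ClearPageOffline(pg);	/* page is backed again */
		/* the real drivers then hand pg back to the buddy allocator */
	}
}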
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 2baf38cc1e23..4fe662c3bbc1 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -48,6 +48,7 @@
48#include <linux/cpumask.h> 48#include <linux/cpumask.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/interrupt.h> 50#include <linux/interrupt.h>
51#include <linux/numa.h>
51 52
52#include "hfi.h" 53#include "hfi.h"
53#include "affinity.h" 54#include "affinity.h"
@@ -777,7 +778,7 @@ void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
777 _dev_comp_vect_cpu_mask_clean_up(dd, entry); 778 _dev_comp_vect_cpu_mask_clean_up(dd, entry);
778unlock: 779unlock:
779 mutex_unlock(&node_affinity.lock); 780 mutex_unlock(&node_affinity.lock);
780 dd->node = -1; 781 dd->node = NUMA_NO_NODE;
781} 782}
782 783
783/* 784/*
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 7835eb52e7c5..441b06e2a154 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -54,6 +54,7 @@
54#include <linux/printk.h> 54#include <linux/printk.h>
55#include <linux/hrtimer.h> 55#include <linux/hrtimer.h>
56#include <linux/bitmap.h> 56#include <linux/bitmap.h>
57#include <linux/numa.h>
57#include <rdma/rdma_vt.h> 58#include <rdma/rdma_vt.h>
58 59
59#include "hfi.h" 60#include "hfi.h"
@@ -1303,7 +1304,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
1303 dd->unit = ret; 1304 dd->unit = ret;
1304 list_add(&dd->list, &hfi1_dev_list); 1305 list_add(&dd->list, &hfi1_dev_list);
1305 } 1306 }
1306 dd->node = -1; 1307 dd->node = NUMA_NO_NODE;
1307 1308
1308 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1309 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
1309 idr_preload_end(); 1310 idr_preload_end();
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 58dc70bffd5b..9c49300e9fb7 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -39,6 +39,7 @@
39#include <linux/dmi.h> 39#include <linux/dmi.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/iommu.h> 41#include <linux/iommu.h>
42#include <linux/numa.h>
42#include <asm/irq_remapping.h> 43#include <asm/irq_remapping.h>
43#include <asm/iommu_table.h> 44#include <asm/iommu_table.h>
44 45
@@ -477,7 +478,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
477 int node = acpi_map_pxm_to_node(rhsa->proximity_domain); 478 int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
478 479
479 if (!node_online(node)) 480 if (!node_online(node))
480 node = -1; 481 node = NUMA_NO_NODE;
481 drhd->iommu->node = node; 482 drhd->iommu->node = node;
482 return 0; 483 return 0;
483 } 484 }
@@ -1062,7 +1063,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
1062 iommu->msagaw = msagaw; 1063 iommu->msagaw = msagaw;
1063 iommu->segment = drhd->segment; 1064 iommu->segment = drhd->segment;
1064 1065
1065 iommu->node = -1; 1066 iommu->node = NUMA_NO_NODE;
1066 1067
1067 ver = readl(iommu->reg + DMAR_VER_REG); 1068 ver = readl(iommu->reg + DMAR_VER_REG);
1068 pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", 1069 pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 78188bf7e90d..39a33dec4d0b 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -47,6 +47,7 @@
47#include <linux/dma-contiguous.h> 47#include <linux/dma-contiguous.h>
48#include <linux/dma-direct.h> 48#include <linux/dma-direct.h>
49#include <linux/crash_dump.h> 49#include <linux/crash_dump.h>
50#include <linux/numa.h>
50#include <asm/irq_remapping.h> 51#include <asm/irq_remapping.h>
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/iommu.h> 53#include <asm/iommu.h>
@@ -1716,7 +1717,7 @@ static struct dmar_domain *alloc_domain(int flags)
1716 return NULL; 1717 return NULL;
1717 1718
1718 memset(domain, 0, sizeof(*domain)); 1719 memset(domain, 0, sizeof(*domain));
1719 domain->nid = -1; 1720 domain->nid = NUMA_NO_NODE;
1720 domain->flags = flags; 1721 domain->flags = flags;
1721 domain->has_iotlb_device = false; 1722 domain->has_iotlb_device = false;
1722 INIT_LIST_HEAD(&domain->devices); 1723 INIT_LIST_HEAD(&domain->devices);
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index 0441abe87880..9e443df44b3b 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -22,6 +22,7 @@
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/err.h> 23#include <linux/err.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/numa.h>
25#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
26#if defined CONFIG_X86_64 27#if defined CONFIG_X86_64
27#include <asm/uv/bios.h> 28#include <asm/uv/bios.h>
@@ -61,7 +62,7 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
61 XPC_NOTIFY_MSG_SIZE_UV) 62 XPC_NOTIFY_MSG_SIZE_UV)
62#define XPC_NOTIFY_IRQ_NAME "xpc_notify" 63#define XPC_NOTIFY_IRQ_NAME "xpc_notify"
63 64
64static int xpc_mq_node = -1; 65static int xpc_mq_node = NUMA_NO_NODE;
65 66
66static struct xpc_gru_mq_uv *xpc_activate_mq_uv; 67static struct xpc_gru_mq_uv *xpc_activate_mq_uv;
67static struct xpc_gru_mq_uv *xpc_notify_mq_uv; 68static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index f8240b87df22..869ec842729e 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -557,6 +557,36 @@ vmballoon_page_in_frames(enum vmballoon_page_size_type page_size)
557} 557}
558 558
559/** 559/**
560 * vmballoon_mark_page_offline() - mark a page as offline
561 * @page: pointer for the page.
562 * @page_size: the size of the page.
563 */
564static void
565vmballoon_mark_page_offline(struct page *page,
566 enum vmballoon_page_size_type page_size)
567{
568 int i;
569
570 for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
571 __SetPageOffline(page + i);
572}
573
574/**
575 * vmballoon_mark_page_online() - mark a page as online
576 * @page: pointer for the page.
577 * @page_size: the size of the page.
578 */
579static void
580vmballoon_mark_page_online(struct page *page,
581 enum vmballoon_page_size_type page_size)
582{
583 int i;
584
585 for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
586 __ClearPageOffline(page + i);
587}
588
589/**
560 * vmballoon_send_get_target() - Retrieve desired balloon size from the host. 590 * vmballoon_send_get_target() - Retrieve desired balloon size from the host.
561 * 591 *
562 * @b: pointer to the balloon. 592 * @b: pointer to the balloon.
@@ -612,6 +642,7 @@ static int vmballoon_alloc_page_list(struct vmballoon *b,
612 ctl->page_size); 642 ctl->page_size);
613 643
614 if (page) { 644 if (page) {
645 vmballoon_mark_page_offline(page, ctl->page_size);
615 /* Success. Add the page to the list and continue. */ 646 /* Success. Add the page to the list and continue. */
616 list_add(&page->lru, &ctl->pages); 647 list_add(&page->lru, &ctl->pages);
617 continue; 648 continue;
@@ -850,6 +881,7 @@ static void vmballoon_release_page_list(struct list_head *page_list,
850 881
851 list_for_each_entry_safe(page, tmp, page_list, lru) { 882 list_for_each_entry_safe(page, tmp, page_list, lru) {
852 list_del(&page->lru); 883 list_del(&page->lru);
884 vmballoon_mark_page_online(page, page_size);
853 __free_pages(page, vmballoon_page_order(page_size)); 885 __free_pages(page, vmballoon_page_order(page_size));
854 } 886 }
855 887
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a4e7584a50cb..e100054a3765 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -27,6 +27,7 @@
27#include <linux/bpf.h> 27#include <linux/bpf.h>
28#include <linux/bpf_trace.h> 28#include <linux/bpf_trace.h>
29#include <linux/atomic.h> 29#include <linux/atomic.h>
30#include <linux/numa.h>
30#include <scsi/fc/fc_fcoe.h> 31#include <scsi/fc/fc_fcoe.h>
31#include <net/udp_tunnel.h> 32#include <net/udp_tunnel.h>
32#include <net/pkt_cls.h> 33#include <net/pkt_cls.h>
@@ -6418,7 +6419,7 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring)
6418{ 6419{
6419 struct device *dev = tx_ring->dev; 6420 struct device *dev = tx_ring->dev;
6420 int orig_node = dev_to_node(dev); 6421 int orig_node = dev_to_node(dev);
6421 int ring_node = -1; 6422 int ring_node = NUMA_NO_NODE;
6422 int size; 6423 int size;
6423 6424
6424 size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count; 6425 size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count;
@@ -6512,7 +6513,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
6512{ 6513{
6513 struct device *dev = rx_ring->dev; 6514 struct device *dev = rx_ring->dev;
6514 int orig_node = dev_to_node(dev); 6515 int orig_node = dev_to_node(dev);
6515 int ring_node = -1; 6516 int ring_node = NUMA_NO_NODE;
6516 int size; 6517 int size;
6517 6518
6518 size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; 6519 size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index ceb5048de9a7..39b229f9e256 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -369,14 +369,20 @@ static enum bp_state reserve_additional_memory(void)
369 return BP_ECANCELED; 369 return BP_ECANCELED;
370} 370}
371 371
372static void xen_online_page(struct page *page) 372static void xen_online_page(struct page *page, unsigned int order)
373{ 373{
374 __online_page_set_limits(page); 374 unsigned long i, size = (1 << order);
375 unsigned long start_pfn = page_to_pfn(page);
376 struct page *p;
375 377
378 pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
376 mutex_lock(&balloon_mutex); 379 mutex_lock(&balloon_mutex);
377 380 for (i = 0; i < size; i++) {
378 __balloon_append(page); 381 p = pfn_to_page(start_pfn + i);
379 382 __online_page_set_limits(p);
383 __SetPageOffline(p);
384 __balloon_append(p);
385 }
380 mutex_unlock(&balloon_mutex); 386 mutex_unlock(&balloon_mutex);
381} 387}
382 388
@@ -441,6 +447,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
441 xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); 447 xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]);
442 448
443 /* Relinquish the page back to the allocator. */ 449 /* Relinquish the page back to the allocator. */
450 __ClearPageOffline(page);
444 free_reserved_page(page); 451 free_reserved_page(page);
445 } 452 }
446 453
@@ -467,6 +474,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
467 state = BP_EAGAIN; 474 state = BP_EAGAIN;
468 break; 475 break;
469 } 476 }
477 __SetPageOffline(page);
470 adjust_managed_page_count(page, -1); 478 adjust_managed_page_count(page, -1);
471 xenmem_reservation_scrub_page(page); 479 xenmem_reservation_scrub_page(page);
472 list_add(&page->lru, &pages); 480 list_add(&page->lru, &pages);
diff --git a/fs/file.c b/fs/file.c
index 3209ee271c41..a10487aa0a84 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -457,6 +457,7 @@ struct files_struct init_files = {
457 .full_fds_bits = init_files.full_fds_bits_init, 457 .full_fds_bits = init_files.full_fds_bits_init,
458 }, 458 },
459 .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), 459 .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
460 .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
460}; 461};
461 462
462static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) 463static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a7fa037b876b..b0eef008de67 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
530 inode_lock(inode); 530 inode_lock(inode);
531 531
532 /* protected by i_mutex */ 532 /* protected by i_mutex */
533 if (info->seals & F_SEAL_WRITE) { 533 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
534 inode_unlock(inode); 534 inode_unlock(inode);
535 return -EPERM; 535 return -EPERM;
536 } 536 }
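Hole punching on a sealed hugetlbfs file now also honours F_SEAL_FUTURE_WRITE, the memfd seal introduced in this cycle that forbids new write access while leaving existing writable mappings alone. A user-space sketch, assuming a kernel from this cycle; the seal value is taken from the uapi header in case libc does not define it yet:

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010	/* from include/uapi/linux/fcntl.h */
#endif

int main(void)
{
	int fd = memfd_create("sealed-huge", MFD_ALLOW_SEALING | MFD_HUGETLB);

	if (fd < 0 || ftruncate(fd, 2UL << 20) < 0) {
		perror("memfd_create/ftruncate");
		return 1;
	}
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
		perror("F_ADD_SEALS");
	/* fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, ...) now
	 * fails with EPERM, matching the check added above. */
	close(fd);
	return 0;
}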
diff --git a/fs/inode.c b/fs/inode.c
index 73432e64f874..e9d97add2b36 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait);
2093void inode_set_flags(struct inode *inode, unsigned int flags, 2093void inode_set_flags(struct inode *inode, unsigned int flags,
2094 unsigned int mask) 2094 unsigned int mask)
2095{ 2095{
2096 unsigned int old_flags, new_flags;
2097
2098 WARN_ON_ONCE(flags & ~mask); 2096 WARN_ON_ONCE(flags & ~mask);
2099 do { 2097 set_mask_bits(&inode->i_flags, mask, flags);
2100 old_flags = READ_ONCE(inode->i_flags);
2101 new_flags = (old_flags & ~mask) | flags;
2102 } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
2103 new_flags) != old_flags));
2104} 2098}
2105EXPORT_SYMBOL(inode_set_flags); 2099EXPORT_SYMBOL(inode_set_flags);
2106 2100
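The open-coded cmpxchg loop is replaced by set_mask_bits() from <linux/bitops.h>, which atomically rewrites *ptr to (*ptr & ~mask) | bits. A sketch of the same idiom behind a hypothetical wrapper:

#include <linux/bitops.h>
#include <linux/bug.h>

static void sketch_set_flags(unsigned int *flags, unsigned int value,
			     unsigned int mask)
{
	WARN_ON_ONCE(value & ~mask);		/* value must fit inside mask */
	set_mask_bits(flags, mask, value);	/* atomic read-modify-write */
}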
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f8d5021a652e..ae948aaa4c53 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
832 * to see if it supports poll (Neither 'poll' nor 'select' return 832 * to see if it supports poll (Neither 'poll' nor 'select' return
833 * an appropriate error code). When in doubt, set a suitable timeout value. 833 * an appropriate error code). When in doubt, set a suitable timeout value.
834 */ 834 */
835__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
836{
837 struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
838 struct kernfs_open_node *on = kn->attr.open;
839
840 poll_wait(of->file, &on->poll, wait);
841
842 if (of->event != atomic_read(&on->event))
843 return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
844
845 return DEFAULT_POLLMASK;
846}
847
835static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) 848static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
836{ 849{
837 struct kernfs_open_file *of = kernfs_of(filp); 850 struct kernfs_open_file *of = kernfs_of(filp);
838 struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); 851 struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
839 struct kernfs_open_node *on = kn->attr.open; 852 __poll_t ret;
840 853
841 if (!kernfs_get_active(kn)) 854 if (!kernfs_get_active(kn))
842 goto trigger; 855 return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
843 856
844 poll_wait(filp, &on->poll, wait); 857 if (kn->attr.ops->poll)
858 ret = kn->attr.ops->poll(of, wait);
859 else
860 ret = kernfs_generic_poll(of, wait);
845 861
846 kernfs_put_active(kn); 862 kernfs_put_active(kn);
847 863 return ret;
848 if (of->event != atomic_read(&on->event))
849 goto trigger;
850
851 return DEFAULT_POLLMASK;
852
853 trigger:
854 return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
855} 864}
856 865
857static void kernfs_notify_workfn(struct work_struct *work) 866static void kernfs_notify_workfn(struct work_struct *work)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1cbb27808e2..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
7532 return count; 7532 return count;
7533} 7533}
7534 7534
7535int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) 7535static
7536int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
7536{ 7537{
7537 struct ocfs2_super *osb = OCFS2_SB(sb); 7538 struct ocfs2_super *osb = OCFS2_SB(sb);
7538 u64 start, len, trimmed, first_group, last_group, group; 7539 u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
7539 int ret, cnt; 7540 int ret, cnt;
7540 u32 first_bit, last_bit, minlen; 7541 u32 first_bit, last_bit, minlen;
7541 struct buffer_head *main_bm_bh = NULL; 7542 struct buffer_head *main_bm_bh = NULL;
@@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7543 struct buffer_head *gd_bh = NULL; 7544 struct buffer_head *gd_bh = NULL;
7544 struct ocfs2_dinode *main_bm; 7545 struct ocfs2_dinode *main_bm;
7545 struct ocfs2_group_desc *gd = NULL; 7546 struct ocfs2_group_desc *gd = NULL;
7546 struct ocfs2_trim_fs_info info, *pinfo = NULL;
7547 7547
7548 start = range->start >> osb->s_clustersize_bits; 7548 start = range->start >> osb->s_clustersize_bits;
7549 len = range->len >> osb->s_clustersize_bits; 7549 len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7552 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) 7552 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7553 return -EINVAL; 7553 return -EINVAL;
7554 7554
7555 trace_ocfs2_trim_mainbm(start, len, minlen);
7556
7557next_group:
7555 main_bm_inode = ocfs2_get_system_file_inode(osb, 7558 main_bm_inode = ocfs2_get_system_file_inode(osb,
7556 GLOBAL_BITMAP_SYSTEM_INODE, 7559 GLOBAL_BITMAP_SYSTEM_INODE,
7557 OCFS2_INVALID_SLOT); 7560 OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7570 } 7573 }
7571 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; 7574 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7572 7575
7573 if (start >= le32_to_cpu(main_bm->i_clusters)) { 7576 /*
7574 ret = -EINVAL; 7577 * Do some check before trim the first group.
7575 goto out_unlock; 7578 */
7576 } 7579 if (!group) {
7577 7580 if (start >= le32_to_cpu(main_bm->i_clusters)) {
7578 len = range->len >> osb->s_clustersize_bits; 7581 ret = -EINVAL;
7579 if (start + len > le32_to_cpu(main_bm->i_clusters))
7580 len = le32_to_cpu(main_bm->i_clusters) - start;
7581
7582 trace_ocfs2_trim_fs(start, len, minlen);
7583
7584 ocfs2_trim_fs_lock_res_init(osb);
7585 ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7586 if (ret < 0) {
7587 if (ret != -EAGAIN) {
7588 mlog_errno(ret);
7589 ocfs2_trim_fs_lock_res_uninit(osb);
7590 goto out_unlock; 7582 goto out_unlock;
7591 } 7583 }
7592 7584
7593 mlog(ML_NOTICE, "Wait for trim on device (%s) to " 7585 if (start + len > le32_to_cpu(main_bm->i_clusters))
7594 "finish, which is running from another node.\n", 7586 len = le32_to_cpu(main_bm->i_clusters) - start;
7595 osb->dev_str);
7596 ret = ocfs2_trim_fs_lock(osb, &info, 0);
7597 if (ret < 0) {
7598 mlog_errno(ret);
7599 ocfs2_trim_fs_lock_res_uninit(osb);
7600 goto out_unlock;
7601 }
7602 7587
7603 if (info.tf_valid && info.tf_success && 7588 /*
7604 info.tf_start == start && info.tf_len == len && 7589 * Determine first and last group to examine based on
7605 info.tf_minlen == minlen) { 7590 * start and len
7606 /* Avoid sending duplicated trim to a shared device */ 7591 */
7607 mlog(ML_NOTICE, "The same trim on device (%s) was " 7592 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7608 "just done from node (%u), return.\n", 7593 if (first_group == osb->first_cluster_group_blkno)
7609 osb->dev_str, info.tf_nodenum); 7594 first_bit = start;
7610 range->len = info.tf_trimlen; 7595 else
7611 goto out_trimunlock; 7596 first_bit = start - ocfs2_blocks_to_clusters(sb,
7612 } 7597 first_group);
7598 last_group = ocfs2_which_cluster_group(main_bm_inode,
7599 start + len - 1);
7600 group = first_group;
7613 } 7601 }
7614 7602
7615 info.tf_nodenum = osb->node_num; 7603 do {
7616 info.tf_start = start;
7617 info.tf_len = len;
7618 info.tf_minlen = minlen;
7619
7620 /* Determine first and last group to examine based on start and len */
7621 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7622 if (first_group == osb->first_cluster_group_blkno)
7623 first_bit = start;
7624 else
7625 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7626 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7627 last_bit = osb->bitmap_cpg;
7628
7629 trimmed = 0;
7630 for (group = first_group; group <= last_group;) {
7631 if (first_bit + len >= osb->bitmap_cpg) 7604 if (first_bit + len >= osb->bitmap_cpg)
7632 last_bit = osb->bitmap_cpg; 7605 last_bit = osb->bitmap_cpg;
7633 else 7606 else
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7659 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); 7632 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7660 else 7633 else
7661 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); 7634 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7662 } 7635 } while (0);
7663 range->len = trimmed * sb->s_blocksize;
7664 7636
7665 info.tf_trimlen = range->len;
7666 info.tf_success = (ret ? 0 : 1);
7667 pinfo = &info;
7668out_trimunlock:
7669 ocfs2_trim_fs_unlock(osb, pinfo);
7670 ocfs2_trim_fs_lock_res_uninit(osb);
7671out_unlock: 7637out_unlock:
7672 ocfs2_inode_unlock(main_bm_inode, 0); 7638 ocfs2_inode_unlock(main_bm_inode, 0);
7673 brelse(main_bm_bh); 7639 brelse(main_bm_bh);
7640 main_bm_bh = NULL;
7674out_mutex: 7641out_mutex:
7675 inode_unlock(main_bm_inode); 7642 inode_unlock(main_bm_inode);
7676 iput(main_bm_inode); 7643 iput(main_bm_inode);
7644
7645 /*
7646 * If all the groups trim are not done or failed, but we should release
7647 * main_bm related locks for avoiding the current IO starve, then go to
7648 * trim the next group
7649 */
7650 if (ret >= 0 && group <= last_group)
7651 goto next_group;
7677out: 7652out:
7653 range->len = trimmed * sb->s_blocksize;
7654 return ret;
7655}
7656
7657int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7658{
7659 int ret;
7660 struct ocfs2_super *osb = OCFS2_SB(sb);
7661 struct ocfs2_trim_fs_info info, *pinfo = NULL;
7662
7663 ocfs2_trim_fs_lock_res_init(osb);
7664
7665 trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
7666
7667 ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7668 if (ret < 0) {
7669 if (ret != -EAGAIN) {
7670 mlog_errno(ret);
7671 ocfs2_trim_fs_lock_res_uninit(osb);
7672 return ret;
7673 }
7674
7675 mlog(ML_NOTICE, "Wait for trim on device (%s) to "
7676 "finish, which is running from another node.\n",
7677 osb->dev_str);
7678 ret = ocfs2_trim_fs_lock(osb, &info, 0);
7679 if (ret < 0) {
7680 mlog_errno(ret);
7681 ocfs2_trim_fs_lock_res_uninit(osb);
7682 return ret;
7683 }
7684
7685 if (info.tf_valid && info.tf_success &&
7686 info.tf_start == range->start &&
7687 info.tf_len == range->len &&
7688 info.tf_minlen == range->minlen) {
7689 /* Avoid sending duplicated trim to a shared device */
7690 mlog(ML_NOTICE, "The same trim on device (%s) was "
7691 "just done from node (%u), return.\n",
7692 osb->dev_str, info.tf_nodenum);
7693 range->len = info.tf_trimlen;
7694 goto out;
7695 }
7696 }
7697
7698 info.tf_nodenum = osb->node_num;
7699 info.tf_start = range->start;
7700 info.tf_len = range->len;
7701 info.tf_minlen = range->minlen;
7702
7703 ret = ocfs2_trim_mainbm(sb, range);
7704
7705 info.tf_trimlen = range->len;
7706 info.tf_success = (ret < 0 ? 0 : 1);
7707 pinfo = &info;
7708out:
7709 ocfs2_trim_fs_unlock(osb, pinfo);
7710 ocfs2_trim_fs_lock_res_uninit(osb);
7678 return ret; 7711 return ret;
7679} 7712}
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166cc23a0..4ac775e32240 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
621 struct o2nm_node *node = to_o2nm_node(item); 621 struct o2nm_node *node = to_o2nm_node(item);
622 struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); 622 struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
623 623
624 o2net_disconnect_node(node); 624 if (cluster->cl_nodes[node->nd_num] == node) {
625 o2net_disconnect_node(node);
625 626
626 if (cluster->cl_has_local && 627 if (cluster->cl_has_local &&
627 (cluster->cl_local_node == node->nd_num)) { 628 (cluster->cl_local_node == node->nd_num)) {
628 cluster->cl_has_local = 0; 629 cluster->cl_has_local = 0;
629 cluster->cl_local_node = O2NM_INVALID_NODE_NUM; 630 cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
630 o2net_stop_listening(node); 631 o2net_stop_listening(node);
632 }
631 } 633 }
632 634
633 /* XXX call into net to stop this node from trading messages */ 635 /* XXX call into net to stop this node from trading messages */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7c835824247e..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
 {
 	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
 
+	/* Only one trimfs thread are allowed to work at the same time. */
+	mutex_lock(&osb->obs_trim_fs_mutex);
+
 	ocfs2_lock_res_init_once(lockres);
 	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
 	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
 
 	ocfs2_simple_drop_lockres(osb, lockres);
 	ocfs2_lock_res_free(lockres);
+
+	mutex_unlock(&osb->obs_trim_fs_mutex);
 }
 
 static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@ struct ocfs2_super
 	struct ocfs2_lock_res osb_rename_lockres;
 	struct ocfs2_lock_res osb_nfs_sync_lockres;
 	struct ocfs2_lock_res osb_trim_fs_lockres;
+	struct mutex obs_trim_fs_mutex;
 	struct ocfs2_dlm_debug *osb_dlm_debug;
 
 	struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
 
 DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
 
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
 DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
 
 /* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d7407994f308..ea0756d83250 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -55,7 +55,7 @@ struct ocfs2_slot_info {
 	unsigned int si_blocks;
 	struct buffer_head **si_bh;
 	unsigned int si_num_slots;
-	struct ocfs2_slot *si_slots;
+	struct ocfs2_slot si_slots[];
 };
 
 
@@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 	struct inode *inode = NULL;
 	struct ocfs2_slot_info *si;
 
-	si = kzalloc(sizeof(struct ocfs2_slot_info) +
-		     (sizeof(struct ocfs2_slot) * osb->max_slots),
-		     GFP_KERNEL);
+	si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
 	if (!si) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 
 	si->si_extended = ocfs2_uses_extended_slot_map(osb);
 	si->si_num_slots = osb->max_slots;
-	si->si_slots = (struct ocfs2_slot *)((char *)si +
-					     sizeof(struct ocfs2_slot_info));
 
 	inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
 					    OCFS2_INVALID_SLOT);
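The slot_map.c hunk above replaces an open-coded "header plus array" size calculation with a flexible array member and struct_size(). A minimal standalone sketch of that allocation pattern, with hypothetical demo_* names (not ocfs2 code), assuming <linux/overflow.h> provides struct_size() as in this kernel:

	/* Sketch: allocate a header plus N trailing elements in one kzalloc(). */
	#include <linux/overflow.h>
	#include <linux/slab.h>

	struct demo_slot {
		int owner;
	};

	struct demo_slot_info {
		unsigned int num_slots;
		struct demo_slot slots[];	/* flexible array member, last field */
	};

	static struct demo_slot_info *demo_alloc(unsigned int n)
	{
		/* struct_size() == sizeof(*si) + n * sizeof(si->slots[0]), with overflow checking. */
		struct demo_slot_info *si = kzalloc(struct_size(si, slots, n), GFP_KERNEL);

		if (si)
			si->num_slots = n;
		return si;
	}

Compared with the removed code, this also drops the manual pointer fixup into the trailing region, since slots[] already names it.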
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	if (ocfs2_is_hard_readonly(osb))
 		goto leave;
 
+	mutex_init(&osb->obs_trim_fs_mutex);
+
 	status = ocfs2_dlm_init(osb);
 	if (status < 0) {
 		mlog_errno(status);
diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..51d5fd8840ab 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 	struct page *page = buf->page;
 
 	if (page_count(page) == 1) {
-		if (memcg_kmem_enabled())
-			memcg_kmem_uncharge(page, 0);
+		memcg_kmem_uncharge(page, 0);
 		__SetPageLocked(page);
 		return 0;
 	}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9d428d5a0ac8..2edbb657f859 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
 #endif
-	seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+	seq_puts(m, "\nSpeculation_Store_Bypass:\t");
 	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
 	case -EINVAL:
-		seq_printf(m, "unknown");
+		seq_puts(m, "unknown");
 		break;
 	case PR_SPEC_NOT_AFFECTED:
-		seq_printf(m, "not vulnerable");
+		seq_puts(m, "not vulnerable");
 		break;
 	case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
-		seq_printf(m, "thread force mitigated");
+		seq_puts(m, "thread force mitigated");
 		break;
 	case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
-		seq_printf(m, "thread mitigated");
+		seq_puts(m, "thread mitigated");
 		break;
 	case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
-		seq_printf(m, "thread vulnerable");
+		seq_puts(m, "thread vulnerable");
 		break;
 	case PR_SPEC_DISABLE:
-		seq_printf(m, "globally mitigated");
+		seq_puts(m, "globally mitigated");
 		break;
 	default:
-		seq_printf(m, "vulnerable");
+		seq_puts(m, "vulnerable");
 		break;
 	}
 	seq_putc(m, '\n');
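For context, the seq_printf()-to-seq_puts() conversions throughout this series only apply where the format string contains no conversion specifiers. A minimal sketch of the distinction, using a hypothetical show function (assumes <linux/seq_file.h>):

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_puts(m, "Status:\tok\n");		/* fixed string: no format parsing needed */
		seq_printf(m, "Count:\t%d\n", 42);	/* formatted output still needs seq_printf() */
		return 0;
	}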
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f5ed9512d193..511b279ec69c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -456,7 +456,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
 			      struct pid *pid, struct task_struct *task)
 {
 	if (unlikely(!sched_info_on()))
-		seq_printf(m, "0 0 0\n");
+		seq_puts(m, "0 0 0\n");
 	else
 		seq_printf(m, "%llu %llu %lu\n",
 		   (unsigned long long)task->se.sum_exec_runtime,
@@ -3161,7 +3161,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
 	return d_splice_alias(inode, dentry);
 }
 
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
 {
 	struct task_struct *task;
 	unsigned tgid;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 95b14196f284..4fc5a9b68f76 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -162,7 +162,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc
 extern void pid_update_inode(struct task_struct *, struct inode *);
 extern int pid_delete_dentry(const struct dentry *);
 extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
 extern loff_t mem_lseek(struct file *, loff_t, int);
 
 /* Lookups */
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 40b05e0d4274..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
 	else if (page_count(page) == 0 && is_free_buddy_page(page))
 		u |= 1 << KPF_BUDDY;
 
-	if (PageBalloon(page))
-		u |= 1 << KPF_BALLOON;
+	if (PageOffline(page))
+		u |= 1 << KPF_OFFLINE;
 	if (PageTable(page))
 		u |= 1 << KPF_PGTABLE;
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d2eca6..621e6ec322ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat,
 
 static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
 {
-	if (!proc_pid_lookup(dir, dentry, flags))
+	if (!proc_pid_lookup(dentry, flags))
 		return NULL;
 
 	return proc_lookup(dir, dentry, flags);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 127265e5c55f..57c0a1047250 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s)
 	struct inode *root_inode = d_inode(s->s_root);
 	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *self;
+	int ret = -ENOMEM;
 
 	inode_lock(root_inode);
 	self = d_alloc_name(s->s_root, "self");
@@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s)
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_self_inode_operations;
 			d_add(self, inode);
+			ret = 0;
 		} else {
 			dput(self);
-			self = ERR_PTR(-ENOMEM);
 		}
-	} else {
-		self = ERR_PTR(-ENOMEM);
 	}
 	inode_unlock(root_inode);
-	if (IS_ERR(self)) {
+
+	if (ret)
 		pr_err("proc_fill_super: can't allocate /proc/self\n");
-		return PTR_ERR(self);
-	}
-	ns->proc_self = self;
-	return 0;
+	else
+		ns->proc_self = self;
+
+	return ret;
 }
 
 void __init proc_self_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 76175211b304..80c305f206bb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,21 +23,21 @@
 
 #ifdef arch_idle_time
 
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 idle;
 
-	idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+	idle = kcs->cpustat[CPUTIME_IDLE];
 	if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
 		idle += arch_idle_time(cpu);
 	return idle;
 }
 
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 iowait;
 
-	iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+	iowait = kcs->cpustat[CPUTIME_IOWAIT];
 	if (cpu_online(cpu) && nr_iowait_cpu(cpu))
 		iowait += arch_idle_time(cpu);
 	return iowait;
@@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu)
 
 #else
 
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 idle, idle_usecs = -1ULL;
 
@@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu)
 
 	if (idle_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
-		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+		idle = kcs->cpustat[CPUTIME_IDLE];
 	else
 		idle = idle_usecs * NSEC_PER_USEC;
 
 	return idle;
 }
 
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 iowait, iowait_usecs = -1ULL;
 
@@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu)
 
 	if (iowait_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
-		iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+		iowait = kcs->cpustat[CPUTIME_IOWAIT];
 	else
 		iowait = iowait_usecs * NSEC_PER_USEC;
 
@@ -120,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v)
 	getboottime64(&boottime);
 
 	for_each_possible_cpu(i) {
-		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
-		nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
-		idle += get_idle_time(i);
-		iowait += get_iowait_time(i);
-		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
-		softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
-		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
-		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
-		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+		user += kcs->cpustat[CPUTIME_USER];
+		nice += kcs->cpustat[CPUTIME_NICE];
+		system += kcs->cpustat[CPUTIME_SYSTEM];
+		idle += get_idle_time(kcs, i);
+		iowait += get_iowait_time(kcs, i);
+		irq += kcs->cpustat[CPUTIME_IRQ];
+		softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+		steal += kcs->cpustat[CPUTIME_STEAL];
+		guest += kcs->cpustat[CPUTIME_GUEST];
+		guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
 		sum += kstat_cpu_irqs_sum(i);
 		sum += arch_irq_stat_cpu(i);
 
@@ -155,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_putc(p, '\n');
 
 	for_each_online_cpu(i) {
+		struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-		user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
-		nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-		system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
-		idle = get_idle_time(i);
-		iowait = get_iowait_time(i);
-		irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
-		softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
-		steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
-		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
-		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		user = kcs->cpustat[CPUTIME_USER];
+		nice = kcs->cpustat[CPUTIME_NICE];
+		system = kcs->cpustat[CPUTIME_SYSTEM];
+		idle = get_idle_time(kcs, i);
+		iowait = get_iowait_time(kcs, i);
+		irq = kcs->cpustat[CPUTIME_IRQ];
+		softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+		steal = kcs->cpustat[CPUTIME_STEAL];
+		guest = kcs->cpustat[CPUTIME_GUEST];
+		guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
 		seq_printf(p, "cpu%d", i);
 		seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
 		seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 85b0ef890b28..beccb0b1d57c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 	pte_t ptent = *pte;
 
 	if (pte_present(ptent)) {
-		ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
-		ptent = pte_wrprotect(ptent);
+		pte_t old_pte;
+
+		old_pte = ptep_modify_prot_start(vma, addr, pte);
+		ptent = pte_wrprotect(old_pte);
 		ptent = pte_clear_soft_dirty(ptent);
-		ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index f912872fbf91..36bf0f2e102e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		seq_file_path(m, file, "");
 	} else if (mm && is_stack(vma)) {
 		seq_pad(m, ' ');
-		seq_printf(m, "[stack]");
+		seq_puts(m, "[stack]");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b905010ca9eb..f61ae53533f5 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s)
 	struct inode *root_inode = d_inode(s->s_root);
 	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *thread_self;
+	int ret = -ENOMEM;
 
 	inode_lock(root_inode);
 	thread_self = d_alloc_name(s->s_root, "thread-self");
@@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s)
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_thread_self_inode_operations;
 			d_add(thread_self, inode);
+			ret = 0;
 		} else {
 			dput(thread_self);
-			thread_self = ERR_PTR(-ENOMEM);
 		}
-	} else {
-		thread_self = ERR_PTR(-ENOMEM);
 	}
 	inode_unlock(root_inode);
-	if (IS_ERR(thread_self)) {
+
+	if (ret)
 		pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
-		return PTR_ERR(thread_self);
-	}
-	ns->proc_thread_self = thread_self;
-	return 0;
+	else
+		ns->proc_thread_self = thread_self;
+
+	return ret;
 }
 
 void __init proc_thread_self_init(void)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 05e61e6c843f..fa782fba51ee 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -606,7 +606,7 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd)
 	return 0;
 }
 
-static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm,
+static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
 					     unsigned long addr,
 					     pte_t *ptep)
 {
@@ -615,10 +615,10 @@ static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm,
 	 * non-present, preventing the hardware from asynchronously
 	 * updating it.
 	 */
-	return ptep_get_and_clear(mm, addr, ptep);
+	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
 }
 
-static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
+static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
 					     unsigned long addr,
 					     pte_t *ptep, pte_t pte)
 {
@@ -626,7 +626,7 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
 	 * The pte is non-present, so there's no hardware state to
 	 * preserve.
 	 */
-	set_pte_at(mm, addr, ptep, pte);
+	set_pte_at(vma->vm_mm, addr, ptep, pte);
 }
 
 #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -644,22 +644,22 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
  * queue the update to be done at some later time. The update must be
  * actually committed before the pte lock is released, however.
  */
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
+static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
 					   unsigned long addr,
 					   pte_t *ptep)
 {
-	return __ptep_modify_prot_start(mm, addr, ptep);
+	return __ptep_modify_prot_start(vma, addr, ptep);
 }
 
 /*
  * Commit an update to a pte, leaving any hardware-controlled bits in
  * the PTE unmodified.
  */
-static inline void ptep_modify_prot_commit(struct mm_struct *mm,
+static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 					   unsigned long addr,
-					   pte_t *ptep, pte_t pte)
+					   pte_t *ptep, pte_t old_pte, pte_t pte)
 {
-	__ptep_modify_prot_commit(mm, addr, ptep, pte);
+	__ptep_modify_prot_commit(vma, addr, ptep, pte);
 }
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
 #endif /* CONFIG_MMU */
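The fs/proc/task_mmu.c hunk earlier and the generic helpers above share one calling pattern: start returns the old pte, the caller derives the new value from it, and commit receives both the old and the new pte. A minimal caller sketch under that assumption, with a hypothetical helper name mirroring the clear_soft_dirty() usage:

	/* Sketch: write-protect one present pte using the vma-based transaction API. */
	static void demo_wrprotect_pte(struct vm_area_struct *vma,
				       unsigned long addr, pte_t *ptep)
	{
		pte_t old_pte, new_pte;

		old_pte = ptep_modify_prot_start(vma, addr, ptep);	/* pte is cleared/guarded here */
		new_pte = pte_wrprotect(old_pte);			/* derive the new value from the old one */
		ptep_modify_prot_commit(vma, addr, ptep, old_pte, new_pte);
	}

Passing the vma (rather than just the mm) and the old pte lets architectures that lazily queue pte updates commit them more efficiently.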
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c28a47cbe355..f9b029180241 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -365,7 +365,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 	rcu_read_lock();
 
 	/*
-	 * Paired with store_release in inode_switch_wb_work_fn() and
+	 * Paired with store_release in inode_switch_wbs_work_fn() and
 	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
 	 */
 	cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 53051f3d8f25..f111c780ef1d 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -4,15 +4,18 @@
  *
  * Common interface definitions for making balloon pages movable by compaction.
  *
- * Despite being perfectly possible to perform ballooned pages migration, they
- * make a special corner case to compaction scans because balloon pages are not
- * enlisted at any LRU list like the other pages we do compact / migrate.
+ * Balloon page migration makes use of the general non-lru movable page
+ * feature.
+ *
+ * page->private is used to reference the responsible balloon device.
+ * page->mapping is used in context of non-lru page migration to reference
+ * the address space operations for page isolation/migration/compaction.
  *
  * As the page isolation scanning step a compaction thread does is a lockless
  * procedure (from a page standpoint), it might bring some racy situations while
  * performing balloon page compaction. In order to sort out these racy scenarios
  * and safely perform balloon's page compaction and migration we must, always,
- * ensure following these three simple rules:
+ * ensure following these simple rules:
  *
  * i. when updating a balloon's page ->mapping element, strictly do it under
  *    the following lock order, independently of the far superior
@@ -21,19 +24,8 @@
  *       +--spin_lock_irq(&b_dev_info->pages_lock);
  *             ... page->mapping updates here ...
  *
- * ii. before isolating or dequeueing a balloon page from the balloon device
- *     pages list, the page reference counter must be raised by one and the
- *     extra refcount must be dropped when the page is enqueued back into
- *     the balloon device page list, thus a balloon page keeps its reference
- *     counter raised only while it is under our special handling;
- *
- * iii. after the lockless scan step have selected a potential balloon page for
- *      isolation, re-test the PageBalloon mark and the PagePrivate flag
- *      under the proper page lock, to ensure isolating a valid balloon page
- *      (not yet isolated, nor under release procedure)
- *
- * iv. isolation or dequeueing procedure must clear PagePrivate flag under
- *     page lock together with removing page from balloon device page list.
+ * ii. isolation or dequeueing procedure must remove the page from balloon
+ *     device page list under b_dev_info->pages_lock.
  *
  * The functions provided by this interface are placed to help on coping with
  * the aforementioned balloon page corner case, as well as to ensure the simple
@@ -103,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping,
 static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
-	__SetPageBalloon(page);
+	__SetPageOffline(page);
 	__SetPageMovable(page, balloon->inode->i_mapping);
 	set_page_private(page, (unsigned long)balloon);
 	list_add(&page->lru, &balloon->pages);
@@ -119,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
  */
 static inline void balloon_page_delete(struct page *page)
 {
-	__ClearPageBalloon(page);
+	__ClearPageOffline(page);
 	__ClearPageMovable(page);
 	set_page_private(page, 0);
 	/*
@@ -149,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
 static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
-	__SetPageBalloon(page);
+	__SetPageOffline(page);
 	list_add(&page->lru, &balloon->pages);
 }
 
 static inline void balloon_page_delete(struct page *page)
 {
-	__ClearPageBalloon(page);
+	__ClearPageOffline(page);
 	list_del(&page->lru);
 }
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8fcbae1b8db0..aad3babef007 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -32,6 +32,7 @@ struct kernfs_node;
 struct kernfs_ops;
 struct kernfs_open_file;
 struct seq_file;
+struct poll_table_struct;
 
 #define MAX_CGROUP_TYPE_NAMELEN 32
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -574,6 +575,9 @@ struct cftype {
 	ssize_t (*write)(struct kernfs_open_file *of,
 			 char *buf, size_t nbytes, loff_t off);
 
+	__poll_t (*poll)(struct kernfs_open_file *of,
+			 struct poll_table_struct *pt);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lock_class_key lockdep_key;
 #endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 68250a57aace..9569e7c786d3 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -88,14 +88,13 @@ extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_extfrag_threshold;
-extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
-			void __user *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 		unsigned int order, unsigned int alloc_flags,
-		const struct alloc_context *ac, enum compact_priority prio);
+		const struct alloc_context *ac, enum compact_priority prio,
+		struct page **page);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern enum compact_result compaction_suitable(struct zone *zone, int order,
 		unsigned int alloc_flags, int classzone_idx);
@@ -227,8 +226,8 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i
 
 #endif /* CONFIG_COMPACTION */
 
-#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
 struct node;
+#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
 extern int compaction_register_node(struct node *node);
 extern void compaction_unregister_node(struct node *node);
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 6cb4640b6160..4d2f13e8c540 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1095,7 +1095,7 @@ static inline void set_dev_node(struct device *dev, int node)
 #else
 static inline int dev_to_node(struct device *dev)
 {
-	return -1;
+	return NUMA_NO_NODE;
 }
 static inline void set_dev_node(struct device *dev, int node)
 {
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 011965c08b93..6d775984905b 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -7,6 +7,13 @@
 #include <linux/bitops.h>
 #include <linux/jump_label.h>
 
+/*
+ * Return code to denote that requested number of
+ * frontswap pages are unused(moved to page cache).
+ * Used in in shmem_unuse and try_to_unuse.
+ */
+#define FRONTSWAP_PAGES_UNUSED	2
+
 struct frontswap_ops {
 	void (*init)(unsigned); /* this swap type was just swapon'ed */
 	int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd423fec8d83..08f26046233e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2091,7 +2091,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
  * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
  *			synchronize competing switching instances and to tell
  *			wb stat updates to grab the i_pages lock.  See
- *			inode_switch_wb_work_fn() for details.
+ *			inode_switch_wbs_work_fn() for details.
  *
  * I_OVL_INUSE		Used by overlayfs to get exclusive ownership on upper
  *			and work dirs among overlayfs mounts.
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5f5e25fd6149..fdab7de7490d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -24,21 +24,21 @@ struct vm_area_struct;
 #define ___GFP_HIGH		0x20u
 #define ___GFP_IO		0x40u
 #define ___GFP_FS		0x80u
-#define ___GFP_WRITE		0x100u
-#define ___GFP_NOWARN		0x200u
-#define ___GFP_RETRY_MAYFAIL	0x400u
-#define ___GFP_NOFAIL		0x800u
-#define ___GFP_NORETRY		0x1000u
-#define ___GFP_MEMALLOC		0x2000u
-#define ___GFP_COMP		0x4000u
-#define ___GFP_ZERO		0x8000u
-#define ___GFP_NOMEMALLOC	0x10000u
-#define ___GFP_HARDWALL		0x20000u
-#define ___GFP_THISNODE		0x40000u
-#define ___GFP_ATOMIC		0x80000u
-#define ___GFP_ACCOUNT		0x100000u
-#define ___GFP_DIRECT_RECLAIM	0x200000u
-#define ___GFP_KSWAPD_RECLAIM	0x400000u
+#define ___GFP_ZERO		0x100u
+#define ___GFP_ATOMIC		0x200u
+#define ___GFP_DIRECT_RECLAIM	0x400u
+#define ___GFP_KSWAPD_RECLAIM	0x800u
+#define ___GFP_WRITE		0x1000u
+#define ___GFP_NOWARN		0x2000u
+#define ___GFP_RETRY_MAYFAIL	0x4000u
+#define ___GFP_NOFAIL		0x8000u
+#define ___GFP_NORETRY		0x10000u
+#define ___GFP_MEMALLOC		0x20000u
+#define ___GFP_COMP		0x40000u
+#define ___GFP_NOMEMALLOC	0x80000u
+#define ___GFP_HARDWALL		0x100000u
+#define ___GFP_THISNODE		0x200000u
+#define ___GFP_ACCOUNT		0x400000u
 #ifdef CONFIG_LOCKDEP
 #define ___GFP_NOLOCKDEP	0x800000u
 #else
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 087fd5f48c91..ea35263eb76b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask);
 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
 				unsigned long address);
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+				     int nid, nodemask_t *nmask);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 			pgoff_t idx);
 
@@ -493,17 +495,54 @@ static inline pgoff_t basepage_index(struct page *page)
 extern int dissolve_free_huge_page(struct page *page);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
 				    unsigned long end_pfn);
-static inline bool hugepage_migration_supported(struct hstate *h)
-{
+
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#ifndef arch_hugetlb_migration_supported
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
 	if ((huge_page_shift(h) == PMD_SHIFT) ||
-		(huge_page_shift(h) == PGDIR_SHIFT))
+	    (huge_page_shift(h) == PUD_SHIFT) ||
+	    (huge_page_shift(h) == PGDIR_SHIFT))
 		return true;
 	else
 		return false;
+}
+#endif
 #else
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
 	return false;
+}
 #endif
+
+static inline bool hugepage_migration_supported(struct hstate *h)
+{
+	return arch_hugetlb_migration_supported(h);
+}
+
+/*
+ * Movability check is different as compared to migration check.
+ * It determines whether or not a huge page should be placed on
+ * movable zone or not. Movability of any huge page should be
+ * required only if huge page size is supported for migration.
+ * There wont be any reason for the huge page to be movable if
+ * it is not migratable to start with. Also the size of the huge
+ * page should be large enough to be placed under a movable zone
+ * and still feasible enough to be migratable. Just the presence
+ * in movable zone does not make the migration feasible.
+ *
+ * So even though large huge page sizes like the gigantic ones
+ * are migratable they should not be movable because its not
+ * feasible to migrate them from movable zone.
+ */
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+	if (!hugepage_migration_supported(h))
+		return false;
+
+	if (hstate_is_gigantic(h))
+		return false;
+	return true;
 }
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
@@ -543,6 +582,26 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 	set_huge_pte_at(mm, addr, ptep, pte);
 }
 #endif
+
+#ifndef huge_ptep_modify_prot_start
+#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
+static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+						unsigned long addr, pte_t *ptep)
+{
+	return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+}
+#endif
+
+#ifndef huge_ptep_modify_prot_commit
+#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
+static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+						unsigned long addr, pte_t *ptep,
+						pte_t old_pte, pte_t pte)
+{
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+#endif
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page(v, a, r) NULL
@@ -602,6 +661,11 @@ static inline bool hugepage_migration_supported(struct hstate *h)
 	return false;
 }
 
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+	return false;
+}
+
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 					   struct mm_struct *mm, pte_t *pte)
 {
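One plausible consumer of the new hugepage_movable_supported() helper is gfp-mask selection: only huge pages that may legitimately live in ZONE_MOVABLE get __GFP_MOVABLE. The sketch below illustrates that idea with a hypothetical helper name; it is not the exact helper added elsewhere in this series:

	/* Sketch: pick an allocation mask depending on whether the hstate is movable. */
	static inline gfp_t demo_htlb_alloc_mask(struct hstate *h)
	{
		if (hugepage_movable_supported(h))
			return GFP_HIGHUSER_MOVABLE;

		return GFP_HIGHUSER;
	}

Gigantic hstates remain migratable (per hugepage_migration_supported()) but are excluded from the movable mask, matching the comment in the hunk above.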
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index d314150658a4..a61dc075e2ce 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_KASAN_CHECKS_H
 #define _LINUX_KASAN_CHECKS_H
 
-#ifdef CONFIG_KASAN
+#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL)
 void kasan_check_read(const volatile void *p, unsigned int size);
 void kasan_check_write(const volatile void *p, unsigned int size);
 #else
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 5b36b1287a5a..0cac1207bb00 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -25,6 +25,7 @@ struct seq_file;
 struct vm_area_struct;
 struct super_block;
 struct file_system_type;
+struct poll_table_struct;
 
 struct kernfs_open_node;
 struct kernfs_iattrs;
@@ -261,6 +262,9 @@ struct kernfs_ops {
 	ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
 			 loff_t off);
 
+	__poll_t (*poll)(struct kernfs_open_file *of,
+			 struct poll_table_struct *pt);
+
 	int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -350,6 +354,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
 int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		     const char *new_name, const void *new_ns);
 int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of,
+			     struct poll_table_struct *pt);
 void kernfs_notify(struct kernfs_node *kn);
 
 const void *kernfs_super_ns(struct super_block *sb);
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e8164abcf..e48b1e453ff5 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -53,6 +53,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
 
 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+bool reuse_ksm_page(struct page *page,
+			struct vm_area_struct *vma, unsigned long address);
 
 #else  /* !CONFIG_KSM */
 
@@ -86,6 +88,11 @@ static inline void rmap_walk_ksm(struct page *page,
 static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 {
 }
+static inline bool reuse_ksm_page(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	return false;
+}
 #endif /* CONFIG_MMU */
 #endif /* !CONFIG_KSM */
 
diff --git a/include/linux/list.h b/include/linux/list.h
index edb7628e46ed..79626b5ab36c 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -207,6 +207,17 @@ static inline void list_bulk_move_tail(struct list_head *head,
 }
 
 /**
+ * list_is_first -- tests whether @list is the first entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_first(const struct list_head *list,
+					const struct list_head *head)
+{
+	return list->prev == head;
+}
+
+/**
  * list_is_last - tests whether @list is the last entry in list @head
  * @list: the entry to test
  * @head: the head of the list
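A small usage sketch for the new list_is_first() helper, symmetric with the existing list_is_last(); the structures are hypothetical:

	struct demo_item {
		struct list_head link;
		int value;
	};

	/* Returns true when @item sits at the front of @queue, i.e. item->link.prev == queue. */
	static bool demo_is_front(struct demo_item *item, struct list_head *queue)
	{
		return list_is_first(&item->link, queue);
	}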
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83ae11cbd12c..1f3d880b7ca1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -429,6 +429,11 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 }
 struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
 
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+	return mem_cgroup_from_css(seq_css(m));
+}
+
 static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
 {
 	struct mem_cgroup_per_node *mz;
@@ -937,6 +942,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 	return NULL;
 }
 
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+	return NULL;
+}
+
 static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
 {
 	return NULL;
@@ -1273,12 +1283,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 
 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
 void memcg_kmem_put_cache(struct kmem_cache *cachep);
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
-			    struct mem_cgroup *memcg);
 
 #ifdef CONFIG_MEMCG_KMEM
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
-void memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+void __memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+			      struct mem_cgroup *memcg);
 
 extern struct static_key_false memcg_kmem_enabled_key;
 extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1300,6 +1310,26 @@ static inline bool memcg_kmem_enabled(void)
 	return static_branch_unlikely(&memcg_kmem_enabled_key);
 }
 
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+	if (memcg_kmem_enabled())
+		return __memcg_kmem_charge(page, gfp, order);
+	return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+	if (memcg_kmem_enabled())
+		__memcg_kmem_uncharge(page, order);
+}
+
+static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp,
+					  int order, struct mem_cgroup *memcg)
+{
+	if (memcg_kmem_enabled())
+		return __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+	return 0;
+}
 /*
  * helper for accessing a memcg's index. It will be used as an index in the
  * child cache array in kmem_cache, and also to derive its name. This function
@@ -1325,6 +1355,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order)
 {
 }
 
+static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+	return 0;
+}
+
+static inline void __memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
 
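The memcontrol.h hunks above move the memcg_kmem_enabled() static-branch test into static inline wrappers, which is what allows the fs/pipe.c hunk earlier to drop its explicit check. A minimal caller sketch under that assumption, with hypothetical function names:

	/* Sketch: charge/uncharge a kmem page; the wrappers are no-ops when kmem accounting is off. */
	static int demo_charge(struct page *page, gfp_t gfp, int order)
	{
		return memcg_kmem_charge(page, gfp, order);	/* checks memcg_kmem_enabled() internally */
	}

	static void demo_uncharge(struct page *page, int order)
	{
		memcg_kmem_uncharge(page, order);		/* likewise a no-op when disabled */
	}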
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 368267c1b71b..52869d6d38b3 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -89,7 +89,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 		unsigned long *valid_start, unsigned long *valid_end);
 extern void __offline_isolated_pages(unsigned long, unsigned long);
 
-typedef void (*online_page_callback_t)(struct page *page);
+typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
 
 extern int set_online_page_callback(online_page_callback_t callback);
 extern int restore_online_page_callback(online_page_callback_t callback);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..20ec56f8e2bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1536,7 +1536,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 		    unsigned int gup_flags, struct page **pages, int *locked);
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 		    struct page **pages, unsigned int gup_flags);
-#ifdef CONFIG_FS_DAX
+
+#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
 long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
 			    unsigned int gup_flags, struct page **pages,
 			    struct vm_area_struct **vmas);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0a36a22228e7..ab9b48420200 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -80,7 +80,7 @@ struct page {
 		struct {	/* Page cache and anonymous pages */
 			/**
 			 * @lru: Pageout list, eg. active_list protected by
-			 * zone_lru_lock.  Sometimes used as a generic list
+			 * pgdat->lru_lock.  Sometimes used as a generic list
 			 * by the page owner.
 			 */
 			struct list_head lru;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 842f9189537b..fba7741533be 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -480,6 +480,8 @@ struct zone {
 	unsigned long		compact_cached_free_pfn;
 	/* pfn where async and sync compaction migration scanner should start */
 	unsigned long		compact_cached_migrate_pfn[2];
+	unsigned long		compact_init_migrate_pfn;
+	unsigned long		compact_init_free_pfn;
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -728,10 +730,6 @@ typedef struct pglist_data {
 
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
-static inline spinlock_t *zone_lru_lock(struct zone *zone)
-{
-	return &zone->zone_pgdat->lru_lock;
-}
 
 static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
 {
@@ -1299,7 +1297,7 @@ void memory_present(int nid, unsigned long start, unsigned long end);
 
 /*
  * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
- * need to check pfn validility within that MAX_ORDER_NR_PAGES block.
+ * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
  * pfn_valid_within() should be used in this case; we optimise this away
  * when we have no holes within a MAX_ORDER_NR_PAGES block.
  */
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 5a30ad594ccc..27e7fa36f707 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -444,8 +444,8 @@ static inline int next_memory_node(int nid)
 	return next_node(nid, node_states[N_MEMORY]);
 }
 
-extern int nr_node_ids;
-extern int nr_online_nodes;
+extern unsigned int nr_node_ids;
+extern unsigned int nr_online_nodes;
 
 static inline void node_set_online(int nid)
 {
@@ -485,8 +485,8 @@ static inline int num_node_state(enum node_states state)
 #define first_online_node	0
 #define first_memory_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
-#define nr_node_ids		1
-#define nr_online_nodes		1
+#define nr_node_ids		1U
+#define nr_online_nodes		1U
 
 #define node_set_online(node)	   node_set_state((node), N_ONLINE)
 #define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 39b4494e29f1..9f8712a4b1a5 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -17,8 +17,37 @@
17/* 17/*
18 * Various page->flags bits: 18 * Various page->flags bits:
19 * 19 *
20 * PG_reserved is set for special pages, which can never be swapped out. Some 20 * PG_reserved is set for special pages. The "struct page" of such a page
21 * of them might not even exist... 21 * should in general not be touched (e.g. set dirty) except by its owner.
22 * Pages marked as PG_reserved include:
23 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
24 * initrd, HW tables)
25 * - Pages reserved or allocated early during boot (before the page allocator
26 * was initialized). This includes (depending on the architecture) the
27 * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
28 * much more. Once (if ever) freed, PG_reserved is cleared and they will
29 * be given to the page allocator.
30 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
31 * to read/write these pages might end badly. Don't touch!
32 * - The zero page(s)
33 * - Pages not added to the page allocator when onlining a section because
34 * they were excluded via the online_page_callback() or because they are
35 * PG_hwpoison.
36 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
37 * control pages, vmcoreinfo)
38 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
39 * not marked PG_reserved (as they might be in use by somebody else who does
40 * not respect the caching strategy).
41 * - Pages part of an offline section (struct pages of offline sections should
42 * not be trusted as they will be initialized when first onlined).
43 * - MCA pages on ia64
44 * - Pages holding CPU notes for POWER Firmware Assisted Dump
45 * - Device memory (e.g. PMEM, DAX, HMM)
46 * Some PG_reserved pages will be excluded from the hibernation image.
47 * PG_reserved does in general not hinder anybody from dumping or swapping
48 * and is no longer required for remap_pfn_range(). ioremap might require it.
49 * Consequently, PG_reserved for a page mapped into user space can indicate
50 * the zero page, the vDSO, MMIO pages or device memory.
22 * 51 *
23 * The PG_private bitflag is set on pagecache pages if they contain filesystem 52 * The PG_private bitflag is set on pagecache pages if they contain filesystem
24 * specific data (which is normally at page->private). It can be used by 53 * specific data (which is normally at page->private). It can be used by
@@ -671,7 +700,7 @@ PAGEFLAG_FALSE(DoubleMap)
671/* Reserve 0x0000007f to catch underflows of page_mapcount */ 700/* Reserve 0x0000007f to catch underflows of page_mapcount */
672#define PAGE_MAPCOUNT_RESERVE -128 701#define PAGE_MAPCOUNT_RESERVE -128
673#define PG_buddy 0x00000080 702#define PG_buddy 0x00000080
674#define PG_balloon 0x00000100 703#define PG_offline 0x00000100
675#define PG_kmemcg 0x00000200 704#define PG_kmemcg 0x00000200
676#define PG_table 0x00000400 705#define PG_table 0x00000400
677 706
@@ -706,10 +735,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \
706PAGE_TYPE_OPS(Buddy, buddy) 735PAGE_TYPE_OPS(Buddy, buddy)
707 736
708/* 737/*
709 * PageBalloon() is true for pages that are on the balloon page list 738 * PageOffline() indicates that the page is logically offline although the
710 * (see mm/balloon_compaction.c). 739 * containing section is online. (e.g. inflated in a balloon driver or
740 * not onlined when onlining the section).
741 * The content of these pages is effectively stale. Such pages should not
742 * be touched (read/write/dump/save) except by their owner.
711 */ 743 */
712PAGE_TYPE_OPS(Balloon, balloon) 744PAGE_TYPE_OPS(Offline, offline)
713 745
714/* 746/*
715 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on 747 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
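[Editor's note] PAGE_TYPE_OPS(Offline, offline) above also generates __SetPageOffline() and __ClearPageOffline(). A rough sketch of how a memory-ballooning driver might use them when handing pages to and from the hypervisor; the accessor names come from the macro, but the surrounding driver code is invented for illustration:

	static void example_balloon_inflate(struct page *page)
	{
		/* content becomes stale; only the driver may touch the page now */
		__SetPageOffline(page);
		/* ... report the page as unbacked to the hypervisor ... */
	}

	static void example_balloon_deflate(struct page *page)
	{
		/* ... reclaim the page from the hypervisor ... */
		__ClearPageOffline(page);
	}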
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e2d7039af6a3..b477a70cc2e4 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -164,7 +164,7 @@ void release_pages(struct page **pages, int nr);
164 * will find the page or it will not. Likewise, the old find_get_page could run 164 * will find the page or it will not. Likewise, the old find_get_page could run
165 * either before the insertion or afterwards, depending on timing. 165 * either before the insertion or afterwards, depending on timing.
166 */ 166 */
167static inline int page_cache_get_speculative(struct page *page) 167static inline int __page_cache_add_speculative(struct page *page, int count)
168{ 168{
169#ifdef CONFIG_TINY_RCU 169#ifdef CONFIG_TINY_RCU
170# ifdef CONFIG_PREEMPT_COUNT 170# ifdef CONFIG_PREEMPT_COUNT
@@ -180,10 +180,10 @@ static inline int page_cache_get_speculative(struct page *page)
180 * SMP requires. 180 * SMP requires.
181 */ 181 */
182 VM_BUG_ON_PAGE(page_count(page) == 0, page); 182 VM_BUG_ON_PAGE(page_count(page) == 0, page);
183 page_ref_inc(page); 183 page_ref_add(page, count);
184 184
185#else 185#else
186 if (unlikely(!get_page_unless_zero(page))) { 186 if (unlikely(!page_ref_add_unless(page, count, 0))) {
187 /* 187 /*
188 * Either the page has been freed, or will be freed. 188 * Either the page has been freed, or will be freed.
189 * In either case, retry here and the caller should 189 * In either case, retry here and the caller should
@@ -197,27 +197,14 @@ static inline int page_cache_get_speculative(struct page *page)
197 return 1; 197 return 1;
198} 198}
199 199
200/* 200static inline int page_cache_get_speculative(struct page *page)
201 * Same as above, but add instead of inc (could just be merged)
202 */
203static inline int page_cache_add_speculative(struct page *page, int count)
204{ 201{
205 VM_BUG_ON(in_interrupt()); 202 return __page_cache_add_speculative(page, 1);
206 203}
207#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
208# ifdef CONFIG_PREEMPT_COUNT
209 VM_BUG_ON(!in_atomic() && !irqs_disabled());
210# endif
211 VM_BUG_ON_PAGE(page_count(page) == 0, page);
212 page_ref_add(page, count);
213
214#else
215 if (unlikely(!page_ref_add_unless(page, count, 0)))
216 return 0;
217#endif
218 VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
219 204
220 return 1; 205static inline int page_cache_add_speculative(struct page *page, int count)
206{
207 return __page_cache_add_speculative(page, count);
221} 208}
222 209
223#ifdef CONFIG_NUMA 210#ifdef CONFIG_NUMA
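[Editor's note] The SMP branch above relies on page_ref_add_unless(page, count, 0), i.e. adding to the refcount only if it is not already zero. A small userspace analogue with C11 atomics, just to illustrate that compare-and-swap loop; all names here are invented for the example:

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Add 'nr' to *ref unless its current value equals 'unless';
	 * returns true on success, mirroring page_ref_add_unless(). */
	static bool ref_add_unless(atomic_int *ref, int nr, int unless)
	{
		int old = atomic_load(ref);

		do {
			if (old == unless)
				return false;
		} while (!atomic_compare_exchange_weak(ref, &old, old + nr));

		return true;
	}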
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 15927ebc22f2..5046bad0c1c5 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -30,7 +30,7 @@
30 */ 30 */
31#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) 31#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
32 32
33/********** mm/debug-pagealloc.c **********/ 33/********** mm/page_poison.c **********/
34#ifdef CONFIG_PAGE_POISONING_ZERO 34#ifdef CONFIG_PAGE_POISONING_ZERO
35#define PAGE_POISON 0x00 35#define PAGE_POISON 0x00
36#else 36#else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 903ef29b62c3..f073bd59df32 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -48,6 +48,7 @@ struct pid_namespace;
48struct pipe_inode_info; 48struct pipe_inode_info;
49struct rcu_node; 49struct rcu_node;
50struct reclaim_state; 50struct reclaim_state;
51struct capture_control;
51struct robust_list_head; 52struct robust_list_head;
52struct sched_attr; 53struct sched_attr;
53struct sched_param; 54struct sched_param;
@@ -950,6 +951,9 @@ struct task_struct {
950 951
951 struct io_context *io_context; 952 struct io_context *io_context;
952 953
954#ifdef CONFIG_COMPACTION
955 struct capture_control *capture_control;
956#endif
953 /* Ptrace state: */ 957 /* Ptrace state: */
954 unsigned long ptrace_message; 958 unsigned long ptrace_message;
955 kernel_siginfo_t *last_siginfo; 959 kernel_siginfo_t *last_siginfo;
@@ -1395,6 +1399,7 @@ extern struct pid *cad_pid;
1395#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ 1399#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
1396#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ 1400#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
1397#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1401#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1402#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
1398#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1403#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
1399#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ 1404#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
1400 1405
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 3bfa6a0cbba4..0cd9f10423fb 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -148,17 +148,25 @@ static inline bool in_vfork(struct task_struct *tsk)
148 * Applies per-task gfp context to the given allocation flags. 148 * Applies per-task gfp context to the given allocation flags.
149 * PF_MEMALLOC_NOIO implies GFP_NOIO 149 * PF_MEMALLOC_NOIO implies GFP_NOIO
150 * PF_MEMALLOC_NOFS implies GFP_NOFS 150 * PF_MEMALLOC_NOFS implies GFP_NOFS
151 * PF_MEMALLOC_NOCMA implies no allocation from CMA region.
151 */ 152 */
152static inline gfp_t current_gfp_context(gfp_t flags) 153static inline gfp_t current_gfp_context(gfp_t flags)
153{ 154{
154 /* 155 if (unlikely(current->flags &
155 * NOIO implies both NOIO and NOFS and it is a weaker context 156 (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) {
156 * so always make sure it makes precedence 157 /*
157 */ 158 * NOIO implies both NOIO and NOFS and it is a weaker context
158 if (unlikely(current->flags & PF_MEMALLOC_NOIO)) 159 * so always make sure it makes precedence
159 flags &= ~(__GFP_IO | __GFP_FS); 160 */
160 else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) 161 if (current->flags & PF_MEMALLOC_NOIO)
161 flags &= ~__GFP_FS; 162 flags &= ~(__GFP_IO | __GFP_FS);
163 else if (current->flags & PF_MEMALLOC_NOFS)
164 flags &= ~__GFP_FS;
165#ifdef CONFIG_CMA
166 if (current->flags & PF_MEMALLOC_NOCMA)
167 flags &= ~__GFP_MOVABLE;
168#endif
169 }
162 return flags; 170 return flags;
163} 171}
164 172
@@ -248,6 +256,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
248 current->flags = (current->flags & ~PF_MEMALLOC) | flags; 256 current->flags = (current->flags & ~PF_MEMALLOC) | flags;
249} 257}
250 258
259#ifdef CONFIG_CMA
260static inline unsigned int memalloc_nocma_save(void)
261{
262 unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
263
264 current->flags |= PF_MEMALLOC_NOCMA;
265 return flags;
266}
267
268static inline void memalloc_nocma_restore(unsigned int flags)
269{
270 current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
271}
272#else
273static inline unsigned int memalloc_nocma_save(void)
274{
275 return 0;
276}
277
278static inline void memalloc_nocma_restore(unsigned int flags)
279{
280}
281#endif
282
251#ifdef CONFIG_MEMCG 283#ifdef CONFIG_MEMCG
252/** 284/**
253 * memalloc_use_memcg - Starts the remote memcg charging scope. 285 * memalloc_use_memcg - Starts the remote memcg charging scope.
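[Editor's note] memalloc_nocma_save()/memalloc_nocma_restore() follow the same scoped save/restore convention as the existing memalloc_noio_*() and memalloc_nofs_*() helpers. A minimal sketch of the intended usage; the callee is hypothetical, and the series wires this up around long-term page pinning:

	unsigned int nocma_flags;

	nocma_flags = memalloc_nocma_save();
	/*
	 * Allocation requests in this scope have __GFP_MOVABLE cleared by
	 * current_gfp_context(), which keeps them out of CMA pageblocks.
	 */
	ret = do_long_term_pin(pages, nr);	/* hypothetical callee */
	memalloc_nocma_restore(nocma_flags);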
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index f155dc607112..f3fb1edb3526 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping);
72extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 72extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
73 pgoff_t index, gfp_t gfp_mask); 73 pgoff_t index, gfp_t gfp_mask);
74extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); 74extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
75extern int shmem_unuse(swp_entry_t entry, struct page *page); 75extern int shmem_unuse(unsigned int type, bool frontswap,
76 unsigned long *fs_pages_to_unuse);
76 77
77extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); 78extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
78extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, 79extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 3a1a1dbc6f49..d2153789bd9f 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -81,12 +81,12 @@ struct kmem_cache_order_objects {
81 */ 81 */
82struct kmem_cache { 82struct kmem_cache {
83 struct kmem_cache_cpu __percpu *cpu_slab; 83 struct kmem_cache_cpu __percpu *cpu_slab;
84 /* Used for retriving partial slabs etc */ 84 /* Used for retrieving partial slabs, etc. */
85 slab_flags_t flags; 85 slab_flags_t flags;
86 unsigned long min_partial; 86 unsigned long min_partial;
87 unsigned int size; /* The size of an object including meta data */ 87 unsigned int size; /* The size of an object including metadata */
88 unsigned int object_size;/* The size of an object without meta data */ 88 unsigned int object_size;/* The size of an object without metadata */
89 unsigned int offset; /* Free pointer offset. */ 89 unsigned int offset; /* Free pointer offset */
90#ifdef CONFIG_SLUB_CPU_PARTIAL 90#ifdef CONFIG_SLUB_CPU_PARTIAL
91 /* Number of per cpu partial objects to keep around */ 91 /* Number of per cpu partial objects to keep around */
92 unsigned int cpu_partial; 92 unsigned int cpu_partial;
@@ -110,7 +110,7 @@ struct kmem_cache {
110#endif 110#endif
111#ifdef CONFIG_MEMCG 111#ifdef CONFIG_MEMCG
112 struct memcg_cache_params memcg_params; 112 struct memcg_cache_params memcg_params;
113 /* for propagation, maximum size of a stored attr */ 113 /* For propagation, maximum size of a stored attr */
114 unsigned int max_attr_size; 114 unsigned int max_attr_size;
115#ifdef CONFIG_SYSFS 115#ifdef CONFIG_SYSFS
116 struct kset *memcg_kset; 116 struct kset *memcg_kset;
@@ -151,7 +151,7 @@ struct kmem_cache {
151#else 151#else
152#define slub_cpu_partial(s) (0) 152#define slub_cpu_partial(s) (0)
153#define slub_set_cpu_partial(s, n) 153#define slub_set_cpu_partial(s, n)
154#endif // CONFIG_SLUB_CPU_PARTIAL 154#endif /* CONFIG_SLUB_CPU_PARTIAL */
155 155
156#ifdef CONFIG_SYSFS 156#ifdef CONFIG_SYSFS
157#define SLAB_SUPPORTS_SYSFS 157#define SLAB_SUPPORTS_SYSFS
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 622025ac1461..fc50e21b3b88 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,7 +307,7 @@ struct vma_swap_readahead {
307}; 307};
308 308
309/* linux/mm/workingset.c */ 309/* linux/mm/workingset.c */
310void *workingset_eviction(struct address_space *mapping, struct page *page); 310void *workingset_eviction(struct page *page);
311void workingset_refault(struct page *page, void *shadow); 311void workingset_refault(struct page *page, void *shadow);
312void workingset_activation(struct page *page); 312void workingset_activation(struct page *page);
313 313
@@ -625,7 +625,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
625 return vm_swappiness; 625 return vm_swappiness;
626 626
627 /* root ? */ 627 /* root ? */
628 if (mem_cgroup_disabled() || !memcg->css.parent) 628 if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
629 return vm_swappiness; 629 return vm_swappiness;
630 630
631 return memcg->swappiness; 631 return memcg->swappiness;
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd9a350..a2f8658f1c55 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -41,6 +41,7 @@
41#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ 41#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
42#define F_SEAL_GROW 0x0004 /* prevent file from growing */ 42#define F_SEAL_GROW 0x0004 /* prevent file from growing */
43#define F_SEAL_WRITE 0x0008 /* prevent writes */ 43#define F_SEAL_WRITE 0x0008 /* prevent writes */
44#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
44/* (1U << 31) is reserved for signed error codes */ 45/* (1U << 31) is reserved for signed error codes */
45 46
46/* 47/*
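[Editor's note] F_SEAL_FUTURE_WRITE lets a memfd creator keep its own existing mapping usable while refusing any new writes. A small userspace sketch, assuming a libc that exposes memfd_create(); the seal value is repeated from the hunk above in case older uapi headers lack it:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef F_SEAL_FUTURE_WRITE
	#define F_SEAL_FUTURE_WRITE 0x0010	/* value from the hunk above */
	#endif

	int main(void)
	{
		int fd = memfd_create("example", MFD_ALLOW_SEALING);

		if (fd < 0 || ftruncate(fd, 4096) < 0)
			return 1;

		if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
			return 1;

		/* New writes are expected to be refused from now on ... */
		if (write(fd, "x", 1) < 0)
			perror("write after F_SEAL_FUTURE_WRITE");

		/* ... as are new shared writable mappings. */
		if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) == MAP_FAILED)
			perror("writable mmap after F_SEAL_FUTURE_WRITE");

		close(fd);
		return 0;
	}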
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 21b9113c69da..6f2f2720f3ac 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -32,7 +32,7 @@
32 32
33#define KPF_KSM 21 33#define KPF_KSM 21
34#define KPF_THP 22 34#define KPF_THP 22
35#define KPF_BALLOON 23 35#define KPF_OFFLINE 23
36#define KPF_ZERO_PAGE 24 36#define KPF_ZERO_PAGE 24
37#define KPF_IDLE 25 37#define KPF_IDLE 25
38#define KPF_PGTABLE 26 38#define KPF_PGTABLE 26
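[Editor's note] Bit 23 of /proc/kpageflags therefore now reports "offline" rather than "balloon"; tools/vm/page-types also knows about these bits. A small userspace reader as a sketch; the PFN argument handling and output format are just for illustration, and reading /proc/kpageflags normally requires root:

	#include <fcntl.h>
	#include <inttypes.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	#define KPF_OFFLINE 23	/* renamed from KPF_BALLOON in the hunk above */

	int main(int argc, char **argv)
	{
		uint64_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
		uint64_t flags;
		int fd = open("/proc/kpageflags", O_RDONLY);

		if (fd < 0)
			return 1;
		if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags)) {
			close(fd);
			return 1;
		}

		printf("pfn %" PRIu64 ": %s\n", pfn,
		       (flags >> KPF_OFFLINE) & 1 ? "offline" : "not offline");
		close(fd);
		return 0;
	}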
diff --git a/init/init_task.c b/init/init_task.c
index 46dbf546264d..df0257c5928c 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -10,6 +10,7 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/audit.h> 12#include <linux/audit.h>
13#include <linux/numa.h>
13 14
14#include <asm/pgtable.h> 15#include <asm/pgtable.h>
15#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -154,7 +155,7 @@ struct task_struct init_task
154 .vtime.state = VTIME_SYS, 155 .vtime.state = VTIME_SYS,
155#endif 156#endif
156#ifdef CONFIG_NUMA_BALANCING 157#ifdef CONFIG_NUMA_BALANCING
157 .numa_preferred_nid = -1, 158 .numa_preferred_nid = NUMA_NO_NODE,
158 .numa_group = NULL, 159 .numa_group = NULL,
159 .numa_faults = NULL, 160 .numa_faults = NULL,
160#endif 161#endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index cef98502b124..17828333f7c3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3534,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3534 return ret ?: nbytes; 3534 return ret ?: nbytes;
3535} 3535}
3536 3536
3537static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3538{
3539 struct cftype *cft = of->kn->priv;
3540
3541 if (cft->poll)
3542 return cft->poll(of, pt);
3543
3544 return kernfs_generic_poll(of, pt);
3545}
3546
3537static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 3547static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3538{ 3548{
3539 return seq_cft(seq)->seq_start(seq, ppos); 3549 return seq_cft(seq)->seq_start(seq, ppos);
@@ -3572,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {
3572 .open = cgroup_file_open, 3582 .open = cgroup_file_open,
3573 .release = cgroup_file_release, 3583 .release = cgroup_file_release,
3574 .write = cgroup_file_write, 3584 .write = cgroup_file_write,
3585 .poll = cgroup_file_poll,
3575 .seq_show = cgroup_seqfile_show, 3586 .seq_show = cgroup_seqfile_show,
3576}; 3587};
3577 3588
@@ -3580,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = {
3580 .open = cgroup_file_open, 3591 .open = cgroup_file_open,
3581 .release = cgroup_file_release, 3592 .release = cgroup_file_release,
3582 .write = cgroup_file_write, 3593 .write = cgroup_file_write,
3594 .poll = cgroup_file_poll,
3583 .seq_start = cgroup_seqfile_start, 3595 .seq_start = cgroup_seqfile_start,
3584 .seq_next = cgroup_seqfile_next, 3596 .seq_next = cgroup_seqfile_next,
3585 .seq_stop = cgroup_seqfile_stop, 3597 .seq_stop = cgroup_seqfile_stop,
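[Editor's note] With the .poll callback plumbed through, a controller file can provide its own readiness logic instead of falling back to kernfs_generic_poll(). A rough sketch of such a cftype; the file name, waitqueue, readiness flag and seq_show body are invented for illustration:

	static DECLARE_WAIT_QUEUE_HEAD(example_waitqueue);
	static bool example_event_pending;

	static int example_seq_show(struct seq_file *sf, void *v)
	{
		seq_printf(sf, "pending %d\n", example_event_pending);
		return 0;
	}

	static __poll_t example_cft_poll(struct kernfs_open_file *of, poll_table *pt)
	{
		poll_wait(of->file, &example_waitqueue, pt);
		return example_event_pending ? EPOLLPRI : 0;
	}

	static struct cftype example_files[] = {
		{
			.name = "example.events",
			.seq_show = example_seq_show,
			.poll = example_cft_poll,
		},
		{ }	/* terminator */
	};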
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b98..093c9f917ed0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void)
464 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 464 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
465#ifdef CONFIG_HUGETLB_PAGE 465#ifdef CONFIG_HUGETLB_PAGE
466 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); 466 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
467#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
468 VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
467#endif 469#endif
468 470
469 arch_crash_save_vmcoreinfo(); 471 arch_crash_save_vmcoreinfo();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9cf20cc5ebe3..5942eeafb9ac 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/numa.h>
23#include <trace/events/sched.h> 24#include <trace/events/sched.h>
24 25
25static DEFINE_SPINLOCK(kthread_create_lock); 26static DEFINE_SPINLOCK(kthread_create_lock);
@@ -681,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
681{ 682{
682 struct kthread_worker *worker; 683 struct kthread_worker *worker;
683 struct task_struct *task; 684 struct task_struct *task;
684 int node = -1; 685 int node = NUMA_NO_NODE;
685 686
686 worker = kzalloc(sizeof(*worker), GFP_KERNEL); 687 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
687 if (!worker) 688 if (!worker)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 640b2034edd6..4802b039b89f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
1215 if (!pfn_valid(pfn)) 1215 if (!pfn_valid(pfn))
1216 return NULL; 1216 return NULL;
1217 1217
1218 page = pfn_to_page(pfn); 1218 page = pfn_to_online_page(pfn);
1219 if (page_zone(page) != zone) 1219 if (!page || page_zone(page) != zone)
1220 return NULL; 1220 return NULL;
1221 1221
1222 BUG_ON(!PageHighMem(page)); 1222 BUG_ON(!PageHighMem(page));
1223 1223
1224 if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || 1224 if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
1225 PageReserved(page)) 1225 return NULL;
1226
1227 if (PageReserved(page) || PageOffline(page))
1226 return NULL; 1228 return NULL;
1227 1229
1228 if (page_is_guard(page)) 1230 if (page_is_guard(page))
@@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
1277 if (!pfn_valid(pfn)) 1279 if (!pfn_valid(pfn))
1278 return NULL; 1280 return NULL;
1279 1281
1280 page = pfn_to_page(pfn); 1282 page = pfn_to_online_page(pfn);
1281 if (page_zone(page) != zone) 1283 if (!page || page_zone(page) != zone)
1282 return NULL; 1284 return NULL;
1283 1285
1284 BUG_ON(PageHighMem(page)); 1286 BUG_ON(PageHighMem(page));
@@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
1286 if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) 1288 if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
1287 return NULL; 1289 return NULL;
1288 1290
1291 if (PageOffline(page))
1292 return NULL;
1293
1289 if (PageReserved(page) 1294 if (PageReserved(page)
1290 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 1295 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
1291 return NULL; 1296 return NULL;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f3901b84d217..ead464a0f2e5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2220,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2220 INIT_HLIST_HEAD(&p->preempt_notifiers); 2220 INIT_HLIST_HEAD(&p->preempt_notifiers);
2221#endif 2221#endif
2222 2222
2223#ifdef CONFIG_COMPACTION
2224 p->capture_control = NULL;
2225#endif
2223 init_numa_balancing(clone_flags, p); 2226 init_numa_balancing(clone_flags, p);
2224} 2227}
2225 2228
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8213ff6e365d..ea74d43924b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1173,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1173 1173
1174 /* New address space, reset the preferred nid */ 1174 /* New address space, reset the preferred nid */
1175 if (!(clone_flags & CLONE_VM)) { 1175 if (!(clone_flags & CLONE_VM)) {
1176 p->numa_preferred_nid = -1; 1176 p->numa_preferred_nid = NUMA_NO_NODE;
1177 return; 1177 return;
1178 } 1178 }
1179 1179
@@ -1193,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1193 1193
1194static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 1194static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1195{ 1195{
1196 rq->nr_numa_running += (p->numa_preferred_nid != -1); 1196 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1197 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); 1197 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1198} 1198}
1199 1199
1200static void account_numa_dequeue(struct rq *rq, struct task_struct *p) 1200static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1201{ 1201{
1202 rq->nr_numa_running -= (p->numa_preferred_nid != -1); 1202 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1203 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); 1203 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1204} 1204}
1205 1205
@@ -1413,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1413 * two full passes of the "multi-stage node selection" test that is 1413 * two full passes of the "multi-stage node selection" test that is
1414 * executed below. 1414 * executed below.
1415 */ 1415 */
1416 if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && 1416 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1417 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) 1417 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1418 return true; 1418 return true;
1419 1419
@@ -1861,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1861 unsigned long interval = HZ; 1861 unsigned long interval = HZ;
1862 1862
1863 /* This task has no NUMA fault statistics yet */ 1863 /* This task has no NUMA fault statistics yet */
1864 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1864 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
1865 return; 1865 return;
1866 1866
1867 /* Periodically retry migrating the task to the preferred node */ 1867 /* Periodically retry migrating the task to the preferred node */
@@ -2108,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
2108 2108
2109static void task_numa_placement(struct task_struct *p) 2109static void task_numa_placement(struct task_struct *p)
2110{ 2110{
2111 int seq, nid, max_nid = -1; 2111 int seq, nid, max_nid = NUMA_NO_NODE;
2112 unsigned long max_faults = 0; 2112 unsigned long max_faults = 0;
2113 unsigned long fault_types[2] = { 0, 0 }; 2113 unsigned long fault_types[2] = { 0, 0 };
2114 unsigned long total_faults; 2114 unsigned long total_faults;
@@ -2651,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
2651 * the preferred node. 2651 * the preferred node.
2652 */ 2652 */
2653 if (dst_nid == p->numa_preferred_nid || 2653 if (dst_nid == p->numa_preferred_nid ||
2654 (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) 2654 (p->numa_preferred_nid != NUMA_NO_NODE &&
2655 src_nid != p->numa_preferred_nid))
2655 return; 2656 return;
2656 } 2657 }
2657 2658
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7c2b9bc88ee8..14f30b4a1b64 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1471,7 +1471,7 @@ static struct ctl_table vm_table[] = {
1471 .data = &sysctl_extfrag_threshold, 1471 .data = &sysctl_extfrag_threshold,
1472 .maxlen = sizeof(int), 1472 .maxlen = sizeof(int),
1473 .mode = 0644, 1473 .mode = 0644,
1474 .proc_handler = sysctl_extfrag_handler, 1474 .proc_handler = proc_dointvec_minmax,
1475 .extra1 = &min_extfrag_threshold, 1475 .extra1 = &min_extfrag_threshold,
1476 .extra2 = &max_extfrag_threshold, 1476 .extra2 = &max_extfrag_threshold,
1477 }, 1477 },
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d4df5b24d75e..e6a7b01932e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -222,7 +222,6 @@ config ENABLE_MUST_CHECK
222config FRAME_WARN 222config FRAME_WARN
223 int "Warn for stack frames larger than (needs gcc 4.4)" 223 int "Warn for stack frames larger than (needs gcc 4.4)"
224 range 0 8192 224 range 0 8192
225 default 3072 if KASAN_EXTRA
226 default 2048 if GCC_PLUGIN_LATENT_ENTROPY 225 default 2048 if GCC_PLUGIN_LATENT_ENTROPY
227 default 1280 if (!64BIT && PARISC) 226 default 1280 if (!64BIT && PARISC)
228 default 1024 if (!64BIT && !PARISC) 227 default 1024 if (!64BIT && !PARISC)
@@ -266,23 +265,6 @@ config UNUSED_SYMBOLS
266 you really need it, and what the merge plan to the mainline kernel for 265 you really need it, and what the merge plan to the mainline kernel for
267 your module is. 266 your module is.
268 267
269config PAGE_OWNER
270 bool "Track page owner"
271 depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
272 select DEBUG_FS
273 select STACKTRACE
274 select STACKDEPOT
275 select PAGE_EXTENSION
276 help
277 This keeps track of what call chain is the owner of a page, may
278 help to find bare alloc_page(s) leaks. Even if you include this
279 feature on your build, it is disabled in default. You should pass
280 "page_owner=on" to boot parameter in order to enable it. Eats
281 a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
282 for user-space helper.
283
284 If unsure, say N.
285
286config DEBUG_FS 268config DEBUG_FS
287 bool "Debug Filesystem" 269 bool "Debug Filesystem"
288 help 270 help
@@ -1876,6 +1858,19 @@ config TEST_LKM
1876 1858
1877 If unsure, say N. 1859 If unsure, say N.
1878 1860
1861config TEST_VMALLOC
1862 tristate "Test module for stress/performance analysis of vmalloc allocator"
1863 default n
1864 depends on MMU
1865 depends on m
1866 help
1867 This builds the "test_vmalloc" module that should be used for
1868 stress and performance analysis. So, any new change for vmalloc
1869 subsystem can be evaluated from performance and stability point
1870 of view.
1871
1872 If unsure, say N.
1873
1879config TEST_USER_COPY 1874config TEST_USER_COPY
1880 tristate "Test user/kernel boundary protections" 1875 tristate "Test user/kernel boundary protections"
1881 depends on m 1876 depends on m
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 9737059ec58b..9950b660e62d 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -78,16 +78,6 @@ config KASAN_SW_TAGS
78 78
79endchoice 79endchoice
80 80
81config KASAN_EXTRA
82 bool "KASAN: extra checks"
83 depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST
84 help
85 This enables further checks in generic KASAN, for now it only
86 includes the address-use-after-scope check that can lead to
87 excessive kernel stack usage, frame size warnings and longer
88 compile time.
89 See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715
90
91choice 81choice
92 prompt "Instrumentation type" 82 prompt "Instrumentation type"
93 depends on KASAN 83 depends on KASAN
diff --git a/lib/Makefile b/lib/Makefile
index e1b59da71418..cbfacd55aeca 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -60,6 +60,7 @@ UBSAN_SANITIZE_test_ubsan.o := y
60obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o 60obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
61obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o 61obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o
62obj-$(CONFIG_TEST_LKM) += test_module.o 62obj-$(CONFIG_TEST_LKM) += test_module.o
63obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o
63obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o 64obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o
64obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o 65obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
65obj-$(CONFIG_TEST_SORT) += test_sort.o 66obj-$(CONFIG_TEST_SORT) += test_sort.o
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 8d666ab84b5c..087a3e9a0202 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -5,6 +5,7 @@
5#include <linux/cpumask.h> 5#include <linux/cpumask.h>
6#include <linux/export.h> 6#include <linux/export.h>
7#include <linux/memblock.h> 7#include <linux/memblock.h>
8#include <linux/numa.h>
8 9
9/** 10/**
10 * cpumask_next - get the next cpu in a cpumask 11 * cpumask_next - get the next cpu in a cpumask
@@ -206,7 +207,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
206 /* Wrap: we always want a cpu. */ 207 /* Wrap: we always want a cpu. */
207 i %= num_online_cpus(); 208 i %= num_online_cpus();
208 209
209 if (node == -1) { 210 if (node == NUMA_NO_NODE) {
210 for_each_cpu(cpu, cpu_online_mask) 211 for_each_cpu(cpu, cpu_online_mask)
211 if (i-- == 0) 212 if (i-- == 0)
212 return cpu; 213 return cpu;
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 51b78405bf24..7de2702621dc 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -480,29 +480,6 @@ static noinline void __init copy_user_test(void)
480 kfree(kmem); 480 kfree(kmem);
481} 481}
482 482
483static noinline void __init use_after_scope_test(void)
484{
485 volatile char *volatile p;
486
487 pr_info("use-after-scope on int\n");
488 {
489 int local = 0;
490
491 p = (char *)&local;
492 }
493 p[0] = 1;
494 p[3] = 1;
495
496 pr_info("use-after-scope on array\n");
497 {
498 char local[1024] = {0};
499
500 p = local;
501 }
502 p[0] = 1;
503 p[1023] = 1;
504}
505
506static noinline void __init kasan_alloca_oob_left(void) 483static noinline void __init kasan_alloca_oob_left(void)
507{ 484{
508 volatile int i = 10; 485 volatile int i = 10;
@@ -682,7 +659,6 @@ static int __init kmalloc_tests_init(void)
682 kasan_alloca_oob_right(); 659 kasan_alloca_oob_right();
683 ksize_unpoisons_memory(); 660 ksize_unpoisons_memory();
684 copy_user_test(); 661 copy_user_test();
685 use_after_scope_test();
686 kmem_cache_double_free(); 662 kmem_cache_double_free();
687 kmem_cache_invalid_free(); 663 kmem_cache_invalid_free();
688 kasan_memchr(); 664 kasan_memchr();
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
new file mode 100644
index 000000000000..83cdcaa82bf6
--- /dev/null
+++ b/lib/test_vmalloc.c
@@ -0,0 +1,551 @@
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * Test module for stress and analyze performance of vmalloc allocator.
5 * (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
6 */
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/vmalloc.h>
11#include <linux/random.h>
12#include <linux/kthread.h>
13#include <linux/moduleparam.h>
14#include <linux/completion.h>
15#include <linux/delay.h>
16#include <linux/rwsem.h>
17#include <linux/mm.h>
18
19#define __param(type, name, init, msg) \
20 static type name = init; \
21 module_param(name, type, 0444); \
22 MODULE_PARM_DESC(name, msg) \
23
24__param(bool, single_cpu_test, false,
25 "Use single first online CPU to run tests");
26
27__param(bool, sequential_test_order, false,
28 "Use sequential stress tests order");
29
30__param(int, test_repeat_count, 1,
31 "Set test repeat counter");
32
33__param(int, test_loop_count, 1000000,
34 "Set test loop counter");
35
36__param(int, run_test_mask, INT_MAX,
37 "Set tests specified in the mask.\n\n"
38 "\t\tid: 1, name: fix_size_alloc_test\n"
39 "\t\tid: 2, name: full_fit_alloc_test\n"
40 "\t\tid: 4, name: long_busy_list_alloc_test\n"
41 "\t\tid: 8, name: random_size_alloc_test\n"
42 "\t\tid: 16, name: fix_align_alloc_test\n"
43 "\t\tid: 32, name: random_size_align_alloc_test\n"
44 "\t\tid: 64, name: align_shift_alloc_test\n"
45 "\t\tid: 128, name: pcpu_alloc_test\n"
46 /* Add a new test case description here. */
47);
48
49/*
50 * Depends on single_cpu_test parameter. If it is true, then
51 * use first online CPU to trigger a test on, otherwise go with
52 * all online CPUs.
53 */
54static cpumask_t cpus_run_test_mask = CPU_MASK_NONE;
55
56/*
57 * Read write semaphore for synchronization of setup
58 * phase that is done in main thread and workers.
59 */
60static DECLARE_RWSEM(prepare_for_test_rwsem);
61
62/*
63 * Completion tracking for worker threads.
64 */
65static DECLARE_COMPLETION(test_all_done_comp);
66static atomic_t test_n_undone = ATOMIC_INIT(0);
67
68static inline void
69test_report_one_done(void)
70{
71 if (atomic_dec_and_test(&test_n_undone))
72 complete(&test_all_done_comp);
73}
74
75static int random_size_align_alloc_test(void)
76{
77 unsigned long size, align, rnd;
78 void *ptr;
79 int i;
80
81 for (i = 0; i < test_loop_count; i++) {
82 get_random_bytes(&rnd, sizeof(rnd));
83
84 /*
85 * Maximum 1024 pages, if PAGE_SIZE is 4096.
86 */
87 align = 1 << (rnd % 23);
88
89 /*
90 * Maximum 10 pages.
91 */
92 size = ((rnd % 10) + 1) * PAGE_SIZE;
93
94 ptr = __vmalloc_node_range(size, align,
95 VMALLOC_START, VMALLOC_END,
96 GFP_KERNEL | __GFP_ZERO,
97 PAGE_KERNEL,
98 0, 0, __builtin_return_address(0));
99
100 if (!ptr)
101 return -1;
102
103 vfree(ptr);
104 }
105
106 return 0;
107}
108
109/*
110 * This test case is supposed to be failed.
111 */
112static int align_shift_alloc_test(void)
113{
114 unsigned long align;
115 void *ptr;
116 int i;
117
118 for (i = 0; i < BITS_PER_LONG; i++) {
119 align = ((unsigned long) 1) << i;
120
121 ptr = __vmalloc_node_range(PAGE_SIZE, align,
122 VMALLOC_START, VMALLOC_END,
123 GFP_KERNEL | __GFP_ZERO,
124 PAGE_KERNEL,
125 0, 0, __builtin_return_address(0));
126
127 if (!ptr)
128 return -1;
129
130 vfree(ptr);
131 }
132
133 return 0;
134}
135
136static int fix_align_alloc_test(void)
137{
138 void *ptr;
139 int i;
140
141 for (i = 0; i < test_loop_count; i++) {
142 ptr = __vmalloc_node_range(5 * PAGE_SIZE,
143 THREAD_ALIGN << 1,
144 VMALLOC_START, VMALLOC_END,
145 GFP_KERNEL | __GFP_ZERO,
146 PAGE_KERNEL,
147 0, 0, __builtin_return_address(0));
148
149 if (!ptr)
150 return -1;
151
152 vfree(ptr);
153 }
154
155 return 0;
156}
157
158static int random_size_alloc_test(void)
159{
160 unsigned int n;
161 void *p;
162 int i;
163
164 for (i = 0; i < test_loop_count; i++) {
165 get_random_bytes(&n, sizeof(i));
166 n = (n % 100) + 1;
167
168 p = vmalloc(n * PAGE_SIZE);
169
170 if (!p)
171 return -1;
172
173 *((__u8 *)p) = 1;
174 vfree(p);
175 }
176
177 return 0;
178}
179
180static int long_busy_list_alloc_test(void)
181{
182 void *ptr_1, *ptr_2;
183 void **ptr;
184 int rv = -1;
185 int i;
186
187 ptr = vmalloc(sizeof(void *) * 15000);
188 if (!ptr)
189 return rv;
190
191 for (i = 0; i < 15000; i++)
192 ptr[i] = vmalloc(1 * PAGE_SIZE);
193
194 for (i = 0; i < test_loop_count; i++) {
195 ptr_1 = vmalloc(100 * PAGE_SIZE);
196 if (!ptr_1)
197 goto leave;
198
199 ptr_2 = vmalloc(1 * PAGE_SIZE);
200 if (!ptr_2) {
201 vfree(ptr_1);
202 goto leave;
203 }
204
205 *((__u8 *)ptr_1) = 0;
206 *((__u8 *)ptr_2) = 1;
207
208 vfree(ptr_1);
209 vfree(ptr_2);
210 }
211
212 /* Success */
213 rv = 0;
214
215leave:
216 for (i = 0; i < 15000; i++)
217 vfree(ptr[i]);
218
219 vfree(ptr);
220 return rv;
221}
222
223static int full_fit_alloc_test(void)
224{
225 void **ptr, **junk_ptr, *tmp;
226 int junk_length;
227 int rv = -1;
228 int i;
229
230 junk_length = fls(num_online_cpus());
231 junk_length *= (32 * 1024 * 1024 / PAGE_SIZE);
232
233 ptr = vmalloc(sizeof(void *) * junk_length);
234 if (!ptr)
235 return rv;
236
237 junk_ptr = vmalloc(sizeof(void *) * junk_length);
238 if (!junk_ptr) {
239 vfree(ptr);
240 return rv;
241 }
242
243 for (i = 0; i < junk_length; i++) {
244 ptr[i] = vmalloc(1 * PAGE_SIZE);
245 junk_ptr[i] = vmalloc(1 * PAGE_SIZE);
246 }
247
248 for (i = 0; i < junk_length; i++)
249 vfree(junk_ptr[i]);
250
251 for (i = 0; i < test_loop_count; i++) {
252 tmp = vmalloc(1 * PAGE_SIZE);
253
254 if (!tmp)
255 goto error;
256
257 *((__u8 *)tmp) = 1;
258 vfree(tmp);
259 }
260
261 /* Success */
262 rv = 0;
263
264error:
265 for (i = 0; i < junk_length; i++)
266 vfree(ptr[i]);
267
268 vfree(ptr);
269 vfree(junk_ptr);
270
271 return rv;
272}
273
274static int fix_size_alloc_test(void)
275{
276 void *ptr;
277 int i;
278
279 for (i = 0; i < test_loop_count; i++) {
280 ptr = vmalloc(3 * PAGE_SIZE);
281
282 if (!ptr)
283 return -1;
284
285 *((__u8 *)ptr) = 0;
286
287 vfree(ptr);
288 }
289
290 return 0;
291}
292
293static int
294pcpu_alloc_test(void)
295{
296 int rv = 0;
297#ifndef CONFIG_NEED_PER_CPU_KM
298 void __percpu **pcpu;
299 size_t size, align;
300 int i;
301
302 pcpu = vmalloc(sizeof(void __percpu *) * 35000);
303 if (!pcpu)
304 return -1;
305
306 for (i = 0; i < 35000; i++) {
307 unsigned int r;
308
309 get_random_bytes(&r, sizeof(i));
310 size = (r % (PAGE_SIZE / 4)) + 1;
311
312 /*
313 * Maximum PAGE_SIZE
314 */
315 get_random_bytes(&r, sizeof(i));
316 align = 1 << ((i % 11) + 1);
317
318 pcpu[i] = __alloc_percpu(size, align);
319 if (!pcpu[i])
320 rv = -1;
321 }
322
323 for (i = 0; i < 35000; i++)
324 free_percpu(pcpu[i]);
325
326 vfree(pcpu);
327#endif
328 return rv;
329}
330
331struct test_case_desc {
332 const char *test_name;
333 int (*test_func)(void);
334};
335
336static struct test_case_desc test_case_array[] = {
337 { "fix_size_alloc_test", fix_size_alloc_test },
338 { "full_fit_alloc_test", full_fit_alloc_test },
339 { "long_busy_list_alloc_test", long_busy_list_alloc_test },
340 { "random_size_alloc_test", random_size_alloc_test },
341 { "fix_align_alloc_test", fix_align_alloc_test },
342 { "random_size_align_alloc_test", random_size_align_alloc_test },
343 { "align_shift_alloc_test", align_shift_alloc_test },
344 { "pcpu_alloc_test", pcpu_alloc_test },
345 /* Add a new test case here. */
346};
347
348struct test_case_data {
349 int test_failed;
350 int test_passed;
351 u64 time;
352};
353
354/* Split it to get rid of: WARNING: line over 80 characters */
355static struct test_case_data
356 per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)];
357
358static struct test_driver {
359 struct task_struct *task;
360 unsigned long start;
361 unsigned long stop;
362 int cpu;
363} per_cpu_test_driver[NR_CPUS];
364
365static void shuffle_array(int *arr, int n)
366{
367 unsigned int rnd;
368 int i, j, x;
369
370 for (i = n - 1; i > 0; i--) {
371 get_random_bytes(&rnd, sizeof(rnd));
372
373 /* Cut the range. */
374 j = rnd % i;
375
376 /* Swap indexes. */
377 x = arr[i];
378 arr[i] = arr[j];
379 arr[j] = x;
380 }
381}
382
383static int test_func(void *private)
384{
385 struct test_driver *t = private;
386 cpumask_t newmask = CPU_MASK_NONE;
387 int random_array[ARRAY_SIZE(test_case_array)];
388 int index, i, j, ret;
389 ktime_t kt;
390 u64 delta;
391
392 cpumask_set_cpu(t->cpu, &newmask);
393 set_cpus_allowed_ptr(current, &newmask);
394
395 for (i = 0; i < ARRAY_SIZE(test_case_array); i++)
396 random_array[i] = i;
397
398 if (!sequential_test_order)
399 shuffle_array(random_array, ARRAY_SIZE(test_case_array));
400
401 /*
402 * Block until initialization is done.
403 */
404 down_read(&prepare_for_test_rwsem);
405
406 t->start = get_cycles();
407 for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
408 index = random_array[i];
409
410 /*
411 * Skip tests if run_test_mask has been specified.
412 */
413 if (!((run_test_mask & (1 << index)) >> index))
414 continue;
415
416 kt = ktime_get();
417 for (j = 0; j < test_repeat_count; j++) {
418 ret = test_case_array[index].test_func();
419 if (!ret)
420 per_cpu_test_data[t->cpu][index].test_passed++;
421 else
422 per_cpu_test_data[t->cpu][index].test_failed++;
423 }
424
425 /*
426 * Take an average time that test took.
427 */
428 delta = (u64) ktime_us_delta(ktime_get(), kt);
429 do_div(delta, (u32) test_repeat_count);
430
431 per_cpu_test_data[t->cpu][index].time = delta;
432 }
433 t->stop = get_cycles();
434
435 up_read(&prepare_for_test_rwsem);
436 test_report_one_done();
437
438 /*
439 * Wait for the kthread_stop() call.
440 */
441 while (!kthread_should_stop())
442 msleep(10);
443
444 return 0;
445}
446
447static void
448init_test_configurtion(void)
449{
450 /*
451 * Reset all data of all CPUs.
452 */
453 memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data));
454
455 if (single_cpu_test)
456 cpumask_set_cpu(cpumask_first(cpu_online_mask),
457 &cpus_run_test_mask);
458 else
459 cpumask_and(&cpus_run_test_mask, cpu_online_mask,
460 cpu_online_mask);
461
462 if (test_repeat_count <= 0)
463 test_repeat_count = 1;
464
465 if (test_loop_count <= 0)
466 test_loop_count = 1;
467}
468
469static void do_concurrent_test(void)
470{
471 int cpu, ret;
472
473 /*
474 * Set some basic configurations plus sanity check.
475 */
476 init_test_configurtion();
477
478 /*
479 * Put on hold all workers.
480 */
481 down_write(&prepare_for_test_rwsem);
482
483 for_each_cpu(cpu, &cpus_run_test_mask) {
484 struct test_driver *t = &per_cpu_test_driver[cpu];
485
486 t->cpu = cpu;
487 t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu);
488
489 if (!IS_ERR(t->task))
490 /* Success. */
491 atomic_inc(&test_n_undone);
492 else
493 pr_err("Failed to start kthread for %d CPU\n", cpu);
494 }
495
496 /*
497 * Now let the workers do their job.
498 */
499 up_write(&prepare_for_test_rwsem);
500
501 /*
502 * Sleep quiet until all workers are done with 1 second
503 * interval. Since the test can take a lot of time we
504 * can run into a stack trace of the hung task. That is
505 * why we go with completion_timeout and HZ value.
506 */
507 do {
508 ret = wait_for_completion_timeout(&test_all_done_comp, HZ);
509 } while (!ret);
510
511 for_each_cpu(cpu, &cpus_run_test_mask) {
512 struct test_driver *t = &per_cpu_test_driver[cpu];
513 int i;
514
515 if (!IS_ERR(t->task))
516 kthread_stop(t->task);
517
518 for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
519 if (!((run_test_mask & (1 << i)) >> i))
520 continue;
521
522 pr_info(
523 "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n",
524 test_case_array[i].test_name,
525 per_cpu_test_data[cpu][i].test_passed,
526 per_cpu_test_data[cpu][i].test_failed,
527 test_repeat_count, test_loop_count,
528 per_cpu_test_data[cpu][i].time);
529 }
530
531 pr_info("All test took CPU%d=%lu cycles\n",
532 cpu, t->stop - t->start);
533 }
534}
535
536static int vmalloc_test_init(void)
537{
538 do_concurrent_test();
539 return -EAGAIN; /* Fail will directly unload the module */
540}
541
542static void vmalloc_test_exit(void)
543{
544}
545
546module_init(vmalloc_test_init)
547module_exit(vmalloc_test_exit)
548
549MODULE_LICENSE("GPL");
550MODULE_AUTHOR("Uladzislau Rezki");
551MODULE_DESCRIPTION("vmalloc test module");
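[Editor's note] The test table in test_vmalloc.c is meant to be easy to extend: a new case is one function, one test_case_array[] entry, and one more id in the run_test_mask help text. Everything below is a hypothetical addition shown only to illustrate that pattern:

	/* Hypothetical new case, which would become selectable as id 256. */
	static int tiny_alloc_test(void)
	{
		void *ptr;
		int i;

		for (i = 0; i < test_loop_count; i++) {
			ptr = vmalloc(PAGE_SIZE);
			if (!ptr)
				return -1;

			*((__u8 *)ptr) = 1;
			vfree(ptr);
		}

		return 0;
	}

	/* ... and in test_case_array[]: { "tiny_alloc_test", tiny_alloc_test }, */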
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 9a7b8b049d04..e3df921208c0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -39,6 +39,23 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
39 Enable debug page memory allocations by default? This value 39 Enable debug page memory allocations by default? This value
40 can be overridden by debug_pagealloc=off|on. 40 can be overridden by debug_pagealloc=off|on.
41 41
42config PAGE_OWNER
43 bool "Track page owner"
44 depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
45 select DEBUG_FS
46 select STACKTRACE
47 select STACKDEPOT
48 select PAGE_EXTENSION
49 help
50 This keeps track of what call chain is the owner of a page, may
51 help to find bare alloc_page(s) leaks. Even if you include this
52 feature on your build, it is disabled in default. You should pass
53 "page_owner=on" to boot parameter in order to enable it. Eats
54 a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
55 for user-space helper.
56
57 If unsure, say N.
58
42config PAGE_POISONING 59config PAGE_POISONING
43 bool "Poison pages after freeing" 60 bool "Poison pages after freeing"
44 select PAGE_POISONING_NO_SANITY if HIBERNATION 61 select PAGE_POISONING_NO_SANITY if HIBERNATION
diff --git a/mm/cma.c b/mm/cma.c
index c7b39dd3b4f6..f4f3a8a57d86 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -353,12 +353,14 @@ int __init cma_declare_contiguous(phys_addr_t base,
353 353
354 ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); 354 ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
355 if (ret) 355 if (ret)
356 goto err; 356 goto free_mem;
357 357
358 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, 358 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
359 &base); 359 &base);
360 return 0; 360 return 0;
361 361
362free_mem:
363 memblock_free(base, size);
362err: 364err:
363 pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); 365 pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
364 return ret; 366 return ret;
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index ad6723e9d110..8d7b2fd52225 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -21,8 +21,6 @@ struct cma_mem {
21 unsigned long n; 21 unsigned long n;
22}; 22};
23 23
24static struct dentry *cma_debugfs_root;
25
26static int cma_debugfs_get(void *data, u64 *val) 24static int cma_debugfs_get(void *data, u64 *val)
27{ 25{
28 unsigned long *p = data; 26 unsigned long *p = data;
@@ -162,7 +160,7 @@ static int cma_alloc_write(void *data, u64 val)
162} 160}
163DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); 161DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
164 162
165static void cma_debugfs_add_one(struct cma *cma, int idx) 163static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
166{ 164{
167 struct dentry *tmp; 165 struct dentry *tmp;
168 char name[16]; 166 char name[16];
@@ -170,7 +168,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
170 168
171 scnprintf(name, sizeof(name), "cma-%s", cma->name); 169 scnprintf(name, sizeof(name), "cma-%s", cma->name);
172 170
173 tmp = debugfs_create_dir(name, cma_debugfs_root); 171 tmp = debugfs_create_dir(name, root_dentry);
174 172
175 debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); 173 debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
176 debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); 174 debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
@@ -188,14 +186,13 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
188 186
189static int __init cma_debugfs_init(void) 187static int __init cma_debugfs_init(void)
190{ 188{
189 struct dentry *cma_debugfs_root;
191 int i; 190 int i;
192 191
193 cma_debugfs_root = debugfs_create_dir("cma", NULL); 192 cma_debugfs_root = debugfs_create_dir("cma", NULL);
194 if (!cma_debugfs_root)
195 return -ENOMEM;
196 193
197 for (i = 0; i < cma_area_count; i++) 194 for (i = 0; i < cma_area_count; i++)
198 cma_debugfs_add_one(&cma_areas[i], i); 195 cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root);
199 196
200 return 0; 197 return 0;
201} 198}
diff --git a/mm/compaction.c b/mm/compaction.c
index ef29490b0f46..f171a83707ce 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -66,7 +66,7 @@ static unsigned long release_freepages(struct list_head *freelist)
66 return high_pfn; 66 return high_pfn;
67} 67}
68 68
69static void map_pages(struct list_head *list) 69static void split_map_pages(struct list_head *list)
70{ 70{
71 unsigned int i, order, nr_pages; 71 unsigned int i, order, nr_pages;
72 struct page *page, *next; 72 struct page *page, *next;
@@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page)
237 return false; 237 return false;
238} 238}
239 239
240static bool
241__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
242 bool check_target)
243{
244 struct page *page = pfn_to_online_page(pfn);
245 struct page *end_page;
246 unsigned long block_pfn;
247
248 if (!page)
249 return false;
250 if (zone != page_zone(page))
251 return false;
252 if (pageblock_skip_persistent(page))
253 return false;
254
255 /*
256 * If skip is already cleared do no further checking once the
257 * restart points have been set.
258 */
259 if (check_source && check_target && !get_pageblock_skip(page))
260 return true;
261
262 /*
263 * If clearing skip for the target scanner, do not select a
264 * non-movable pageblock as the starting point.
265 */
266 if (!check_source && check_target &&
267 get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
268 return false;
269
270 /*
271 * Only clear the hint if a sample indicates there is either a
272 * free page or an LRU page in the block. One or other condition
273 * is necessary for the block to be a migration source/target.
274 */
275 block_pfn = pageblock_start_pfn(pfn);
276 pfn = max(block_pfn, zone->zone_start_pfn);
277 page = pfn_to_page(pfn);
278 if (zone != page_zone(page))
279 return false;
280 pfn = block_pfn + pageblock_nr_pages;
281 pfn = min(pfn, zone_end_pfn(zone));
282 end_page = pfn_to_page(pfn);
283
284 do {
285 if (pfn_valid_within(pfn)) {
286 if (check_source && PageLRU(page)) {
287 clear_pageblock_skip(page);
288 return true;
289 }
290
291 if (check_target && PageBuddy(page)) {
292 clear_pageblock_skip(page);
293 return true;
294 }
295 }
296
297 page += (1 << PAGE_ALLOC_COSTLY_ORDER);
298 pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
299 } while (page < end_page);
300
301 return false;
302}
303
240/* 304/*
241 * This function is called to clear all cached information on pageblocks that 305 * This function is called to clear all cached information on pageblocks that
242 * should be skipped for page isolation when the migrate and free page scanner 306 * should be skipped for page isolation when the migrate and free page scanner
@@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page)
244 */ 308 */
245static void __reset_isolation_suitable(struct zone *zone) 309static void __reset_isolation_suitable(struct zone *zone)
246{ 310{
247 unsigned long start_pfn = zone->zone_start_pfn; 311 unsigned long migrate_pfn = zone->zone_start_pfn;
248 unsigned long end_pfn = zone_end_pfn(zone); 312 unsigned long free_pfn = zone_end_pfn(zone);
249 unsigned long pfn; 313 unsigned long reset_migrate = free_pfn;
314 unsigned long reset_free = migrate_pfn;
315 bool source_set = false;
316 bool free_set = false;
317
318 if (!zone->compact_blockskip_flush)
319 return;
250 320
251 zone->compact_blockskip_flush = false; 321 zone->compact_blockskip_flush = false;
252 322
253 /* Walk the zone and mark every pageblock as suitable for isolation */ 323 /*
254 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 324 * Walk the zone and update pageblock skip information. Source looks
255 struct page *page; 325 * for PageLRU while target looks for PageBuddy. When the scanner
256 326 * is found, both PageBuddy and PageLRU are checked as the pageblock
327 * is suitable as both source and target.
328 */
329 for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
330 free_pfn -= pageblock_nr_pages) {
257 cond_resched(); 331 cond_resched();
258 332
259 page = pfn_to_online_page(pfn); 333 /* Update the migrate PFN */
260 if (!page) 334 if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
261 continue; 335 migrate_pfn < reset_migrate) {
262 if (zone != page_zone(page)) 336 source_set = true;
263 continue; 337 reset_migrate = migrate_pfn;
264 if (pageblock_skip_persistent(page)) 338 zone->compact_init_migrate_pfn = reset_migrate;
265 continue; 339 zone->compact_cached_migrate_pfn[0] = reset_migrate;
340 zone->compact_cached_migrate_pfn[1] = reset_migrate;
341 }
266 342
267 clear_pageblock_skip(page); 343 /* Update the free PFN */
344 if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
345 free_pfn > reset_free) {
346 free_set = true;
347 reset_free = free_pfn;
348 zone->compact_init_free_pfn = reset_free;
349 zone->compact_cached_free_pfn = reset_free;
350 }
268 } 351 }
269 352
270 reset_cached_positions(zone); 353 /* Leave no distance if no suitable block was reset */
354 if (reset_migrate >= reset_free) {
355 zone->compact_cached_migrate_pfn[0] = migrate_pfn;
356 zone->compact_cached_migrate_pfn[1] = migrate_pfn;
357 zone->compact_cached_free_pfn = free_pfn;
358 }
271} 359}
272 360
273void reset_isolation_suitable(pg_data_t *pgdat) 361void reset_isolation_suitable(pg_data_t *pgdat)
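[Editor's note] The rewritten reset walk above runs two cursors toward each other: the migrate side climbs from the zone start while the free side descends from the zone end, and each records the first suitable restart block it finds. A stripped-down sketch of that convergence with the per-block checks abstracted into callbacks; illustration only, without the skip-bit bookkeeping:

	static void converge(unsigned long start, unsigned long end, unsigned long step,
			     bool (*source_ok)(unsigned long),
			     bool (*target_ok)(unsigned long),
			     unsigned long *reset_migrate, unsigned long *reset_free)
	{
		unsigned long lo = start, hi = end;
		bool source_set = false, free_set = false;

		*reset_migrate = end;	/* "no suitable block" defaults */
		*reset_free = start;

		for (; lo < hi; lo += step, hi -= step) {
			if (!source_set && source_ok(lo)) {
				source_set = true;
				*reset_migrate = lo;
			}
			if (!free_set && target_ok(hi)) {
				free_set = true;
				*reset_free = hi;
			}
		}
	}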
@@ -286,15 +374,53 @@ void reset_isolation_suitable(pg_data_t *pgdat)
286} 374}
287 375
288/* 376/*
377 * Sets the pageblock skip bit if it was clear. Note that this is a hint as
378 * locks are not required for read/writers. Returns true if it was already set.
379 */
380static bool test_and_set_skip(struct compact_control *cc, struct page *page,
381 unsigned long pfn)
382{
383 bool skip;
384
385 /* Do no update if skip hint is being ignored */
386 if (cc->ignore_skip_hint)
387 return false;
388
389 if (!IS_ALIGNED(pfn, pageblock_nr_pages))
390 return false;
391
392 skip = get_pageblock_skip(page);
393 if (!skip && !cc->no_set_skip_hint)
394 set_pageblock_skip(page);
395
396 return skip;
397}
398
399static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
400{
401 struct zone *zone = cc->zone;
402
403 pfn = pageblock_end_pfn(pfn);
404
405 /* Set for isolation rather than compaction */
406 if (cc->no_set_skip_hint)
407 return;
408
409 if (pfn > zone->compact_cached_migrate_pfn[0])
410 zone->compact_cached_migrate_pfn[0] = pfn;
411 if (cc->mode != MIGRATE_ASYNC &&
412 pfn > zone->compact_cached_migrate_pfn[1])
413 zone->compact_cached_migrate_pfn[1] = pfn;
414}
415
416/*
289 * If no pages were isolated then mark this pageblock to be skipped in the 417 * If no pages were isolated then mark this pageblock to be skipped in the
290 * future. The information is later cleared by __reset_isolation_suitable(). 418 * future. The information is later cleared by __reset_isolation_suitable().
291 */ 419 */
292static void update_pageblock_skip(struct compact_control *cc, 420static void update_pageblock_skip(struct compact_control *cc,
293 struct page *page, unsigned long nr_isolated, 421 struct page *page, unsigned long pfn)
294 bool migrate_scanner)
295{ 422{
296 struct zone *zone = cc->zone; 423 struct zone *zone = cc->zone;
297 unsigned long pfn;
298 424
299 if (cc->no_set_skip_hint) 425 if (cc->no_set_skip_hint)
300 return; 426 return;
@@ -302,24 +428,11 @@ static void update_pageblock_skip(struct compact_control *cc,
302 if (!page) 428 if (!page)
303 return; 429 return;
304 430
305 if (nr_isolated)
306 return;
307
308 set_pageblock_skip(page); 431 set_pageblock_skip(page);
309 432
310 pfn = page_to_pfn(page);
311
312 /* Update where async and sync compaction should restart */ 433 /* Update where async and sync compaction should restart */
313 if (migrate_scanner) { 434 if (pfn < zone->compact_cached_free_pfn)
314 if (pfn > zone->compact_cached_migrate_pfn[0]) 435 zone->compact_cached_free_pfn = pfn;
315 zone->compact_cached_migrate_pfn[0] = pfn;
316 if (cc->mode != MIGRATE_ASYNC &&
317 pfn > zone->compact_cached_migrate_pfn[1])
318 zone->compact_cached_migrate_pfn[1] = pfn;
319 } else {
320 if (pfn < zone->compact_cached_free_pfn)
321 zone->compact_cached_free_pfn = pfn;
322 }
323} 436}
324#else 437#else
325static inline bool isolation_suitable(struct compact_control *cc, 438static inline bool isolation_suitable(struct compact_control *cc,
@@ -334,32 +447,42 @@ static inline bool pageblock_skip_persistent(struct page *page)
334} 447}
335 448
336static inline void update_pageblock_skip(struct compact_control *cc, 449static inline void update_pageblock_skip(struct compact_control *cc,
337 struct page *page, unsigned long nr_isolated, 450 struct page *page, unsigned long pfn)
338 bool migrate_scanner) 451{
452}
453
454static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
455{
456}
457
458static bool test_and_set_skip(struct compact_control *cc, struct page *page,
459 unsigned long pfn)
339{ 460{
461 return false;
340} 462}
341#endif /* CONFIG_COMPACTION */ 463#endif /* CONFIG_COMPACTION */
342 464
343/* 465/*
344 * Compaction requires the taking of some coarse locks that are potentially 466 * Compaction requires the taking of some coarse locks that are potentially
345 * very heavily contended. For async compaction, back out if the lock cannot 467 * very heavily contended. For async compaction, trylock and record if the
346 * be taken immediately. For sync compaction, spin on the lock if needed. 468 * lock is contended. The lock will still be acquired but compaction will
469 * abort when the current block is finished regardless of success rate.
470 * Sync compaction acquires the lock.
347 * 471 *
348 * Returns true if the lock is held 472 * Always returns true which makes it easier to track lock state in callers.
349 * Returns false if the lock is not held and compaction should abort
350 */ 473 */
351static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, 474static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
352 struct compact_control *cc) 475 struct compact_control *cc)
353{ 476{
354 if (cc->mode == MIGRATE_ASYNC) { 477 /* Track if the lock is contended in async mode */
355 if (!spin_trylock_irqsave(lock, *flags)) { 478 if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
356 cc->contended = true; 479 if (spin_trylock_irqsave(lock, *flags))
357 return false; 480 return true;
358 } 481
359 } else { 482 cc->contended = true;
360 spin_lock_irqsave(lock, *flags);
361 } 483 }
362 484
485 spin_lock_irqsave(lock, *flags);
363 return true; 486 return true;
364} 487}
365 488
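The reworked compact_lock_irqsave() above no longer bails out for async compaction: it tries the lock once, records the contention in cc->contended, and then takes the lock anyway so the current pageblock can still be finished. A rough userspace model of that pattern, with a pthread mutex standing in for the zone/LRU spinlock, might look like:

#include <pthread.h>
#include <stdbool.h>

struct ctl_model {
	bool async;
	bool contended;
};

static bool lock_and_record(pthread_mutex_t *lock, struct ctl_model *cc)
{
	if (cc->async && !cc->contended) {
		if (pthread_mutex_trylock(lock) == 0)
			return true;		/* acquired without contention */
		cc->contended = true;		/* note it, but do not bail out */
	}
	pthread_mutex_lock(lock);		/* callers always end up holding it */
	return true;
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct ctl_model cc = { .async = true, .contended = false };

	lock_and_record(&lock, &cc);
	pthread_mutex_unlock(&lock);
	return cc.contended;			/* 0 here: the lock was uncontended */
}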
@@ -391,37 +514,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
391 return true; 514 return true;
392 } 515 }
393 516
394 if (need_resched()) { 517 cond_resched();
395 if (cc->mode == MIGRATE_ASYNC) {
396 cc->contended = true;
397 return true;
398 }
399 cond_resched();
400 }
401
402 return false;
403}
404
405/*
406 * Aside from avoiding lock contention, compaction also periodically checks
407 * need_resched() and either schedules in sync compaction or aborts async
408 * compaction. This is similar to what compact_unlock_should_abort() does, but
409 * is used where no lock is concerned.
410 *
411 * Returns false when no scheduling was needed, or sync compaction scheduled.
412 * Returns true when async compaction should abort.
413 */
414static inline bool compact_should_abort(struct compact_control *cc)
415{
416 /* async compaction aborts if contended */
417 if (need_resched()) {
418 if (cc->mode == MIGRATE_ASYNC) {
419 cc->contended = true;
420 return true;
421 }
422
423 cond_resched();
424 }
425 518
426 return false; 519 return false;
427} 520}
@@ -435,19 +528,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
435 unsigned long *start_pfn, 528 unsigned long *start_pfn,
436 unsigned long end_pfn, 529 unsigned long end_pfn,
437 struct list_head *freelist, 530 struct list_head *freelist,
531 unsigned int stride,
438 bool strict) 532 bool strict)
439{ 533{
440 int nr_scanned = 0, total_isolated = 0; 534 int nr_scanned = 0, total_isolated = 0;
441 struct page *cursor, *valid_page = NULL; 535 struct page *cursor;
442 unsigned long flags = 0; 536 unsigned long flags = 0;
443 bool locked = false; 537 bool locked = false;
444 unsigned long blockpfn = *start_pfn; 538 unsigned long blockpfn = *start_pfn;
445 unsigned int order; 539 unsigned int order;
446 540
541 /* Strict mode is for isolation, speed is secondary */
542 if (strict)
543 stride = 1;
544
447 cursor = pfn_to_page(blockpfn); 545 cursor = pfn_to_page(blockpfn);
448 546
449 /* Isolate free pages. */ 547 /* Isolate free pages. */
450 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 548 for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
451 int isolated; 549 int isolated;
452 struct page *page = cursor; 550 struct page *page = cursor;
453 551
@@ -465,9 +563,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
465 if (!pfn_valid_within(blockpfn)) 563 if (!pfn_valid_within(blockpfn))
466 goto isolate_fail; 564 goto isolate_fail;
467 565
468 if (!valid_page)
469 valid_page = page;
470
471 /* 566 /*
472 * For compound pages such as THP and hugetlbfs, we can save 567 * For compound pages such as THP and hugetlbfs, we can save
473 * potentially a lot of iterations if we skip them at once. 568 * potentially a lot of iterations if we skip them at once.
@@ -495,18 +590,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
495 * recheck as well. 590 * recheck as well.
496 */ 591 */
497 if (!locked) { 592 if (!locked) {
498 /* 593 locked = compact_lock_irqsave(&cc->zone->lock,
499 * The zone lock must be held to isolate freepages.
500 * Unfortunately this is a very coarse lock and can be
501 * heavily contended if there are parallel allocations
502 * or parallel compactions. For async compaction do not
503 * spin on the lock and we acquire the lock as late as
504 * possible.
505 */
506 locked = compact_trylock_irqsave(&cc->zone->lock,
507 &flags, cc); 594 &flags, cc);
508 if (!locked)
509 break;
510 595
511 /* Recheck this is a buddy page under lock */ 596 /* Recheck this is a buddy page under lock */
512 if (!PageBuddy(page)) 597 if (!PageBuddy(page))
@@ -565,10 +650,6 @@ isolate_fail:
565 if (strict && blockpfn < end_pfn) 650 if (strict && blockpfn < end_pfn)
566 total_isolated = 0; 651 total_isolated = 0;
567 652
568 /* Update the pageblock-skip if the whole pageblock was scanned */
569 if (blockpfn == end_pfn)
570 update_pageblock_skip(cc, valid_page, total_isolated, false);
571
572 cc->total_free_scanned += nr_scanned; 653 cc->total_free_scanned += nr_scanned;
573 if (total_isolated) 654 if (total_isolated)
574 count_compact_events(COMPACTISOLATED, total_isolated); 655 count_compact_events(COMPACTISOLATED, total_isolated);
@@ -626,7 +707,7 @@ isolate_freepages_range(struct compact_control *cc,
626 break; 707 break;
627 708
628 isolated = isolate_freepages_block(cc, &isolate_start_pfn, 709 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
629 block_end_pfn, &freelist, true); 710 block_end_pfn, &freelist, 0, true);
630 711
631 /* 712 /*
632 * In strict mode, isolate_freepages_block() returns 0 if 713 * In strict mode, isolate_freepages_block() returns 0 if
@@ -644,7 +725,7 @@ isolate_freepages_range(struct compact_control *cc,
644 } 725 }
645 726
646 /* __isolate_free_page() does not map the pages */ 727 /* __isolate_free_page() does not map the pages */
647 map_pages(&freelist); 728 split_map_pages(&freelist);
648 729
649 if (pfn < end_pfn) { 730 if (pfn < end_pfn) {
650 /* Loop terminated early, cleanup. */ 731 /* Loop terminated early, cleanup. */
@@ -657,16 +738,16 @@ isolate_freepages_range(struct compact_control *cc,
657} 738}
658 739
659/* Similar to reclaim, but different enough that they don't share logic */ 740/* Similar to reclaim, but different enough that they don't share logic */
660static bool too_many_isolated(struct zone *zone) 741static bool too_many_isolated(pg_data_t *pgdat)
661{ 742{
662 unsigned long active, inactive, isolated; 743 unsigned long active, inactive, isolated;
663 744
664 inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + 745 inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
665 node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); 746 node_page_state(pgdat, NR_INACTIVE_ANON);
666 active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + 747 active = node_page_state(pgdat, NR_ACTIVE_FILE) +
667 node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); 748 node_page_state(pgdat, NR_ACTIVE_ANON);
668 isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + 749 isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
669 node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); 750 node_page_state(pgdat, NR_ISOLATED_ANON);
670 751
671 return isolated > (inactive + active) / 2; 752 return isolated > (inactive + active) / 2;
672} 753}
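The hunk above only changes too_many_isolated() to take the pgdat directly; the throttle itself is unchanged: compaction backs off while the pages already isolated from the node's LRU lists exceed half of what remains on the active plus inactive lists. A tiny standalone illustration with made-up counts:

#include <stdbool.h>
#include <stdio.h>

static bool too_many_isolated_model(unsigned long active, unsigned long inactive,
				    unsigned long isolated)
{
	return isolated > (inactive + active) / 2;
}

int main(void)
{
	printf("%d\n", too_many_isolated_model(4000, 2000, 1000));	/* 0: keep going */
	printf("%d\n", too_many_isolated_model(4000, 2000, 3500));	/* 1: throttle */
	return 0;
}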
@@ -693,7 +774,7 @@ static unsigned long
693isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, 774isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
694 unsigned long end_pfn, isolate_mode_t isolate_mode) 775 unsigned long end_pfn, isolate_mode_t isolate_mode)
695{ 776{
696 struct zone *zone = cc->zone; 777 pg_data_t *pgdat = cc->zone->zone_pgdat;
697 unsigned long nr_scanned = 0, nr_isolated = 0; 778 unsigned long nr_scanned = 0, nr_isolated = 0;
698 struct lruvec *lruvec; 779 struct lruvec *lruvec;
699 unsigned long flags = 0; 780 unsigned long flags = 0;
@@ -702,13 +783,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
702 unsigned long start_pfn = low_pfn; 783 unsigned long start_pfn = low_pfn;
703 bool skip_on_failure = false; 784 bool skip_on_failure = false;
704 unsigned long next_skip_pfn = 0; 785 unsigned long next_skip_pfn = 0;
786 bool skip_updated = false;
705 787
706 /* 788 /*
707 * Ensure that there are not too many pages isolated from the LRU 789 * Ensure that there are not too many pages isolated from the LRU
708 * list by either parallel reclaimers or compaction. If there are, 790 * list by either parallel reclaimers or compaction. If there are,
709 * delay for some time until fewer pages are isolated 791 * delay for some time until fewer pages are isolated
710 */ 792 */
711 while (unlikely(too_many_isolated(zone))) { 793 while (unlikely(too_many_isolated(pgdat))) {
712 /* async migration should just abort */ 794 /* async migration should just abort */
713 if (cc->mode == MIGRATE_ASYNC) 795 if (cc->mode == MIGRATE_ASYNC)
714 return 0; 796 return 0;
@@ -719,8 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
719 return 0; 801 return 0;
720 } 802 }
721 803
722 if (compact_should_abort(cc)) 804 cond_resched();
723 return 0;
724 805
725 if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { 806 if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
726 skip_on_failure = true; 807 skip_on_failure = true;
@@ -758,8 +839,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
758 * if contended. 839 * if contended.
759 */ 840 */
760 if (!(low_pfn % SWAP_CLUSTER_MAX) 841 if (!(low_pfn % SWAP_CLUSTER_MAX)
761 && compact_unlock_should_abort(zone_lru_lock(zone), flags, 842 && compact_unlock_should_abort(&pgdat->lru_lock,
762 &locked, cc)) 843 flags, &locked, cc))
763 break; 844 break;
764 845
765 if (!pfn_valid_within(low_pfn)) 846 if (!pfn_valid_within(low_pfn))
@@ -768,8 +849,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
768 849
769 page = pfn_to_page(low_pfn); 850 page = pfn_to_page(low_pfn);
770 851
771 if (!valid_page) 852 /*
853 * Check if the pageblock has already been marked skipped.
854 * Only the aligned PFN is checked as the caller isolates
855 * COMPACT_CLUSTER_MAX at a time so the second call must
856 * not falsely conclude that the block should be skipped.
857 */
858 if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
859 if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
860 low_pfn = end_pfn;
861 goto isolate_abort;
862 }
772 valid_page = page; 863 valid_page = page;
864 }
773 865
774 /* 866 /*
775 * Skip if free. We read page order here without zone lock 867 * Skip if free. We read page order here without zone lock
@@ -818,7 +910,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
818 if (unlikely(__PageMovable(page)) && 910 if (unlikely(__PageMovable(page)) &&
819 !PageIsolated(page)) { 911 !PageIsolated(page)) {
820 if (locked) { 912 if (locked) {
821 spin_unlock_irqrestore(zone_lru_lock(zone), 913 spin_unlock_irqrestore(&pgdat->lru_lock,
822 flags); 914 flags);
823 locked = false; 915 locked = false;
824 } 916 }
@@ -848,10 +940,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
848 940
849 /* If we already hold the lock, we can skip some rechecking */ 941 /* If we already hold the lock, we can skip some rechecking */
850 if (!locked) { 942 if (!locked) {
851 locked = compact_trylock_irqsave(zone_lru_lock(zone), 943 locked = compact_lock_irqsave(&pgdat->lru_lock,
852 &flags, cc); 944 &flags, cc);
853 if (!locked) 945
854 break; 946 /* Try get exclusive access under lock */
947 if (!skip_updated) {
948 skip_updated = true;
949 if (test_and_set_skip(cc, page, low_pfn))
950 goto isolate_abort;
951 }
855 952
856 /* Recheck PageLRU and PageCompound under lock */ 953 /* Recheck PageLRU and PageCompound under lock */
857 if (!PageLRU(page)) 954 if (!PageLRU(page))
@@ -868,7 +965,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
868 } 965 }
869 } 966 }
870 967
871 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 968 lruvec = mem_cgroup_page_lruvec(page, pgdat);
872 969
873 /* Try isolate the page */ 970 /* Try isolate the page */
874 if (__isolate_lru_page(page, isolate_mode) != 0) 971 if (__isolate_lru_page(page, isolate_mode) != 0)
@@ -887,16 +984,13 @@ isolate_success:
887 nr_isolated++; 984 nr_isolated++;
888 985
889 /* 986 /*
890 * Record where we could have freed pages by migration and not 987 * Avoid isolating too much unless this block is being
891 * yet flushed them to buddy allocator. 988 * rescanned (e.g. dirty/writeback pages, parallel allocation)
892 * - this is the lowest page that was isolated and likely be 989 * or a lock is contended. For contention, isolate quickly to
893 * then freed by migration. 990 * potentially remove one source of contention.
894 */ 991 */
895 if (!cc->last_migrated_pfn) 992 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
896 cc->last_migrated_pfn = low_pfn; 993 !cc->rescan && !cc->contended) {
897
898 /* Avoid isolating too much */
899 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
900 ++low_pfn; 994 ++low_pfn;
901 break; 995 break;
902 } 996 }
@@ -913,12 +1007,11 @@ isolate_fail:
913 */ 1007 */
914 if (nr_isolated) { 1008 if (nr_isolated) {
915 if (locked) { 1009 if (locked) {
916 spin_unlock_irqrestore(zone_lru_lock(zone), flags); 1010 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
917 locked = false; 1011 locked = false;
918 } 1012 }
919 putback_movable_pages(&cc->migratepages); 1013 putback_movable_pages(&cc->migratepages);
920 cc->nr_migratepages = 0; 1014 cc->nr_migratepages = 0;
921 cc->last_migrated_pfn = 0;
922 nr_isolated = 0; 1015 nr_isolated = 0;
923 } 1016 }
924 1017
@@ -939,15 +1032,23 @@ isolate_fail:
939 if (unlikely(low_pfn > end_pfn)) 1032 if (unlikely(low_pfn > end_pfn))
940 low_pfn = end_pfn; 1033 low_pfn = end_pfn;
941 1034
1035isolate_abort:
942 if (locked) 1036 if (locked)
943 spin_unlock_irqrestore(zone_lru_lock(zone), flags); 1037 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
944 1038
945 /* 1039 /*
946 * Update the pageblock-skip information and cached scanner pfn, 1040 * Update the cached scanner pfn once the pageblock has been scanned.
947 * if the whole pageblock was scanned without isolating any page. 1041 * Pages will either be migrated in which case there is no point
1042 * scanning in the near future or migration failed in which case the
1043 * failure reason may persist. The block is marked for skipping if
1044 * there were no pages isolated in the block or if the block is
1045 * rescanned twice in a row.
948 */ 1046 */
949 if (low_pfn == end_pfn) 1047 if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
950 update_pageblock_skip(cc, valid_page, nr_isolated, true); 1048 if (valid_page && !skip_updated)
1049 set_pageblock_skip(valid_page);
1050 update_cached_migrate(cc, low_pfn);
1051 }
951 1052
952 trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, 1053 trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
953 nr_scanned, nr_isolated); 1054 nr_scanned, nr_isolated);
@@ -1013,6 +1114,9 @@ static bool suitable_migration_source(struct compact_control *cc,
1013{ 1114{
1014 int block_mt; 1115 int block_mt;
1015 1116
1117 if (pageblock_skip_persistent(page))
1118 return false;
1119
1016 if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) 1120 if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
1017 return true; 1121 return true;
1018 1122
@@ -1050,6 +1154,12 @@ static bool suitable_migration_target(struct compact_control *cc,
1050 return false; 1154 return false;
1051} 1155}
1052 1156
1157static inline unsigned int
1158freelist_scan_limit(struct compact_control *cc)
1159{
1160 return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
1161}
1162
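freelist_scan_limit() caps how many free-list entries the new fast searches may examine, and the cap halves with every consecutive fast-search failure. Assuming COMPACT_CLUSTER_MAX is 32 (its usual value via SWAP_CLUSTER_MAX), the budget decays as this small program shows:

#include <stdio.h>

#define COMPACT_CLUSTER_MAX 32U	/* assumed value, illustration only */

static unsigned int freelist_scan_limit_model(unsigned int fast_search_fail)
{
	return (COMPACT_CLUSTER_MAX >> fast_search_fail) + 1;
}

int main(void)
{
	for (unsigned int fail = 0; fail <= 6; fail++)
		printf("fast_search_fail=%u -> limit=%u\n",
		       fail, freelist_scan_limit_model(fail));
	/* prints 33, 17, 9, 5, 3, 2, 1: a persistently failing search gets cheaper */
	return 0;
}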
1053/* 1163/*
1054 * Test whether the free scanner has reached the same or lower pageblock than 1164 * Test whether the free scanner has reached the same or lower pageblock than
1055 * the migration scanner, and compaction should thus terminate. 1165 * the migration scanner, and compaction should thus terminate.
@@ -1061,6 +1171,248 @@ static inline bool compact_scanners_met(struct compact_control *cc)
1061} 1171}
1062 1172
1063/* 1173/*
1174 * Used when scanning for a suitable migration target which scans freelists
1175 * in reverse. Reorders the list such as the unscanned pages are scanned
1176 * first on the next iteration of the free scanner
1177 */
1178static void
1179move_freelist_head(struct list_head *freelist, struct page *freepage)
1180{
1181 LIST_HEAD(sublist);
1182
1183 if (!list_is_last(freelist, &freepage->lru)) {
1184 list_cut_before(&sublist, freelist, &freepage->lru);
1185 if (!list_empty(&sublist))
1186 list_splice_tail(&sublist, freelist);
1187 }
1188}
1189
1190/*
1191 * Similar to move_freelist_head except used by the migration scanner
1192 * when scanning forward. It's possible for these list operations to
1193 * move against each other if they search the free list exactly in
1194 * lockstep.
1195 */
1196static void
1197move_freelist_tail(struct list_head *freelist, struct page *freepage)
1198{
1199 LIST_HEAD(sublist);
1200
1201 if (!list_is_first(freelist, &freepage->lru)) {
1202 list_cut_position(&sublist, freelist, &freepage->lru);
1203 if (!list_empty(&sublist))
1204 list_splice_tail(&sublist, freelist);
1205 }
1206}
1207
1208static void
1209fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
1210{
1211 unsigned long start_pfn, end_pfn;
1212 struct page *page = pfn_to_page(pfn);
1213
1214 /* Do not search around if there are enough pages already */
1215 if (cc->nr_freepages >= cc->nr_migratepages)
1216 return;
1217
1218 /* Minimise scanning during async compaction */
1219 if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
1220 return;
1221
1222 /* Pageblock boundaries */
1223 start_pfn = pageblock_start_pfn(pfn);
1224 end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
1225
1226 /* Scan before */
1227 if (start_pfn != pfn) {
1228 isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
1229 if (cc->nr_freepages >= cc->nr_migratepages)
1230 return;
1231 }
1232
1233 /* Scan after */
1234 start_pfn = pfn + nr_isolated;
1235 if (start_pfn != end_pfn)
1236 isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
1237
1238 /* Skip this pageblock in the future as it's full or nearly full */
1239 if (cc->nr_freepages < cc->nr_migratepages)
1240 set_pageblock_skip(page);
1241}
1242
1243/* Search orders in round-robin fashion */
1244static int next_search_order(struct compact_control *cc, int order)
1245{
1246 order--;
1247 if (order < 0)
1248 order = cc->order - 1;
1249
1250 /* Search wrapped around? */
1251 if (order == cc->search_order) {
1252 cc->search_order--;
1253 if (cc->search_order < 0)
1254 cc->search_order = cc->order - 1;
1255 return -1;
1256 }
1257
1258 return order;
1259}
1260
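next_search_order() above walks the free-list orders round-robin: it starts from cc->search_order, counts down, wraps to cc->order - 1, and stops once the walk comes back to where it started. The standalone model below reproduces just that ordering (field names mirror the diff; this is not kernel code):

#include <stdio.h>

struct cc_model {
	int order;
	int search_order;
};

static int next_search_order_model(struct cc_model *cc, int order)
{
	order--;
	if (order < 0)
		order = cc->order - 1;

	if (order == cc->search_order) {	/* wrapped all the way around */
		cc->search_order--;
		if (cc->search_order < 0)
			cc->search_order = cc->order - 1;
		return -1;
	}
	return order;
}

int main(void)
{
	struct cc_model cc = { .order = 9, .search_order = 4 };

	for (int o = cc.search_order; o >= 0; o = next_search_order_model(&cc, o))
		printf("%d ", o);		/* prints: 4 3 2 1 0 8 7 6 5 */
	printf("\n");
	return 0;
}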
1261static unsigned long
1262fast_isolate_freepages(struct compact_control *cc)
1263{
1264 unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
1265 unsigned int nr_scanned = 0;
1266 unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
1267 unsigned long nr_isolated = 0;
1268 unsigned long distance;
1269 struct page *page = NULL;
1270 bool scan_start = false;
1271 int order;
1272
1273 /* Full compaction passes in a negative order */
1274 if (cc->order <= 0)
1275 return cc->free_pfn;
1276
1277 /*
1278 * If starting the scan, use a deeper search and use the highest
1279 * PFN found if a suitable one is not found.
1280 */
1281 if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
1282 limit = pageblock_nr_pages >> 1;
1283 scan_start = true;
1284 }
1285
1286 /*
1287 * Preferred point is in the top quarter of the scan space but take
1288 * a pfn from the top half if the search is problematic.
1289 */
1290 distance = (cc->free_pfn - cc->migrate_pfn);
1291 low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
1292 min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
1293
1294 if (WARN_ON_ONCE(min_pfn > low_pfn))
1295 low_pfn = min_pfn;
1296
1297 /*
1298 * Search starts from the last successful isolation order or the next
1299 * order to search after a previous failure
1300 */
1301 cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
1302
1303 for (order = cc->search_order;
1304 !page && order >= 0;
1305 order = next_search_order(cc, order)) {
1306 struct free_area *area = &cc->zone->free_area[order];
1307 struct list_head *freelist;
1308 struct page *freepage;
1309 unsigned long flags;
1310 unsigned int order_scanned = 0;
1311
1312 if (!area->nr_free)
1313 continue;
1314
1315 spin_lock_irqsave(&cc->zone->lock, flags);
1316 freelist = &area->free_list[MIGRATE_MOVABLE];
1317 list_for_each_entry_reverse(freepage, freelist, lru) {
1318 unsigned long pfn;
1319
1320 order_scanned++;
1321 nr_scanned++;
1322 pfn = page_to_pfn(freepage);
1323
1324 if (pfn >= highest)
1325 highest = pageblock_start_pfn(pfn);
1326
1327 if (pfn >= low_pfn) {
1328 cc->fast_search_fail = 0;
1329 cc->search_order = order;
1330 page = freepage;
1331 break;
1332 }
1333
1334 if (pfn >= min_pfn && pfn > high_pfn) {
1335 high_pfn = pfn;
1336
1337 /* Shorten the scan if a candidate is found */
1338 limit >>= 1;
1339 }
1340
1341 if (order_scanned >= limit)
1342 break;
1343 }
1344
1345 /* Use a minimum pfn if a preferred one was not found */
1346 if (!page && high_pfn) {
1347 page = pfn_to_page(high_pfn);
1348
1349 /* Update freepage for the list reorder below */
1350 freepage = page;
1351 }
1352
1353		/* Reorder so a future search skips recent pages */
1354 move_freelist_head(freelist, freepage);
1355
1356 /* Isolate the page if available */
1357 if (page) {
1358 if (__isolate_free_page(page, order)) {
1359 set_page_private(page, order);
1360 nr_isolated = 1 << order;
1361 cc->nr_freepages += nr_isolated;
1362 list_add_tail(&page->lru, &cc->freepages);
1363 count_compact_events(COMPACTISOLATED, nr_isolated);
1364 } else {
1365 /* If isolation fails, abort the search */
1366 order = -1;
1367 page = NULL;
1368 }
1369 }
1370
1371 spin_unlock_irqrestore(&cc->zone->lock, flags);
1372
1373 /*
1374		 * Smaller scan on next order so the total scan is related
1375 * to freelist_scan_limit.
1376 */
1377 if (order_scanned >= limit)
1378 limit = min(1U, limit >> 1);
1379 }
1380
1381 if (!page) {
1382 cc->fast_search_fail++;
1383 if (scan_start) {
1384 /*
1385 * Use the highest PFN found above min. If one was
1386			 * not found, be pessimistic for direct compaction
1387 * and use the min mark.
1388 */
1389 if (highest) {
1390 page = pfn_to_page(highest);
1391 cc->free_pfn = highest;
1392 } else {
1393 if (cc->direct_compaction) {
1394 page = pfn_to_page(min_pfn);
1395 cc->free_pfn = min_pfn;
1396 }
1397 }
1398 }
1399 }
1400
1401 if (highest && highest >= cc->zone->compact_cached_free_pfn) {
1402 highest -= pageblock_nr_pages;
1403 cc->zone->compact_cached_free_pfn = highest;
1404 }
1405
1406 cc->total_free_scanned += nr_scanned;
1407 if (!page)
1408 return cc->free_pfn;
1409
1410 low_pfn = page_to_pfn(page);
1411 fast_isolate_around(cc, low_pfn, nr_isolated);
1412 return low_pfn;
1413}
1414
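As a quick worked example of the window fast_isolate_freepages() computes above: with the migrate scanner at pfn 0x10000 and the free scanner at 0x50000 (made-up values, ignoring the pageblock rounding the kernel applies), preferred candidates come from the top quarter of the remaining scan space and fallback candidates from the top half:

#include <stdio.h>

int main(void)
{
	unsigned long migrate_pfn = 0x10000, free_pfn = 0x50000;	/* made-up positions */
	unsigned long distance = free_pfn - migrate_pfn;		/* 0x40000 */
	unsigned long low_pfn = free_pfn - (distance >> 2);		/* top quarter: 0x40000 */
	unsigned long min_pfn = free_pfn - (distance >> 1);		/* top half:    0x30000 */

	printf("preferred candidates: pfn >= %#lx\n", low_pfn);
	printf("fallback candidates:  pfn >= %#lx\n", min_pfn);
	return 0;
}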
1415/*
1064 * Based on information in the current compact_control, find blocks 1416 * Based on information in the current compact_control, find blocks
1065 * suitable for isolating free pages from and then isolate them. 1417 * suitable for isolating free pages from and then isolate them.
1066 */ 1418 */
@@ -1073,6 +1425,12 @@ static void isolate_freepages(struct compact_control *cc)
1073 unsigned long block_end_pfn; /* end of current pageblock */ 1425 unsigned long block_end_pfn; /* end of current pageblock */
1074 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 1426 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
1075 struct list_head *freelist = &cc->freepages; 1427 struct list_head *freelist = &cc->freepages;
1428 unsigned int stride;
1429
1430 /* Try a small search of the free lists for a candidate */
1431 isolate_start_pfn = fast_isolate_freepages(cc);
1432 if (cc->nr_freepages)
1433 goto splitmap;
1076 1434
1077 /* 1435 /*
1078 * Initialise the free scanner. The starting point is where we last 1436 * Initialise the free scanner. The starting point is where we last
@@ -1086,10 +1444,11 @@ static void isolate_freepages(struct compact_control *cc)
1086 * is using. 1444 * is using.
1087 */ 1445 */
1088 isolate_start_pfn = cc->free_pfn; 1446 isolate_start_pfn = cc->free_pfn;
1089 block_start_pfn = pageblock_start_pfn(cc->free_pfn); 1447 block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
1090 block_end_pfn = min(block_start_pfn + pageblock_nr_pages, 1448 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
1091 zone_end_pfn(zone)); 1449 zone_end_pfn(zone));
1092 low_pfn = pageblock_end_pfn(cc->migrate_pfn); 1450 low_pfn = pageblock_end_pfn(cc->migrate_pfn);
1451 stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
1093 1452
1094 /* 1453 /*
1095 * Isolate free pages until enough are available to migrate the 1454 * Isolate free pages until enough are available to migrate the
@@ -1100,14 +1459,14 @@ static void isolate_freepages(struct compact_control *cc)
1100 block_end_pfn = block_start_pfn, 1459 block_end_pfn = block_start_pfn,
1101 block_start_pfn -= pageblock_nr_pages, 1460 block_start_pfn -= pageblock_nr_pages,
1102 isolate_start_pfn = block_start_pfn) { 1461 isolate_start_pfn = block_start_pfn) {
1462 unsigned long nr_isolated;
1463
1103 /* 1464 /*
1104 * This can iterate a massively long zone without finding any 1465 * This can iterate a massively long zone without finding any
1105 * suitable migration targets, so periodically check if we need 1466 * suitable migration targets, so periodically check resched.
1106 * to schedule, or even abort async compaction.
1107 */ 1467 */
1108 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) 1468 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
1109 && compact_should_abort(cc)) 1469 cond_resched();
1110 break;
1111 1470
1112 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, 1471 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1113 zone); 1472 zone);
@@ -1123,15 +1482,15 @@ static void isolate_freepages(struct compact_control *cc)
1123 continue; 1482 continue;
1124 1483
1125 /* Found a block suitable for isolating free pages from. */ 1484 /* Found a block suitable for isolating free pages from. */
1126 isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, 1485 nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
1127 freelist, false); 1486 block_end_pfn, freelist, stride, false);
1128 1487
1129 /* 1488 /* Update the skip hint if the full pageblock was scanned */
1130 * If we isolated enough freepages, or aborted due to lock 1489 if (isolate_start_pfn == block_end_pfn)
1131 * contention, terminate. 1490 update_pageblock_skip(cc, page, block_start_pfn);
1132 */ 1491
1133 if ((cc->nr_freepages >= cc->nr_migratepages) 1492 /* Are enough freepages isolated? */
1134 || cc->contended) { 1493 if (cc->nr_freepages >= cc->nr_migratepages) {
1135 if (isolate_start_pfn >= block_end_pfn) { 1494 if (isolate_start_pfn >= block_end_pfn) {
1136 /* 1495 /*
1137 * Restart at previous pageblock if more 1496 * Restart at previous pageblock if more
@@ -1148,10 +1507,14 @@ static void isolate_freepages(struct compact_control *cc)
1148 */ 1507 */
1149 break; 1508 break;
1150 } 1509 }
1151 }
1152 1510
1153 /* __isolate_free_page() does not map the pages */ 1511 /* Adjust stride depending on isolation */
1154 map_pages(freelist); 1512 if (nr_isolated) {
1513 stride = 1;
1514 continue;
1515 }
1516 stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
1517 }
1155 1518
1156 /* 1519 /*
1157 * Record where the free scanner will restart next time. Either we 1520 * Record where the free scanner will restart next time. Either we
@@ -1160,6 +1523,10 @@ static void isolate_freepages(struct compact_control *cc)
1160 * and the loop terminated due to isolate_start_pfn < low_pfn 1523 * and the loop terminated due to isolate_start_pfn < low_pfn
1161 */ 1524 */
1162 cc->free_pfn = isolate_start_pfn; 1525 cc->free_pfn = isolate_start_pfn;
1526
1527splitmap:
1528 /* __isolate_free_page() does not map the pages */
1529 split_map_pages(freelist);
1163} 1530}
1164 1531
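The stride handling added to isolate_freepages() above samples pageblocks sparsely while they keep turning up empty and drops back to page-by-page scanning once a block yields free pages. A small standalone model of that feedback loop, assuming COMPACT_CLUSTER_MAX is 32 and starting from the sync stride of 1 (async would start at the maximum):

#include <stdio.h>

#define COMPACT_CLUSTER_MAX 32U	/* assumed value, illustration only */

int main(void)
{
	unsigned int stride = 1;			/* sync starting stride */
	int block_yielded[] = { 0, 0, 0, 1, 1, 0 };	/* per-block isolation result */
	unsigned int nblocks = sizeof(block_yielded) / sizeof(block_yielded[0]);

	for (unsigned int i = 0; i < nblocks; i++) {
		printf("block %u scanned every %u pages\n", i, stride);
		if (block_yielded[i])
			stride = 1;			/* productive: scan densely */
		else if (stride < COMPACT_CLUSTER_MAX)
			stride <<= 1;			/* barren: widen the stride */
	}
	return 0;
}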
1165/* 1532/*
@@ -1172,13 +1539,8 @@ static struct page *compaction_alloc(struct page *migratepage,
1172 struct compact_control *cc = (struct compact_control *)data; 1539 struct compact_control *cc = (struct compact_control *)data;
1173 struct page *freepage; 1540 struct page *freepage;
1174 1541
1175 /*
1176 * Isolate free pages if necessary, and if we are not aborting due to
1177 * contention.
1178 */
1179 if (list_empty(&cc->freepages)) { 1542 if (list_empty(&cc->freepages)) {
1180 if (!cc->contended) 1543 isolate_freepages(cc);
1181 isolate_freepages(cc);
1182 1544
1183 if (list_empty(&cc->freepages)) 1545 if (list_empty(&cc->freepages))
1184 return NULL; 1546 return NULL;
@@ -1217,6 +1579,147 @@ typedef enum {
1217 */ 1579 */
1218int sysctl_compact_unevictable_allowed __read_mostly = 1; 1580int sysctl_compact_unevictable_allowed __read_mostly = 1;
1219 1581
1582static inline void
1583update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
1584{
1585 if (cc->fast_start_pfn == ULONG_MAX)
1586 return;
1587
1588 if (!cc->fast_start_pfn)
1589 cc->fast_start_pfn = pfn;
1590
1591 cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
1592}
1593
1594static inline unsigned long
1595reinit_migrate_pfn(struct compact_control *cc)
1596{
1597 if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
1598 return cc->migrate_pfn;
1599
1600 cc->migrate_pfn = cc->fast_start_pfn;
1601 cc->fast_start_pfn = ULONG_MAX;
1602
1603 return cc->migrate_pfn;
1604}
1605
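The two helpers above remember the lowest pfn touched by the fast migrate-block search so that a failed search can restart the linear scan nearby rather than from the cached position. The sketch below mirrors their behaviour outside the kernel (same field names, purely illustrative):

#include <limits.h>
#include <stdio.h>

struct cc_model {
	unsigned long fast_start_pfn;
	unsigned long migrate_pfn;
};

static void update_fast_start_pfn_model(struct cc_model *cc, unsigned long pfn)
{
	if (cc->fast_start_pfn == ULONG_MAX)
		return;				/* tracking disabled */
	if (!cc->fast_start_pfn || pfn < cc->fast_start_pfn)
		cc->fast_start_pfn = pfn;	/* keep the lowest pfn seen */
}

static unsigned long reinit_migrate_pfn_model(struct cc_model *cc)
{
	if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
		return cc->migrate_pfn;		/* nothing useful recorded */
	cc->migrate_pfn = cc->fast_start_pfn;
	cc->fast_start_pfn = ULONG_MAX;		/* the hint is consumed once */
	return cc->migrate_pfn;
}

int main(void)
{
	struct cc_model cc = { .fast_start_pfn = 0, .migrate_pfn = 4096 };

	update_fast_start_pfn_model(&cc, 20480);
	update_fast_start_pfn_model(&cc, 12288);
	printf("linear scan restarts at pfn %lu\n", reinit_migrate_pfn_model(&cc));
	return 0;
}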
1606/*
1607 * Briefly search the free lists for a migration source that already has
1608 * some free pages to reduce the number of pages that need migration
1609 * before a pageblock is free.
1610 */
1611static unsigned long fast_find_migrateblock(struct compact_control *cc)
1612{
1613 unsigned int limit = freelist_scan_limit(cc);
1614 unsigned int nr_scanned = 0;
1615 unsigned long distance;
1616 unsigned long pfn = cc->migrate_pfn;
1617 unsigned long high_pfn;
1618 int order;
1619
1620 /* Skip hints are relied on to avoid repeats on the fast search */
1621 if (cc->ignore_skip_hint)
1622 return pfn;
1623
1624 /*
1625 * If the migrate_pfn is not at the start of a zone or the start
1626 * of a pageblock then assume this is a continuation of a previous
1627 * scan restarted due to COMPACT_CLUSTER_MAX.
1628 */
1629 if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
1630 return pfn;
1631
1632 /*
1633 * For smaller orders, just linearly scan as the number of pages
1634 * to migrate should be relatively small and does not necessarily
1635 * justify freeing up a large block for a small allocation.
1636 */
1637 if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
1638 return pfn;
1639
1640 /*
1641 * Only allow kcompactd and direct requests for movable pages to
1642 * quickly clear out a MOVABLE pageblock for allocation. This
1643 * reduces the risk that a large movable pageblock is freed for
1644 * an unmovable/reclaimable small allocation.
1645 */
1646 if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
1647 return pfn;
1648
1649 /*
1650 * When starting the migration scanner, pick any pageblock within the
1651 * first half of the search space. Otherwise try and pick a pageblock
1652 * within the first eighth to reduce the chances that a migration
1653 * target later becomes a source.
1654 */
1655 distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
1656 if (cc->migrate_pfn != cc->zone->zone_start_pfn)
1657 distance >>= 2;
1658 high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
1659
1660 for (order = cc->order - 1;
1661 order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
1662 order--) {
1663 struct free_area *area = &cc->zone->free_area[order];
1664 struct list_head *freelist;
1665 unsigned long flags;
1666 struct page *freepage;
1667
1668 if (!area->nr_free)
1669 continue;
1670
1671 spin_lock_irqsave(&cc->zone->lock, flags);
1672 freelist = &area->free_list[MIGRATE_MOVABLE];
1673 list_for_each_entry(freepage, freelist, lru) {
1674 unsigned long free_pfn;
1675
1676 nr_scanned++;
1677 free_pfn = page_to_pfn(freepage);
1678 if (free_pfn < high_pfn) {
1679 /*
1680 * Avoid if skipped recently. Ideally it would
1681 * move to the tail but even safe iteration of
1682 * the list assumes an entry is deleted, not
1683 * reordered.
1684 */
1685 if (get_pageblock_skip(freepage)) {
1686 if (list_is_last(freelist, &freepage->lru))
1687 break;
1688
1689 continue;
1690 }
1691
1692				/* Reorder so a future search skips recent pages */
1693 move_freelist_tail(freelist, freepage);
1694
1695 update_fast_start_pfn(cc, free_pfn);
1696 pfn = pageblock_start_pfn(free_pfn);
1697 cc->fast_search_fail = 0;
1698 set_pageblock_skip(freepage);
1699 break;
1700 }
1701
1702 if (nr_scanned >= limit) {
1703 cc->fast_search_fail++;
1704 move_freelist_tail(freelist, freepage);
1705 break;
1706 }
1707 }
1708 spin_unlock_irqrestore(&cc->zone->lock, flags);
1709 }
1710
1711 cc->total_migrate_scanned += nr_scanned;
1712
1713 /*
1714 * If fast scanning failed then use a cached entry for a page block
1715 * that had free pages as the basis for starting a linear scan.
1716 */
1717 if (pfn == cc->migrate_pfn)
1718 pfn = reinit_migrate_pfn(cc);
1719
1720 return pfn;
1721}
1722
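For a feel of how far ahead fast_find_migrateblock() is willing to look: on a fresh scan from the zone start any MOVABLE block in the first half of the space between the scanners qualifies, while a restarted scan is confined to roughly the first eighth. A worked example with made-up pfn values (pageblock rounding omitted):

#include <stdio.h>

int main(void)
{
	unsigned long zone_start = 0, migrate_pfn, free_pfn = 0x80000;
	unsigned long distance, high_pfn;

	/* fresh scan from the zone start: any block in the first half qualifies */
	migrate_pfn = zone_start;
	distance = (free_pfn - migrate_pfn) >> 1;
	high_pfn = migrate_pfn + distance;
	printf("fresh scan: candidates below %#lx\n", high_pfn);	/* 0x40000 */

	/* restarted scan: stay within the first eighth so targets are not reused */
	migrate_pfn = 0x8000;
	distance = (free_pfn - migrate_pfn) >> 1;
	distance >>= 2;
	high_pfn = migrate_pfn + distance;
	printf("restarted scan: candidates below %#lx\n", high_pfn);	/* 0x17000 */
	return 0;
}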
1220/* 1723/*
1221 * Isolate all pages that can be migrated from the first suitable block, 1724 * Isolate all pages that can be migrated from the first suitable block,
1222 * starting at the block pointed to by the migrate scanner pfn within 1725 * starting at the block pointed to by the migrate scanner pfn within
@@ -1232,16 +1735,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1232 const isolate_mode_t isolate_mode = 1735 const isolate_mode_t isolate_mode =
1233 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | 1736 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1234 (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1737 (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1738 bool fast_find_block;
1235 1739
1236 /* 1740 /*
1237 * Start at where we last stopped, or beginning of the zone as 1741 * Start at where we last stopped, or beginning of the zone as
1238 * initialized by compact_zone() 1742 * initialized by compact_zone(). The first failure will use
1743 * the lowest PFN as the starting point for linear scanning.
1239 */ 1744 */
1240 low_pfn = cc->migrate_pfn; 1745 low_pfn = fast_find_migrateblock(cc);
1241 block_start_pfn = pageblock_start_pfn(low_pfn); 1746 block_start_pfn = pageblock_start_pfn(low_pfn);
1242 if (block_start_pfn < zone->zone_start_pfn) 1747 if (block_start_pfn < zone->zone_start_pfn)
1243 block_start_pfn = zone->zone_start_pfn; 1748 block_start_pfn = zone->zone_start_pfn;
1244 1749
1750 /*
1751	 * fast_find_migrateblock marks a pageblock skipped, so to avoid
1752 * the isolation_suitable check below, check whether the fast
1753 * search was successful.
1754 */
1755 fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
1756
1245 /* Only scan within a pageblock boundary */ 1757 /* Only scan within a pageblock boundary */
1246 block_end_pfn = pageblock_end_pfn(low_pfn); 1758 block_end_pfn = pageblock_end_pfn(low_pfn);
1247 1759
@@ -1250,6 +1762,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1250 * Do not cross the free scanner. 1762 * Do not cross the free scanner.
1251 */ 1763 */
1252 for (; block_end_pfn <= cc->free_pfn; 1764 for (; block_end_pfn <= cc->free_pfn;
1765 fast_find_block = false,
1253 low_pfn = block_end_pfn, 1766 low_pfn = block_end_pfn,
1254 block_start_pfn = block_end_pfn, 1767 block_start_pfn = block_end_pfn,
1255 block_end_pfn += pageblock_nr_pages) { 1768 block_end_pfn += pageblock_nr_pages) {
@@ -1257,34 +1770,45 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1257 /* 1770 /*
1258 * This can potentially iterate a massively long zone with 1771 * This can potentially iterate a massively long zone with
1259 * many pageblocks unsuitable, so periodically check if we 1772 * many pageblocks unsuitable, so periodically check if we
1260 * need to schedule, or even abort async compaction. 1773 * need to schedule.
1261 */ 1774 */
1262 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) 1775 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
1263 && compact_should_abort(cc)) 1776 cond_resched();
1264 break;
1265 1777
1266 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, 1778 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1267 zone); 1779 zone);
1268 if (!page) 1780 if (!page)
1269 continue; 1781 continue;
1270 1782
1271 /* If isolation recently failed, do not retry */ 1783 /*
1272 if (!isolation_suitable(cc, page)) 1784 * If isolation recently failed, do not retry. Only check the
1785 * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
1786 * to be visited multiple times. Assume skip was checked
1787 * before making it "skip" so other compaction instances do
1788 * not scan the same block.
1789 */
1790 if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
1791 !fast_find_block && !isolation_suitable(cc, page))
1273 continue; 1792 continue;
1274 1793
1275 /* 1794 /*
1276 * For async compaction, also only scan in MOVABLE blocks. 1795 * For async compaction, also only scan in MOVABLE blocks
1277 * Async compaction is optimistic to see if the minimum amount 1796 * without huge pages. Async compaction is optimistic to see
1278 * of work satisfies the allocation. 1797 * if the minimum amount of work satisfies the allocation.
1798 * The cached PFN is updated as it's possible that all
1799 * remaining blocks between source and target are unsuitable
1800 * and the compaction scanners fail to meet.
1279 */ 1801 */
1280 if (!suitable_migration_source(cc, page)) 1802 if (!suitable_migration_source(cc, page)) {
1803 update_cached_migrate(cc, block_end_pfn);
1281 continue; 1804 continue;
1805 }
1282 1806
1283 /* Perform the isolation */ 1807 /* Perform the isolation */
1284 low_pfn = isolate_migratepages_block(cc, low_pfn, 1808 low_pfn = isolate_migratepages_block(cc, low_pfn,
1285 block_end_pfn, isolate_mode); 1809 block_end_pfn, isolate_mode);
1286 1810
1287 if (!low_pfn || cc->contended) 1811 if (!low_pfn)
1288 return ISOLATE_ABORT; 1812 return ISOLATE_ABORT;
1289 1813
1290 /* 1814 /*
@@ -1310,19 +1834,16 @@ static inline bool is_via_compact_memory(int order)
1310 return order == -1; 1834 return order == -1;
1311} 1835}
1312 1836
1313static enum compact_result __compact_finished(struct zone *zone, 1837static enum compact_result __compact_finished(struct compact_control *cc)
1314 struct compact_control *cc)
1315{ 1838{
1316 unsigned int order; 1839 unsigned int order;
1317 const int migratetype = cc->migratetype; 1840 const int migratetype = cc->migratetype;
1318 1841 int ret;
1319 if (cc->contended || fatal_signal_pending(current))
1320 return COMPACT_CONTENDED;
1321 1842
1322 /* Compaction run completes if the migrate and free scanner meet */ 1843 /* Compaction run completes if the migrate and free scanner meet */
1323 if (compact_scanners_met(cc)) { 1844 if (compact_scanners_met(cc)) {
1324 /* Let the next compaction start anew. */ 1845 /* Let the next compaction start anew. */
1325 reset_cached_positions(zone); 1846 reset_cached_positions(cc->zone);
1326 1847
1327 /* 1848 /*
1328 * Mark that the PG_migrate_skip information should be cleared 1849 * Mark that the PG_migrate_skip information should be cleared
@@ -1331,7 +1852,7 @@ static enum compact_result __compact_finished(struct zone *zone,
1331 * based on an allocation request. 1852 * based on an allocation request.
1332 */ 1853 */
1333 if (cc->direct_compaction) 1854 if (cc->direct_compaction)
1334 zone->compact_blockskip_flush = true; 1855 cc->zone->compact_blockskip_flush = true;
1335 1856
1336 if (cc->whole_zone) 1857 if (cc->whole_zone)
1337 return COMPACT_COMPLETE; 1858 return COMPACT_COMPLETE;
@@ -1342,20 +1863,19 @@ static enum compact_result __compact_finished(struct zone *zone,
1342 if (is_via_compact_memory(cc->order)) 1863 if (is_via_compact_memory(cc->order))
1343 return COMPACT_CONTINUE; 1864 return COMPACT_CONTINUE;
1344 1865
1345 if (cc->finishing_block) { 1866 /*
1346 /* 1867 * Always finish scanning a pageblock to reduce the possibility of
1347 * We have finished the pageblock, but better check again that 1868 * fallbacks in the future. This is particularly important when
1348 * we really succeeded. 1869 * migration source is unmovable/reclaimable but it's not worth
1349 */ 1870 * special casing.
1350 if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) 1871 */
1351 cc->finishing_block = false; 1872 if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
1352 else 1873 return COMPACT_CONTINUE;
1353 return COMPACT_CONTINUE;
1354 }
1355 1874
1356 /* Direct compactor: Is a suitable page free? */ 1875 /* Direct compactor: Is a suitable page free? */
1876 ret = COMPACT_NO_SUITABLE_PAGE;
1357 for (order = cc->order; order < MAX_ORDER; order++) { 1877 for (order = cc->order; order < MAX_ORDER; order++) {
1358 struct free_area *area = &zone->free_area[order]; 1878 struct free_area *area = &cc->zone->free_area[order];
1359 bool can_steal; 1879 bool can_steal;
1360 1880
1361 /* Job done if page is free of the right migratetype */ 1881 /* Job done if page is free of the right migratetype */
@@ -1393,21 +1913,23 @@ static enum compact_result __compact_finished(struct zone *zone,
1393 return COMPACT_SUCCESS; 1913 return COMPACT_SUCCESS;
1394 } 1914 }
1395 1915
1396 cc->finishing_block = true; 1916 ret = COMPACT_CONTINUE;
1397 return COMPACT_CONTINUE; 1917 break;
1398 } 1918 }
1399 } 1919 }
1400 1920
1401 return COMPACT_NO_SUITABLE_PAGE; 1921 if (cc->contended || fatal_signal_pending(current))
1922 ret = COMPACT_CONTENDED;
1923
1924 return ret;
1402} 1925}
1403 1926
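The rework of __compact_finished() above only reports success once the migrate scanner has reached a pageblock boundary, so a pageblock is always finished before compaction stops. The alignment test it relies on reduces to a power-of-two mask check, illustrated here with an assumed 512-page (order-9) pageblock:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL			/* assumed order-9 pageblock */
#define ALIGNED_TO(x, a)   (((x) & ((a) - 1)) == 0)	/* (a) must be a power of two */

int main(void)
{
	unsigned long pfns[] = { 0x20000, 0x20020, 0x20200 };
	unsigned int i;

	for (i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
		printf("migrate_pfn %#lx: %s\n", pfns[i],
		       ALIGNED_TO(pfns[i], PAGEBLOCK_NR_PAGES) ?
		       "pageblock finished, success may be reported" :
		       "mid-block, keep scanning");
	return 0;
}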
1404static enum compact_result compact_finished(struct zone *zone, 1927static enum compact_result compact_finished(struct compact_control *cc)
1405 struct compact_control *cc)
1406{ 1928{
1407 int ret; 1929 int ret;
1408 1930
1409 ret = __compact_finished(zone, cc); 1931 ret = __compact_finished(cc);
1410 trace_mm_compaction_finished(zone, cc->order, ret); 1932 trace_mm_compaction_finished(cc->zone, cc->order, ret);
1411 if (ret == COMPACT_NO_SUITABLE_PAGE) 1933 if (ret == COMPACT_NO_SUITABLE_PAGE)
1412 ret = COMPACT_CONTINUE; 1934 ret = COMPACT_CONTINUE;
1413 1935
@@ -1534,15 +2056,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
1534 return false; 2056 return false;
1535} 2057}
1536 2058
1537static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) 2059static enum compact_result
2060compact_zone(struct compact_control *cc, struct capture_control *capc)
1538{ 2061{
1539 enum compact_result ret; 2062 enum compact_result ret;
1540 unsigned long start_pfn = zone->zone_start_pfn; 2063 unsigned long start_pfn = cc->zone->zone_start_pfn;
1541 unsigned long end_pfn = zone_end_pfn(zone); 2064 unsigned long end_pfn = zone_end_pfn(cc->zone);
2065 unsigned long last_migrated_pfn;
1542 const bool sync = cc->mode != MIGRATE_ASYNC; 2066 const bool sync = cc->mode != MIGRATE_ASYNC;
2067 bool update_cached;
1543 2068
1544 cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); 2069 cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1545 ret = compaction_suitable(zone, cc->order, cc->alloc_flags, 2070 ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
1546 cc->classzone_idx); 2071 cc->classzone_idx);
1547 /* Compaction is likely to fail */ 2072 /* Compaction is likely to fail */
1548 if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) 2073 if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
@@ -1555,8 +2080,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
1555 * Clear pageblock skip if there were failures recently and compaction 2080 * Clear pageblock skip if there were failures recently and compaction
1556 * is about to be retried after being deferred. 2081 * is about to be retried after being deferred.
1557 */ 2082 */
1558 if (compaction_restarting(zone, cc->order)) 2083 if (compaction_restarting(cc->zone, cc->order))
1559 __reset_isolation_suitable(zone); 2084 __reset_isolation_suitable(cc->zone);
1560 2085
1561 /* 2086 /*
1562 * Setup to move all movable pages to the end of the zone. Used cached 2087 * Setup to move all movable pages to the end of the zone. Used cached
@@ -1564,43 +2089,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
1564 * want to compact the whole zone), but check that it is initialised 2089 * want to compact the whole zone), but check that it is initialised
1565 * by ensuring the values are within zone boundaries. 2090 * by ensuring the values are within zone boundaries.
1566 */ 2091 */
2092 cc->fast_start_pfn = 0;
1567 if (cc->whole_zone) { 2093 if (cc->whole_zone) {
1568 cc->migrate_pfn = start_pfn; 2094 cc->migrate_pfn = start_pfn;
1569 cc->free_pfn = pageblock_start_pfn(end_pfn - 1); 2095 cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
1570 } else { 2096 } else {
1571 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; 2097 cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
1572 cc->free_pfn = zone->compact_cached_free_pfn; 2098 cc->free_pfn = cc->zone->compact_cached_free_pfn;
1573 if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { 2099 if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
1574 cc->free_pfn = pageblock_start_pfn(end_pfn - 1); 2100 cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
1575 zone->compact_cached_free_pfn = cc->free_pfn; 2101 cc->zone->compact_cached_free_pfn = cc->free_pfn;
1576 } 2102 }
1577 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { 2103 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
1578 cc->migrate_pfn = start_pfn; 2104 cc->migrate_pfn = start_pfn;
1579 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; 2105 cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1580 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 2106 cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1581 } 2107 }
1582 2108
1583 if (cc->migrate_pfn == start_pfn) 2109 if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
1584 cc->whole_zone = true; 2110 cc->whole_zone = true;
1585 } 2111 }
1586 2112
1587 cc->last_migrated_pfn = 0; 2113 last_migrated_pfn = 0;
2114
2115 /*
2116 * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
2117 * the basis that some migrations will fail in ASYNC mode. However,
2118 * if the cached PFNs match and pageblocks are skipped due to having
2119 * no isolation candidates, then the sync state does not matter.
2120 * Until a pageblock with isolation candidates is found, keep the
2121 * cached PFNs in sync to avoid revisiting the same blocks.
2122 */
2123 update_cached = !sync &&
2124 cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
1588 2125
1589 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, 2126 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1590 cc->free_pfn, end_pfn, sync); 2127 cc->free_pfn, end_pfn, sync);
1591 2128
1592 migrate_prep_local(); 2129 migrate_prep_local();
1593 2130
1594 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 2131 while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
1595 int err; 2132 int err;
2133 unsigned long start_pfn = cc->migrate_pfn;
2134
2135 /*
2136 * Avoid multiple rescans which can happen if a page cannot be
2137 * isolated (dirty/writeback in async mode) or if the migrated
2138 * pages are being allocated before the pageblock is cleared.
2139 * The first rescan will capture the entire pageblock for
2140 * migration. If it fails, it'll be marked skip and scanning
2141 * will proceed as normal.
2142 */
2143 cc->rescan = false;
2144 if (pageblock_start_pfn(last_migrated_pfn) ==
2145 pageblock_start_pfn(start_pfn)) {
2146 cc->rescan = true;
2147 }
1596 2148
1597 switch (isolate_migratepages(zone, cc)) { 2149 switch (isolate_migratepages(cc->zone, cc)) {
1598 case ISOLATE_ABORT: 2150 case ISOLATE_ABORT:
1599 ret = COMPACT_CONTENDED; 2151 ret = COMPACT_CONTENDED;
1600 putback_movable_pages(&cc->migratepages); 2152 putback_movable_pages(&cc->migratepages);
1601 cc->nr_migratepages = 0; 2153 cc->nr_migratepages = 0;
2154 last_migrated_pfn = 0;
1602 goto out; 2155 goto out;
1603 case ISOLATE_NONE: 2156 case ISOLATE_NONE:
2157 if (update_cached) {
2158 cc->zone->compact_cached_migrate_pfn[1] =
2159 cc->zone->compact_cached_migrate_pfn[0];
2160 }
2161
1604 /* 2162 /*
1605 * We haven't isolated and migrated anything, but 2163 * We haven't isolated and migrated anything, but
1606 * there might still be unflushed migrations from 2164 * there might still be unflushed migrations from
@@ -1608,6 +2166,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
1608 */ 2166 */
1609 goto check_drain; 2167 goto check_drain;
1610 case ISOLATE_SUCCESS: 2168 case ISOLATE_SUCCESS:
2169 update_cached = false;
2170 last_migrated_pfn = start_pfn;
1611 ; 2171 ;
1612 } 2172 }
1613 2173
@@ -1639,8 +2199,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
1639 cc->migrate_pfn = block_end_pfn( 2199 cc->migrate_pfn = block_end_pfn(
1640 cc->migrate_pfn - 1, cc->order); 2200 cc->migrate_pfn - 1, cc->order);
1641 /* Draining pcplists is useless in this case */ 2201 /* Draining pcplists is useless in this case */
1642 cc->last_migrated_pfn = 0; 2202 last_migrated_pfn = 0;
1643
1644 } 2203 }
1645 } 2204 }
1646 2205
@@ -1652,21 +2211,26 @@ check_drain:
1652 * compact_finished() can detect immediately if allocation 2211 * compact_finished() can detect immediately if allocation
1653 * would succeed. 2212 * would succeed.
1654 */ 2213 */
1655 if (cc->order > 0 && cc->last_migrated_pfn) { 2214 if (cc->order > 0 && last_migrated_pfn) {
1656 int cpu; 2215 int cpu;
1657 unsigned long current_block_start = 2216 unsigned long current_block_start =
1658 block_start_pfn(cc->migrate_pfn, cc->order); 2217 block_start_pfn(cc->migrate_pfn, cc->order);
1659 2218
1660 if (cc->last_migrated_pfn < current_block_start) { 2219 if (last_migrated_pfn < current_block_start) {
1661 cpu = get_cpu(); 2220 cpu = get_cpu();
1662 lru_add_drain_cpu(cpu); 2221 lru_add_drain_cpu(cpu);
1663 drain_local_pages(zone); 2222 drain_local_pages(cc->zone);
1664 put_cpu(); 2223 put_cpu();
1665 /* No more flushing until we migrate again */ 2224 /* No more flushing until we migrate again */
1666 cc->last_migrated_pfn = 0; 2225 last_migrated_pfn = 0;
1667 } 2226 }
1668 } 2227 }
1669 2228
2229 /* Stop if a page has been captured */
2230 if (capc && capc->page) {
2231 ret = COMPACT_SUCCESS;
2232 break;
2233 }
1670 } 2234 }
1671 2235
1672out: 2236out:
@@ -1685,8 +2249,8 @@ out:
1685 * Only go back, not forward. The cached pfn might have been 2249 * Only go back, not forward. The cached pfn might have been
1686 * already reset to zone end in compact_finished() 2250 * already reset to zone end in compact_finished()
1687 */ 2251 */
1688 if (free_pfn > zone->compact_cached_free_pfn) 2252 if (free_pfn > cc->zone->compact_cached_free_pfn)
1689 zone->compact_cached_free_pfn = free_pfn; 2253 cc->zone->compact_cached_free_pfn = free_pfn;
1690 } 2254 }
1691 2255
1692 count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); 2256 count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
@@ -1700,7 +2264,8 @@ out:
1700 2264
1701static enum compact_result compact_zone_order(struct zone *zone, int order, 2265static enum compact_result compact_zone_order(struct zone *zone, int order,
1702 gfp_t gfp_mask, enum compact_priority prio, 2266 gfp_t gfp_mask, enum compact_priority prio,
1703 unsigned int alloc_flags, int classzone_idx) 2267 unsigned int alloc_flags, int classzone_idx,
2268 struct page **capture)
1704{ 2269{
1705 enum compact_result ret; 2270 enum compact_result ret;
1706 struct compact_control cc = { 2271 struct compact_control cc = {
@@ -1709,6 +2274,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
1709 .total_migrate_scanned = 0, 2274 .total_migrate_scanned = 0,
1710 .total_free_scanned = 0, 2275 .total_free_scanned = 0,
1711 .order = order, 2276 .order = order,
2277 .search_order = order,
1712 .gfp_mask = gfp_mask, 2278 .gfp_mask = gfp_mask,
1713 .zone = zone, 2279 .zone = zone,
1714 .mode = (prio == COMPACT_PRIO_ASYNC) ? 2280 .mode = (prio == COMPACT_PRIO_ASYNC) ?
@@ -1720,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
1720 .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), 2286 .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
1721 .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) 2287 .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
1722 }; 2288 };
2289 struct capture_control capc = {
2290 .cc = &cc,
2291 .page = NULL,
2292 };
2293
2294 if (capture)
2295 current->capture_control = &capc;
1723 INIT_LIST_HEAD(&cc.freepages); 2296 INIT_LIST_HEAD(&cc.freepages);
1724 INIT_LIST_HEAD(&cc.migratepages); 2297 INIT_LIST_HEAD(&cc.migratepages);
1725 2298
1726 ret = compact_zone(zone, &cc); 2299 ret = compact_zone(&cc, &capc);
1727 2300
1728 VM_BUG_ON(!list_empty(&cc.freepages)); 2301 VM_BUG_ON(!list_empty(&cc.freepages));
1729 VM_BUG_ON(!list_empty(&cc.migratepages)); 2302 VM_BUG_ON(!list_empty(&cc.migratepages));
1730 2303
2304 *capture = capc.page;
2305 current->capture_control = NULL;
2306
1731 return ret; 2307 return ret;
1732} 2308}
1733 2309
@@ -1745,7 +2321,7 @@ int sysctl_extfrag_threshold = 500;
1745 */ 2321 */
1746enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, 2322enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1747 unsigned int alloc_flags, const struct alloc_context *ac, 2323 unsigned int alloc_flags, const struct alloc_context *ac,
1748 enum compact_priority prio) 2324 enum compact_priority prio, struct page **capture)
1749{ 2325{
1750 int may_perform_io = gfp_mask & __GFP_IO; 2326 int may_perform_io = gfp_mask & __GFP_IO;
1751 struct zoneref *z; 2327 struct zoneref *z;
@@ -1773,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1773 } 2349 }
1774 2350
1775 status = compact_zone_order(zone, order, gfp_mask, prio, 2351 status = compact_zone_order(zone, order, gfp_mask, prio,
1776 alloc_flags, ac_classzone_idx(ac)); 2352 alloc_flags, ac_classzone_idx(ac), capture);
1777 rc = max(status, rc); 2353 rc = max(status, rc);
1778 2354
1779 /* The allocation should succeed, stop compacting */ 2355 /* The allocation should succeed, stop compacting */
@@ -1841,7 +2417,7 @@ static void compact_node(int nid)
1841 INIT_LIST_HEAD(&cc.freepages); 2417 INIT_LIST_HEAD(&cc.freepages);
1842 INIT_LIST_HEAD(&cc.migratepages); 2418 INIT_LIST_HEAD(&cc.migratepages);
1843 2419
1844 compact_zone(zone, &cc); 2420 compact_zone(&cc, NULL);
1845 2421
1846 VM_BUG_ON(!list_empty(&cc.freepages)); 2422 VM_BUG_ON(!list_empty(&cc.freepages));
1847 VM_BUG_ON(!list_empty(&cc.migratepages)); 2423 VM_BUG_ON(!list_empty(&cc.migratepages));
@@ -1876,14 +2452,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
1876 return 0; 2452 return 0;
1877} 2453}
1878 2454
1879int sysctl_extfrag_handler(struct ctl_table *table, int write,
1880 void __user *buffer, size_t *length, loff_t *ppos)
1881{
1882 proc_dointvec_minmax(table, write, buffer, length, ppos);
1883
1884 return 0;
1885}
1886
1887#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 2455#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
1888static ssize_t sysfs_compact_node(struct device *dev, 2456static ssize_t sysfs_compact_node(struct device *dev,
1889 struct device_attribute *attr, 2457 struct device_attribute *attr,
@@ -1948,6 +2516,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
1948 struct zone *zone; 2516 struct zone *zone;
1949 struct compact_control cc = { 2517 struct compact_control cc = {
1950 .order = pgdat->kcompactd_max_order, 2518 .order = pgdat->kcompactd_max_order,
2519 .search_order = pgdat->kcompactd_max_order,
1951 .total_migrate_scanned = 0, 2520 .total_migrate_scanned = 0,
1952 .total_free_scanned = 0, 2521 .total_free_scanned = 0,
1953 .classzone_idx = pgdat->kcompactd_classzone_idx, 2522 .classzone_idx = pgdat->kcompactd_classzone_idx,
@@ -1983,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
1983 2552
1984 if (kthread_should_stop()) 2553 if (kthread_should_stop())
1985 return; 2554 return;
1986 status = compact_zone(zone, &cc); 2555 status = compact_zone(&cc, NULL);
1987 2556
1988 if (status == COMPACT_SUCCESS) { 2557 if (status == COMPACT_SUCCESS) {
1989 compaction_defer_reset(zone, cc.order, false); 2558 compaction_defer_reset(zone, cc.order, false);
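
Note on the mm/compaction.c hunks above: compact_zone() now takes the compact_control directly plus an optional capture_control, and compact_zone_order()/try_to_compact_pages() grow a "struct page **capture" out-parameter so a page freed during direct compaction can be handed straight back to the compacting task. Below is a minimal caller-side sketch, assuming kernel context; try_to_compact_pages() and its argument list come from the hunks above, while demo_compact_for_alloc() and its fallback comment are illustrative only (the real caller is __alloc_pages_direct_compact() in mm/page_alloc.c, not shown here).

/*
 * Sketch only: consuming the new "capture" out-parameter of
 * try_to_compact_pages().  Not code from this commit.
 */
static struct page *demo_compact_for_alloc(gfp_t gfp_mask, unsigned int order,
					   unsigned int alloc_flags,
					   const struct alloc_context *ac,
					   enum compact_priority prio)
{
	struct page *captured = NULL;
	enum compact_result result;

	result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
				      prio, &captured);

	/*
	 * compact_zone_order() copies capc.page into *capture, and
	 * compact_finished() returns COMPACT_SUCCESS as soon as the free
	 * path hands it a page, so a captured page can bypass the
	 * freelist allocation attempt entirely.
	 */
	if (result == COMPACT_SUCCESS && captured)
		return captured;

	/* No capture: retry get_page_from_freelist() as before. */
	return NULL;
}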
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 6d4b97e7e9e9..76a160083506 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -114,10 +114,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL);
114 * @size: size of the blocks in this pool. 114 * @size: size of the blocks in this pool.
115 * @align: alignment requirement for blocks; must be a power of two 115 * @align: alignment requirement for blocks; must be a power of two
116 * @boundary: returned blocks won't cross this power of two boundary 116 * @boundary: returned blocks won't cross this power of two boundary
117 * Context: !in_interrupt() 117 * Context: not in_interrupt()
118 * 118 *
119 * Returns a dma allocation pool with the requested characteristics, or 119 * Given one of these pools, dma_pool_alloc()
120 * null if one can't be created. Given one of these pools, dma_pool_alloc()
121 * may be used to allocate memory. Such memory will all have "consistent" 120 * may be used to allocate memory. Such memory will all have "consistent"
122 * DMA mappings, accessible by the device and its driver without using 121 * DMA mappings, accessible by the device and its driver without using
123 * cache flushing primitives. The actual size of blocks allocated may be 122 * cache flushing primitives. The actual size of blocks allocated may be
@@ -127,6 +126,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL);
127 * cross that size boundary. This is useful for devices which have 126 * cross that size boundary. This is useful for devices which have
128 * addressing restrictions on individual DMA transfers, such as not crossing 127 * addressing restrictions on individual DMA transfers, such as not crossing
129 * boundaries of 4KBytes. 128 * boundaries of 4KBytes.
129 *
130 * Return: a dma allocation pool with the requested characteristics, or
131 * %NULL if one can't be created.
130 */ 132 */
131struct dma_pool *dma_pool_create(const char *name, struct device *dev, 133struct dma_pool *dma_pool_create(const char *name, struct device *dev,
132 size_t size, size_t align, size_t boundary) 134 size_t size, size_t align, size_t boundary)
@@ -313,7 +315,7 @@ EXPORT_SYMBOL(dma_pool_destroy);
313 * @mem_flags: GFP_* bitmask 315 * @mem_flags: GFP_* bitmask
314 * @handle: pointer to dma address of block 316 * @handle: pointer to dma address of block
315 * 317 *
316 * This returns the kernel virtual address of a currently unused block, 318 * Return: the kernel virtual address of a currently unused block,
317 * and reports its dma address through the handle. 319 * and reports its dma address through the handle.
318 * If such a memory block can't be allocated, %NULL is returned. 320 * If such a memory block can't be allocated, %NULL is returned.
319 */ 321 */
@@ -498,6 +500,9 @@ static int dmam_pool_match(struct device *dev, void *res, void *match_data)
498 * 500 *
499 * Managed dma_pool_create(). DMA pool created with this function is 501 * Managed dma_pool_create(). DMA pool created with this function is
500 * automatically destroyed on driver detach. 502 * automatically destroyed on driver detach.
503 *
504 * Return: a managed dma allocation pool with the requested
505 * characteristics, or %NULL if one can't be created.
501 */ 506 */
502struct dma_pool *dmam_pool_create(const char *name, struct device *dev, 507struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
503 size_t size, size_t align, size_t allocation) 508 size_t size, size_t align, size_t allocation)
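
The mm/dmapool.c hunks only move the return-value descriptions into kernel-doc "Return:" sections; the contract (a pool pointer, or %NULL on failure) is unchanged. For reference, a minimal driver-side sketch consistent with that contract; "mydev", the pool name and the 64-byte block size are made-up values, not from this commit.

#include <linux/dmapool.h>
#include <linux/dma-mapping.h>
#include <linux/errno.h>

/* Sketch: create a pool of 64-byte, 64-byte-aligned coherent blocks. */
static int demo_use_dma_pool(struct device *mydev)
{
	struct dma_pool *pool;
	dma_addr_t handle;
	void *vaddr;

	pool = dma_pool_create("demo-pool", mydev, 64, 64, 0);
	if (!pool)		/* "Return: ... %NULL if one can't be created" */
		return -ENOMEM;

	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &handle);
	if (!vaddr) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... program the device with "handle", touch "vaddr" from the CPU ... */

	dma_pool_free(pool, vaddr, handle);
	dma_pool_destroy(pool);
	return 0;
}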
diff --git a/mm/failslab.c b/mm/failslab.c
index b135ebb88b6f..ec5aad211c5b 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -48,18 +48,12 @@ static int __init failslab_debugfs_init(void)
48 if (IS_ERR(dir)) 48 if (IS_ERR(dir))
49 return PTR_ERR(dir); 49 return PTR_ERR(dir);
50 50
51 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 51 debugfs_create_bool("ignore-gfp-wait", mode, dir,
52 &failslab.ignore_gfp_reclaim)) 52 &failslab.ignore_gfp_reclaim);
53 goto fail; 53 debugfs_create_bool("cache-filter", mode, dir,
54 if (!debugfs_create_bool("cache-filter", mode, dir, 54 &failslab.cache_filter);
55 &failslab.cache_filter))
56 goto fail;
57 55
58 return 0; 56 return 0;
59fail:
60 debugfs_remove_recursive(dir);
61
62 return -ENOMEM;
63} 57}
64 58
65late_initcall(failslab_debugfs_init); 59late_initcall(failslab_debugfs_init);
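
The failslab change drops the error handling around debugfs_create_bool(): debugfs file-creation failures are not fatal, so the return value is simply ignored and the "fail" unwind path goes away. A small sketch of the same idiom for a hypothetical knob; "demo" and demo_enabled are illustrative names.

#include <linux/debugfs.h>
#include <linux/init.h>

static bool demo_enabled;

static int __init demo_debugfs_init(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("demo", NULL);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	/* As in failslab above: no error check, debugfs failures are non-fatal. */
	debugfs_create_bool("enabled", 0600, dir, &demo_enabled);

	return 0;
}
late_initcall(demo_debugfs_init);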
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f5e323e883e..a3b4021c448f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -98,8 +98,8 @@
98 * ->swap_lock (try_to_unmap_one) 98 * ->swap_lock (try_to_unmap_one)
99 * ->private_lock (try_to_unmap_one) 99 * ->private_lock (try_to_unmap_one)
100 * ->i_pages lock (try_to_unmap_one) 100 * ->i_pages lock (try_to_unmap_one)
101 * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) 101 * ->pgdat->lru_lock (follow_page->mark_page_accessed)
102 * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) 102 * ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
103 * ->private_lock (page_remove_rmap->set_page_dirty) 103 * ->private_lock (page_remove_rmap->set_page_dirty)
104 * ->i_pages lock (page_remove_rmap->set_page_dirty) 104 * ->i_pages lock (page_remove_rmap->set_page_dirty)
105 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) 105 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
@@ -392,6 +392,8 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
392 * opposed to a regular memory cleansing writeback. The difference between 392 * opposed to a regular memory cleansing writeback. The difference between
393 * these two operations is that if a dirty page/buffer is encountered, it must 393 * these two operations is that if a dirty page/buffer is encountered, it must
394 * be waited upon, and not just skipped over. 394 * be waited upon, and not just skipped over.
395 *
396 * Return: %0 on success, negative error code otherwise.
395 */ 397 */
396int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 398int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
397 loff_t end, int sync_mode) 399 loff_t end, int sync_mode)
@@ -438,6 +440,8 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
438 * 440 *
439 * This is a mostly non-blocking flush. Not suitable for data-integrity 441 * This is a mostly non-blocking flush. Not suitable for data-integrity
440 * purposes - I/O may not be started against all dirty pages. 442 * purposes - I/O may not be started against all dirty pages.
443 *
444 * Return: %0 on success, negative error code otherwise.
441 */ 445 */
442int filemap_flush(struct address_space *mapping) 446int filemap_flush(struct address_space *mapping)
443{ 447{
@@ -453,6 +457,9 @@ EXPORT_SYMBOL(filemap_flush);
453 * 457 *
454 * Find at least one page in the range supplied, usually used to check if 458 * Find at least one page in the range supplied, usually used to check if
455 * direct writing in this range will trigger a writeback. 459 * direct writing in this range will trigger a writeback.
460 *
461 * Return: %true if at least one page exists in the specified range,
462 * %false otherwise.
456 */ 463 */
457bool filemap_range_has_page(struct address_space *mapping, 464bool filemap_range_has_page(struct address_space *mapping,
458 loff_t start_byte, loff_t end_byte) 465 loff_t start_byte, loff_t end_byte)
@@ -529,6 +536,8 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
529 * Since the error status of the address space is cleared by this function, 536 * Since the error status of the address space is cleared by this function,
530 * callers are responsible for checking the return value and handling and/or 537 * callers are responsible for checking the return value and handling and/or
531 * reporting the error. 538 * reporting the error.
539 *
540 * Return: error status of the address space.
532 */ 541 */
533int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 542int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
534 loff_t end_byte) 543 loff_t end_byte)
@@ -551,6 +560,8 @@ EXPORT_SYMBOL(filemap_fdatawait_range);
551 * Since the error status of the file is advanced by this function, 560 * Since the error status of the file is advanced by this function,
552 * callers are responsible for checking the return value and handling and/or 561 * callers are responsible for checking the return value and handling and/or
553 * reporting the error. 562 * reporting the error.
563 *
564 * Return: error status of the address space vs. the file->f_wb_err cursor.
554 */ 565 */
555int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) 566int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
556{ 567{
@@ -572,6 +583,8 @@ EXPORT_SYMBOL(file_fdatawait_range);
572 * Use this function if callers don't handle errors themselves. Expected 583 * Use this function if callers don't handle errors themselves. Expected
573 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), 584 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
574 * fsfreeze(8) 585 * fsfreeze(8)
586 *
587 * Return: error status of the address space.
575 */ 588 */
576int filemap_fdatawait_keep_errors(struct address_space *mapping) 589int filemap_fdatawait_keep_errors(struct address_space *mapping)
577{ 590{
@@ -623,6 +636,8 @@ EXPORT_SYMBOL(filemap_write_and_wait);
623 * 636 *
624 * Note that @lend is inclusive (describes the last byte to be written) so 637 * Note that @lend is inclusive (describes the last byte to be written) so
625 * that this function can be used to write to the very end-of-file (end = -1). 638 * that this function can be used to write to the very end-of-file (end = -1).
639 *
640 * Return: error status of the address space.
626 */ 641 */
627int filemap_write_and_wait_range(struct address_space *mapping, 642int filemap_write_and_wait_range(struct address_space *mapping,
628 loff_t lstart, loff_t lend) 643 loff_t lstart, loff_t lend)
@@ -678,6 +693,8 @@ EXPORT_SYMBOL(__filemap_set_wb_err);
678 * While we handle mapping->wb_err with atomic operations, the f_wb_err 693 * While we handle mapping->wb_err with atomic operations, the f_wb_err
679 * value is protected by the f_lock since we must ensure that it reflects 694 * value is protected by the f_lock since we must ensure that it reflects
680 * the latest value swapped in for this file descriptor. 695 * the latest value swapped in for this file descriptor.
696 *
697 * Return: %0 on success, negative error code otherwise.
681 */ 698 */
682int file_check_and_advance_wb_err(struct file *file) 699int file_check_and_advance_wb_err(struct file *file)
683{ 700{
@@ -720,6 +737,8 @@ EXPORT_SYMBOL(file_check_and_advance_wb_err);
720 * 737 *
721 * After writing out and waiting on the data, we check and advance the 738 * After writing out and waiting on the data, we check and advance the
722 * f_wb_err cursor to the latest value, and return any errors detected there. 739 * f_wb_err cursor to the latest value, and return any errors detected there.
740 *
741 * Return: %0 on success, negative error code otherwise.
723 */ 742 */
724int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) 743int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
725{ 744{
@@ -753,6 +772,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
753 * caller must do that. 772 * caller must do that.
754 * 773 *
755 * The remove + add is atomic. This function cannot fail. 774 * The remove + add is atomic. This function cannot fail.
775 *
776 * Return: %0
756 */ 777 */
757int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 778int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
758{ 779{
@@ -867,6 +888,8 @@ error:
867 * 888 *
868 * This function is used to add a page to the pagecache. It must be locked. 889 * This function is used to add a page to the pagecache. It must be locked.
869 * This function does not add the page to the LRU. The caller must do that. 890 * This function does not add the page to the LRU. The caller must do that.
891 *
892 * Return: %0 on success, negative error code otherwise.
870 */ 893 */
871int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 894int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
872 pgoff_t offset, gfp_t gfp_mask) 895 pgoff_t offset, gfp_t gfp_mask)
@@ -1463,7 +1486,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
1463 * If the slot holds a shadow entry of a previously evicted page, or a 1486 * If the slot holds a shadow entry of a previously evicted page, or a
1464 * swap entry from shmem/tmpfs, it is returned. 1487 * swap entry from shmem/tmpfs, it is returned.
1465 * 1488 *
1466 * Otherwise, %NULL is returned. 1489 * Return: the found page or shadow entry, %NULL if nothing is found.
1467 */ 1490 */
1468struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1491struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
1469{ 1492{
@@ -1521,9 +1544,9 @@ EXPORT_SYMBOL(find_get_entry);
1521 * If the slot holds a shadow entry of a previously evicted page, or a 1544 * If the slot holds a shadow entry of a previously evicted page, or a
1522 * swap entry from shmem/tmpfs, it is returned. 1545 * swap entry from shmem/tmpfs, it is returned.
1523 * 1546 *
1524 * Otherwise, %NULL is returned.
1525 *
1526 * find_lock_entry() may sleep. 1547 * find_lock_entry() may sleep.
1548 *
1549 * Return: the found page or shadow entry, %NULL if nothing is found.
1527 */ 1550 */
1528struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) 1551struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
1529{ 1552{
@@ -1563,12 +1586,14 @@ EXPORT_SYMBOL(find_lock_entry);
1563 * - FGP_CREAT: If page is not present then a new page is allocated using 1586 * - FGP_CREAT: If page is not present then a new page is allocated using
1564 * @gfp_mask and added to the page cache and the VM's LRU 1587 * @gfp_mask and added to the page cache and the VM's LRU
1565 * list. The page is returned locked and with an increased 1588 * list. The page is returned locked and with an increased
1566 * refcount. Otherwise, NULL is returned. 1589 * refcount.
1567 * 1590 *
1568 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even 1591 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
1569 * if the GFP flags specified for FGP_CREAT are atomic. 1592 * if the GFP flags specified for FGP_CREAT are atomic.
1570 * 1593 *
1571 * If there is a page cache page, it is returned with an increased refcount. 1594 * If there is a page cache page, it is returned with an increased refcount.
1595 *
1596 * Return: the found page or %NULL otherwise.
1572 */ 1597 */
1573struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, 1598struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
1574 int fgp_flags, gfp_t gfp_mask) 1599 int fgp_flags, gfp_t gfp_mask)
@@ -1656,8 +1681,7 @@ EXPORT_SYMBOL(pagecache_get_page);
1656 * Any shadow entries of evicted pages, or swap entries from 1681 * Any shadow entries of evicted pages, or swap entries from
1657 * shmem/tmpfs, are included in the returned array. 1682 * shmem/tmpfs, are included in the returned array.
1658 * 1683 *
1659 * find_get_entries() returns the number of pages and shadow entries 1684 * Return: the number of pages and shadow entries which were found.
1660 * which were found.
1661 */ 1685 */
1662unsigned find_get_entries(struct address_space *mapping, 1686unsigned find_get_entries(struct address_space *mapping,
1663 pgoff_t start, unsigned int nr_entries, 1687 pgoff_t start, unsigned int nr_entries,
@@ -1727,8 +1751,8 @@ retry:
1727 * indexes. There may be holes in the indices due to not-present pages. 1751 * indexes. There may be holes in the indices due to not-present pages.
1728 * We also update @start to index the next page for the traversal. 1752 * We also update @start to index the next page for the traversal.
1729 * 1753 *
1730 * find_get_pages_range() returns the number of pages which were found. If this 1754 * Return: the number of pages which were found. If this number is
1731 * number is smaller than @nr_pages, the end of specified range has been 1755 * smaller than @nr_pages, the end of specified range has been
1732 * reached. 1756 * reached.
1733 */ 1757 */
1734unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, 1758unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
@@ -1765,7 +1789,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
1765 1789
1766 pages[ret] = page; 1790 pages[ret] = page;
1767 if (++ret == nr_pages) { 1791 if (++ret == nr_pages) {
1768 *start = page->index + 1; 1792 *start = xas.xa_index + 1;
1769 goto out; 1793 goto out;
1770 } 1794 }
1771 continue; 1795 continue;
@@ -1801,7 +1825,7 @@ out:
1801 * find_get_pages_contig() works exactly like find_get_pages(), except 1825 * find_get_pages_contig() works exactly like find_get_pages(), except
1802 * that the returned number of pages are guaranteed to be contiguous. 1826 * that the returned number of pages are guaranteed to be contiguous.
1803 * 1827 *
1804 * find_get_pages_contig() returns the number of pages which were found. 1828 * Return: the number of pages which were found.
1805 */ 1829 */
1806unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 1830unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
1807 unsigned int nr_pages, struct page **pages) 1831 unsigned int nr_pages, struct page **pages)
@@ -1837,16 +1861,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
1837 if (unlikely(page != xas_reload(&xas))) 1861 if (unlikely(page != xas_reload(&xas)))
1838 goto put_page; 1862 goto put_page;
1839 1863
1840 /*
1841 * must check mapping and index after taking the ref.
1842 * otherwise we can get both false positives and false
1843 * negatives, which is just confusing to the caller.
1844 */
1845 if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
1846 put_page(page);
1847 break;
1848 }
1849
1850 pages[ret] = page; 1864 pages[ret] = page;
1851 if (++ret == nr_pages) 1865 if (++ret == nr_pages)
1852 break; 1866 break;
@@ -1872,6 +1886,8 @@ EXPORT_SYMBOL(find_get_pages_contig);
1872 * 1886 *
1873 * Like find_get_pages, except we only return pages which are tagged with 1887 * Like find_get_pages, except we only return pages which are tagged with
1874 * @tag. We update @index to index the next page for the traversal. 1888 * @tag. We update @index to index the next page for the traversal.
1889 *
1890 * Return: the number of pages which were found.
1875 */ 1891 */
1876unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, 1892unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
1877 pgoff_t end, xa_mark_t tag, unsigned int nr_pages, 1893 pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
@@ -1911,7 +1927,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
1911 1927
1912 pages[ret] = page; 1928 pages[ret] = page;
1913 if (++ret == nr_pages) { 1929 if (++ret == nr_pages) {
1914 *index = page->index + 1; 1930 *index = xas.xa_index + 1;
1915 goto out; 1931 goto out;
1916 } 1932 }
1917 continue; 1933 continue;
@@ -1949,6 +1965,8 @@ EXPORT_SYMBOL(find_get_pages_range_tag);
1949 * 1965 *
1950 * Like find_get_entries, except we only return entries which are tagged with 1966 * Like find_get_entries, except we only return entries which are tagged with
1951 * @tag. 1967 * @tag.
1968 *
1969 * Return: the number of entries which were found.
1952 */ 1970 */
1953unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1971unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
1954 xa_mark_t tag, unsigned int nr_entries, 1972 xa_mark_t tag, unsigned int nr_entries,
@@ -2034,6 +2052,10 @@ static void shrink_readahead_size_eio(struct file *filp,
2034 * 2052 *
2035 * This is really ugly. But the goto's actually try to clarify some 2053 * This is really ugly. But the goto's actually try to clarify some
2036 * of the logic when it comes to error handling etc. 2054 * of the logic when it comes to error handling etc.
2055 *
2056 * Return:
2057 * * total number of bytes copied, including those the were already @written
2058 * * negative error code if nothing was copied
2037 */ 2059 */
2038static ssize_t generic_file_buffered_read(struct kiocb *iocb, 2060static ssize_t generic_file_buffered_read(struct kiocb *iocb,
2039 struct iov_iter *iter, ssize_t written) 2061 struct iov_iter *iter, ssize_t written)
@@ -2295,6 +2317,9 @@ out:
2295 * 2317 *
2296 * This is the "read_iter()" routine for all filesystems 2318 * This is the "read_iter()" routine for all filesystems
2297 * that can use the page cache directly. 2319 * that can use the page cache directly.
2320 * Return:
2321 * * number of bytes copied, even for partial reads
2322 * * negative error code if nothing was read
2298 */ 2323 */
2299ssize_t 2324ssize_t
2300generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 2325generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -2362,6 +2387,8 @@ EXPORT_SYMBOL(generic_file_read_iter);
2362 * 2387 *
2363 * This adds the requested page to the page cache if it isn't already there, 2388 * This adds the requested page to the page cache if it isn't already there,
2364 * and schedules an I/O to read in its contents from disk. 2389 * and schedules an I/O to read in its contents from disk.
2390 *
2391 * Return: %0 on success, negative error code otherwise.
2365 */ 2392 */
2366static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) 2393static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
2367{ 2394{
@@ -2476,6 +2503,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
2476 * has not been released. 2503 * has not been released.
2477 * 2504 *
2478 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 2505 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
2506 *
2507 * Return: bitwise-OR of %VM_FAULT_ codes.
2479 */ 2508 */
2480vm_fault_t filemap_fault(struct vm_fault *vmf) 2509vm_fault_t filemap_fault(struct vm_fault *vmf)
2481{ 2510{
@@ -2861,6 +2890,8 @@ out:
2861 * not set, try to fill the page and wait for it to become unlocked. 2890 * not set, try to fill the page and wait for it to become unlocked.
2862 * 2891 *
2863 * If the page does not get brought uptodate, return -EIO. 2892 * If the page does not get brought uptodate, return -EIO.
2893 *
2894 * Return: up to date page on success, ERR_PTR() on failure.
2864 */ 2895 */
2865struct page *read_cache_page(struct address_space *mapping, 2896struct page *read_cache_page(struct address_space *mapping,
2866 pgoff_t index, 2897 pgoff_t index,
@@ -2881,6 +2912,8 @@ EXPORT_SYMBOL(read_cache_page);
2881 * any new page allocations done using the specified allocation flags. 2912 * any new page allocations done using the specified allocation flags.
2882 * 2913 *
2883 * If the page does not get brought uptodate, return -EIO. 2914 * If the page does not get brought uptodate, return -EIO.
2915 *
2916 * Return: up to date page on success, ERR_PTR() on failure.
2884 */ 2917 */
2885struct page *read_cache_page_gfp(struct address_space *mapping, 2918struct page *read_cache_page_gfp(struct address_space *mapping,
2886 pgoff_t index, 2919 pgoff_t index,
@@ -3081,7 +3114,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
3081 if (iocb->ki_flags & IOCB_NOWAIT) { 3114 if (iocb->ki_flags & IOCB_NOWAIT) {
3082 /* If there are pages to writeback, return */ 3115 /* If there are pages to writeback, return */
3083 if (filemap_range_has_page(inode->i_mapping, pos, 3116 if (filemap_range_has_page(inode->i_mapping, pos,
3084 pos + write_len)) 3117 pos + write_len - 1))
3085 return -EAGAIN; 3118 return -EAGAIN;
3086 } else { 3119 } else {
3087 written = filemap_write_and_wait_range(mapping, pos, 3120 written = filemap_write_and_wait_range(mapping, pos,
@@ -3264,6 +3297,10 @@ EXPORT_SYMBOL(generic_perform_write);
3264 * This function does *not* take care of syncing data in case of O_SYNC write. 3297 * This function does *not* take care of syncing data in case of O_SYNC write.
3265 * A caller has to handle it. This is mainly due to the fact that we want to 3298 * A caller has to handle it. This is mainly due to the fact that we want to
3266 * avoid syncing under i_mutex. 3299 * avoid syncing under i_mutex.
3300 *
3301 * Return:
3302 * * number of bytes written, even for truncated writes
3303 * * negative error code if no data has been written at all
3267 */ 3304 */
3268ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 3305ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3269{ 3306{
@@ -3348,6 +3385,10 @@ EXPORT_SYMBOL(__generic_file_write_iter);
3348 * This is a wrapper around __generic_file_write_iter() to be used by most 3385 * This is a wrapper around __generic_file_write_iter() to be used by most
3349 * filesystems. It takes care of syncing the file in case of O_SYNC file 3386 * filesystems. It takes care of syncing the file in case of O_SYNC file
3350 * and acquires i_mutex as needed. 3387 * and acquires i_mutex as needed.
3388 * Return:
3389 * * negative error code if no data has been written at all of
3390 * vfs_fsync_range() failed for a synchronous write
3391 * * number of bytes written, even for truncated writes
3351 */ 3392 */
3352ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 3393ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3353{ 3394{
@@ -3374,8 +3415,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
3374 * @gfp_mask: memory allocation flags (and I/O mode) 3415 * @gfp_mask: memory allocation flags (and I/O mode)
3375 * 3416 *
3376 * The address_space is to try to release any data against the page 3417 * The address_space is to try to release any data against the page
3377 * (presumably at page->private). If the release was successful, return '1'. 3418 * (presumably at page->private).
3378 * Otherwise return zero.
3379 * 3419 *
3380 * This may also be called if PG_fscache is set on a page, indicating that the 3420 * This may also be called if PG_fscache is set on a page, indicating that the
3381 * page is known to the local caching routines. 3421 * page is known to the local caching routines.
@@ -3383,6 +3423,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
3383 * The @gfp_mask argument specifies whether I/O may be performed to release 3423 * The @gfp_mask argument specifies whether I/O may be performed to release
3384 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 3424 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
3385 * 3425 *
3426 * Return: %1 if the release was successful, otherwise return zero.
3386 */ 3427 */
3387int try_to_release_page(struct page *page, gfp_t gfp_mask) 3428int try_to_release_page(struct page *page, gfp_t gfp_mask)
3388{ 3429{
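
Besides the kernel-doc "Return:" additions, the mm/filemap.c hunks carry two functional fixes: the *start/*index updates now use xas.xa_index rather than page->index (correct for multi-order entries, where page->index is the head page's index), and the IOCB_NOWAIT check passes filemap_range_has_page() an inclusive end byte (pos + write_len - 1). A tiny sketch of the inclusive-end convention; mapping, pos and len are placeholders and len is assumed non-zero.

#include <linux/fs.h>

/*
 * Sketch: filemap_range_has_page() takes an inclusive end byte, so a
 * write of "len" bytes at "pos" covers [pos, pos + len - 1].  Passing
 * "pos + len" (as the old IOCB_NOWAIT check did) would also test the
 * first byte of the *next* page when the write ends on a page boundary.
 */
static bool demo_range_is_cached(struct address_space *mapping,
				 loff_t pos, size_t len)
{
	return filemap_range_has_page(mapping, pos, pos + len - 1);
}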
diff --git a/mm/gup.c b/mm/gup.c
index 75029649baca..22291db50013 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -13,6 +13,9 @@
13#include <linux/sched/signal.h> 13#include <linux/sched/signal.h>
14#include <linux/rwsem.h> 14#include <linux/rwsem.h>
15#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/migrate.h>
17#include <linux/mm_inline.h>
18#include <linux/sched/mm.h>
16 19
17#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
18#include <asm/pgtable.h> 21#include <asm/pgtable.h>
@@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
1126} 1129}
1127EXPORT_SYMBOL(get_user_pages); 1130EXPORT_SYMBOL(get_user_pages);
1128 1131
1132#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
1133
1129#ifdef CONFIG_FS_DAX 1134#ifdef CONFIG_FS_DAX
1135static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
1136{
1137 long i;
1138 struct vm_area_struct *vma_prev = NULL;
1139
1140 for (i = 0; i < nr_pages; i++) {
1141 struct vm_area_struct *vma = vmas[i];
1142
1143 if (vma == vma_prev)
1144 continue;
1145
1146 vma_prev = vma;
1147
1148 if (vma_is_fsdax(vma))
1149 return true;
1150 }
1151 return false;
1152}
1153#else
1154static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
1155{
1156 return false;
1157}
1158#endif
1159
1160#ifdef CONFIG_CMA
1161static struct page *new_non_cma_page(struct page *page, unsigned long private)
1162{
1163 /*
1164 * We want to make sure we allocate the new page from the same node
1165 * as the source page.
1166 */
1167 int nid = page_to_nid(page);
1168 /*
1169 * Trying to allocate a page for migration. Ignore allocation
1170 * failure warnings. We don't force __GFP_THISNODE here because
1171 * this node here is the node where we have CMA reservation and
1172 * in some case these nodes will have really less non movable
1173 * allocation memory.
1174 */
1175 gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
1176
1177 if (PageHighMem(page))
1178 gfp_mask |= __GFP_HIGHMEM;
1179
1180#ifdef CONFIG_HUGETLB_PAGE
1181 if (PageHuge(page)) {
1182 struct hstate *h = page_hstate(page);
1183 /*
1184 * We don't want to dequeue from the pool because pool pages will
1185 * mostly be from the CMA region.
1186 */
1187 return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1188 }
1189#endif
1190 if (PageTransHuge(page)) {
1191 struct page *thp;
1192 /*
1193 * ignore allocation failure warnings
1194 */
1195 gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
1196
1197 /*
1198 * Remove the movable mask so that we don't allocate from
1199 * CMA area again.
1200 */
1201 thp_gfpmask &= ~__GFP_MOVABLE;
1202 thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
1203 if (!thp)
1204 return NULL;
1205 prep_transhuge_page(thp);
1206 return thp;
1207 }
1208
1209 return __alloc_pages_node(nid, gfp_mask, 0);
1210}
1211
1212static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
1213 unsigned int gup_flags,
1214 struct page **pages,
1215 struct vm_area_struct **vmas)
1216{
1217 long i;
1218 bool drain_allow = true;
1219 bool migrate_allow = true;
1220 LIST_HEAD(cma_page_list);
1221
1222check_again:
1223 for (i = 0; i < nr_pages; i++) {
1224 /*
1225 * If we get a page from the CMA zone, since we are going to
1226 * be pinning these entries, we might as well move them out
1227 * of the CMA zone if possible.
1228 */
1229 if (is_migrate_cma_page(pages[i])) {
1230
1231 struct page *head = compound_head(pages[i]);
1232
1233 if (PageHuge(head)) {
1234 isolate_huge_page(head, &cma_page_list);
1235 } else {
1236 if (!PageLRU(head) && drain_allow) {
1237 lru_add_drain_all();
1238 drain_allow = false;
1239 }
1240
1241 if (!isolate_lru_page(head)) {
1242 list_add_tail(&head->lru, &cma_page_list);
1243 mod_node_page_state(page_pgdat(head),
1244 NR_ISOLATED_ANON +
1245 page_is_file_cache(head),
1246 hpage_nr_pages(head));
1247 }
1248 }
1249 }
1250 }
1251
1252 if (!list_empty(&cma_page_list)) {
1253 /*
1254 * drop the above get_user_pages reference.
1255 */
1256 for (i = 0; i < nr_pages; i++)
1257 put_page(pages[i]);
1258
1259 if (migrate_pages(&cma_page_list, new_non_cma_page,
1260 NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
1261 /*
1262 * some of the pages failed migration. Do get_user_pages
1263 * without migration.
1264 */
1265 migrate_allow = false;
1266
1267 if (!list_empty(&cma_page_list))
1268 putback_movable_pages(&cma_page_list);
1269 }
1270 /*
1271 * We did migrate all the pages, Try to get the page references again
1272 * migrating any new CMA pages which we failed to isolate earlier.
1273 */
1274 nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1275 if ((nr_pages > 0) && migrate_allow) {
1276 drain_allow = true;
1277 goto check_again;
1278 }
1279 }
1280
1281 return nr_pages;
1282}
1283#else
1284static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
1285 unsigned int gup_flags,
1286 struct page **pages,
1287 struct vm_area_struct **vmas)
1288{
1289 return nr_pages;
1290}
1291#endif
1292
1130/* 1293/*
1131 * This is the same as get_user_pages() in that it assumes we are 1294 * This is the same as get_user_pages() in that it assumes we are
1132 * operating on the current task's mm, but it goes further to validate 1295 * operating on the current task's mm, but it goes further to validate
@@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages);
1140 * Contrast this to iov_iter_get_pages() usages which are transient. 1303 * Contrast this to iov_iter_get_pages() usages which are transient.
1141 */ 1304 */
1142long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, 1305long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1143 unsigned int gup_flags, struct page **pages, 1306 unsigned int gup_flags, struct page **pages,
1144 struct vm_area_struct **vmas_arg) 1307 struct vm_area_struct **vmas_arg)
1145{ 1308{
1146 struct vm_area_struct **vmas = vmas_arg; 1309 struct vm_area_struct **vmas = vmas_arg;
1147 struct vm_area_struct *vma_prev = NULL; 1310 unsigned long flags;
1148 long rc, i; 1311 long rc, i;
1149 1312
1150 if (!pages) 1313 if (!pages)
@@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1157 return -ENOMEM; 1320 return -ENOMEM;
1158 } 1321 }
1159 1322
1323 flags = memalloc_nocma_save();
1160 rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); 1324 rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1325 memalloc_nocma_restore(flags);
1326 if (rc < 0)
1327 goto out;
1161 1328
1162 for (i = 0; i < rc; i++) { 1329 if (check_dax_vmas(vmas, rc)) {
1163 struct vm_area_struct *vma = vmas[i]; 1330 for (i = 0; i < rc; i++)
1164 1331 put_page(pages[i]);
1165 if (vma == vma_prev) 1332 rc = -EOPNOTSUPP;
1166 continue;
1167
1168 vma_prev = vma;
1169
1170 if (vma_is_fsdax(vma))
1171 break;
1172 }
1173
1174 /*
1175 * Either get_user_pages() failed, or the vma validation
1176 * succeeded, in either case we don't need to put_page() before
1177 * returning.
1178 */
1179 if (i >= rc)
1180 goto out; 1333 goto out;
1334 }
1181 1335
1182 for (i = 0; i < rc; i++) 1336 rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
1183 put_page(pages[i]);
1184 rc = -EOPNOTSUPP;
1185out: 1337out:
1186 if (vmas != vmas_arg) 1338 if (vmas != vmas_arg)
1187 kfree(vmas); 1339 kfree(vmas);
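
The mm/gup.c rework makes get_user_pages_longterm() do two things after pinning: reject FS_DAX vmas as before (now factored into check_dax_vmas()) and, new in this series, migrate any pinned pages that sit in CMA pageblocks out of CMA via check_and_migrate_cma_pages(), with the pin itself taken under memalloc_nocma_save()/restore() so fresh allocations avoid CMA in the first place. For context, a caller-side sketch of long-term pinning as it looked at the time of this commit (mmap_sem era); the buffer address, page count and FOLL_WRITE choice are illustrative.

#include <linux/mm.h>
#include <linux/sched.h>

/* Sketch: pin a user buffer for long-lived DMA, then release it. */
static long demo_pin_user_buffer(unsigned long uaddr, unsigned long nr_pages,
				 struct page **pages)
{
	long pinned, i;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages_longterm(uaddr, nr_pages, FOLL_WRITE,
					 pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... hand the pages to the device for as long as needed ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned;
}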
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 5b42d3d4b60a..6c0279e70cc4 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -122,12 +122,8 @@ static const struct file_operations gup_benchmark_fops = {
122 122
123static int gup_benchmark_init(void) 123static int gup_benchmark_init(void)
124{ 124{
125 void *ret; 125 debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
126 126 &gup_benchmark_fops);
127 ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
128 &gup_benchmark_fops);
129 if (!ret)
130 pr_warn("Failed to create gup_benchmark in debugfs");
131 127
132 return 0; 128 return 0;
133} 129}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index faf357eaf0ce..404acdcd0455 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,7 @@
33#include <linux/page_idle.h> 33#include <linux/page_idle.h>
34#include <linux/shmem_fs.h> 34#include <linux/shmem_fs.h>
35#include <linux/oom.h> 35#include <linux/oom.h>
36#include <linux/numa.h>
36 37
37#include <asm/tlb.h> 38#include <asm/tlb.h>
38#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
@@ -616,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
616 mm_inc_nr_ptes(vma->vm_mm); 617 mm_inc_nr_ptes(vma->vm_mm);
617 spin_unlock(vmf->ptl); 618 spin_unlock(vmf->ptl);
618 count_vm_event(THP_FAULT_ALLOC); 619 count_vm_event(THP_FAULT_ALLOC);
620 count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
619 } 621 }
620 622
621 return 0; 623 return 0;
@@ -1337,6 +1339,7 @@ alloc:
1337 } 1339 }
1338 1340
1339 count_vm_event(THP_FAULT_ALLOC); 1341 count_vm_event(THP_FAULT_ALLOC);
1342 count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
1340 1343
1341 if (!page) 1344 if (!page)
1342 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); 1345 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
@@ -1475,7 +1478,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1475 struct anon_vma *anon_vma = NULL; 1478 struct anon_vma *anon_vma = NULL;
1476 struct page *page; 1479 struct page *page;
1477 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1480 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1478 int page_nid = -1, this_nid = numa_node_id(); 1481 int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
1479 int target_nid, last_cpupid = -1; 1482 int target_nid, last_cpupid = -1;
1480 bool page_locked; 1483 bool page_locked;
1481 bool migrated = false; 1484 bool migrated = false;
@@ -1520,7 +1523,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1520 */ 1523 */
1521 page_locked = trylock_page(page); 1524 page_locked = trylock_page(page);
1522 target_nid = mpol_misplaced(page, vma, haddr); 1525 target_nid = mpol_misplaced(page, vma, haddr);
1523 if (target_nid == -1) { 1526 if (target_nid == NUMA_NO_NODE) {
1524 /* If the page was locked, there are no parallel migrations */ 1527 /* If the page was locked, there are no parallel migrations */
1525 if (page_locked) 1528 if (page_locked)
1526 goto clear_pmdnuma; 1529 goto clear_pmdnuma;
@@ -1528,7 +1531,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1528 1531
1529 /* Migration could have started since the pmd_trans_migrating check */ 1532 /* Migration could have started since the pmd_trans_migrating check */
1530 if (!page_locked) { 1533 if (!page_locked) {
1531 page_nid = -1; 1534 page_nid = NUMA_NO_NODE;
1532 if (!get_page_unless_zero(page)) 1535 if (!get_page_unless_zero(page))
1533 goto out_unlock; 1536 goto out_unlock;
1534 spin_unlock(vmf->ptl); 1537 spin_unlock(vmf->ptl);
@@ -1549,14 +1552,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1549 if (unlikely(!pmd_same(pmd, *vmf->pmd))) { 1552 if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
1550 unlock_page(page); 1553 unlock_page(page);
1551 put_page(page); 1554 put_page(page);
1552 page_nid = -1; 1555 page_nid = NUMA_NO_NODE;
1553 goto out_unlock; 1556 goto out_unlock;
1554 } 1557 }
1555 1558
1556 /* Bail if we fail to protect against THP splits for any reason */ 1559 /* Bail if we fail to protect against THP splits for any reason */
1557 if (unlikely(!anon_vma)) { 1560 if (unlikely(!anon_vma)) {
1558 put_page(page); 1561 put_page(page);
1559 page_nid = -1; 1562 page_nid = NUMA_NO_NODE;
1560 goto clear_pmdnuma; 1563 goto clear_pmdnuma;
1561 } 1564 }
1562 1565
@@ -1618,7 +1621,7 @@ out:
1618 if (anon_vma) 1621 if (anon_vma)
1619 page_unlock_anon_vma_read(anon_vma); 1622 page_unlock_anon_vma_read(anon_vma);
1620 1623
1621 if (page_nid != -1) 1624 if (page_nid != NUMA_NO_NODE)
1622 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, 1625 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
1623 flags); 1626 flags);
1624 1627
@@ -1979,7 +1982,6 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
1979int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 1982int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
1980 pud_t *pud, unsigned long addr) 1983 pud_t *pud, unsigned long addr)
1981{ 1984{
1982 pud_t orig_pud;
1983 spinlock_t *ptl; 1985 spinlock_t *ptl;
1984 1986
1985 ptl = __pud_trans_huge_lock(pud, vma); 1987 ptl = __pud_trans_huge_lock(pud, vma);
@@ -1991,8 +1993,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
1991 * pgtable_trans_huge_withdraw after finishing pudp related 1993 * pgtable_trans_huge_withdraw after finishing pudp related
1992 * operations. 1994 * operations.
1993 */ 1995 */
1994 orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, 1996 pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
1995 tlb->fullmm);
1996 tlb_remove_pud_tlb_entry(tlb, pud, addr); 1997 tlb_remove_pud_tlb_entry(tlb, pud, addr);
1997 if (vma_is_dax(vma)) { 1998 if (vma_is_dax(vma)) {
1998 spin_unlock(ptl); 1999 spin_unlock(ptl);
@@ -2437,11 +2438,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
2437 pgoff_t end, unsigned long flags) 2438 pgoff_t end, unsigned long flags)
2438{ 2439{
2439 struct page *head = compound_head(page); 2440 struct page *head = compound_head(page);
2440 struct zone *zone = page_zone(head); 2441 pg_data_t *pgdat = page_pgdat(head);
2441 struct lruvec *lruvec; 2442 struct lruvec *lruvec;
2442 int i; 2443 int i;
2443 2444
2444 lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); 2445 lruvec = mem_cgroup_page_lruvec(head, pgdat);
2445 2446
2446 /* complete memcg works before add pages to LRU */ 2447 /* complete memcg works before add pages to LRU */
2447 mem_cgroup_split_huge_fixup(head); 2448 mem_cgroup_split_huge_fixup(head);
@@ -2472,7 +2473,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
2472 xa_unlock(&head->mapping->i_pages); 2473 xa_unlock(&head->mapping->i_pages);
2473 } 2474 }
2474 2475
2475 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); 2476 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
2476 2477
2477 remap_page(head); 2478 remap_page(head);
2478 2479
@@ -2683,7 +2684,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2683 lru_add_drain(); 2684 lru_add_drain();
2684 2685
2685 /* prevent PageLRU to go away from under us, and freeze lru stats */ 2686 /* prevent PageLRU to go away from under us, and freeze lru stats */
2686 spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); 2687 spin_lock_irqsave(&pgdata->lru_lock, flags);
2687 2688
2688 if (mapping) { 2689 if (mapping) {
2689 XA_STATE(xas, &mapping->i_pages, page_index(head)); 2690 XA_STATE(xas, &mapping->i_pages, page_index(head));
@@ -2728,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2728 spin_unlock(&pgdata->split_queue_lock); 2729 spin_unlock(&pgdata->split_queue_lock);
2729fail: if (mapping) 2730fail: if (mapping)
2730 xa_unlock(&mapping->i_pages); 2731 xa_unlock(&mapping->i_pages);
2731 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); 2732 spin_unlock_irqrestore(&pgdata->lru_lock, flags);
2732 remap_page(head); 2733 remap_page(head);
2733 ret = -EBUSY; 2734 ret = -EBUSY;
2734 } 2735 }
@@ -2886,12 +2887,8 @@ DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
2886 2887
2887static int __init split_huge_pages_debugfs(void) 2888static int __init split_huge_pages_debugfs(void)
2888{ 2889{
2889 void *ret; 2890 debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
2890 2891 &split_huge_pages_fops);
2891 ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
2892 &split_huge_pages_fops);
2893 if (!ret)
2894 pr_warn("Failed to create split_huge_pages in debugfs");
2895 return 0; 2892 return 0;
2896} 2893}
2897late_initcall(split_huge_pages_debugfs); 2894late_initcall(split_huge_pages_debugfs);
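
The mm/huge_memory.c hunks are largely mechanical: -1 node ids become NUMA_NO_NODE, THP_FAULT_ALLOC is additionally accounted per-memcg via count_memcg_events(), and the LRU lock is taken as pgdat->lru_lock now that zone_lru_lock() is gone. A small sketch of the lock-conversion pattern applied throughout this series; demo_lru_lock_page() and its body are illustrative only.

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>

/* Sketch of the zone_lru_lock() -> pgdat->lru_lock conversion. */
static void demo_lru_lock_page(struct page *page)
{
	pg_data_t *pgdat = page_pgdat(page);
	unsigned long flags;

	/* Old style (removed): spin_lock_irqsave(zone_lru_lock(page_zone(page)), flags); */
	spin_lock_irqsave(&pgdat->lru_lock, flags);

	/* ... manipulate the page on its per-node LRU list ... */

	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
}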
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8dfdffc34a99..97b1e0290c66 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -25,6 +25,7 @@
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/swapops.h> 26#include <linux/swapops.h>
27#include <linux/jhash.h> 27#include <linux/jhash.h>
28#include <linux/numa.h>
28 29
29#include <asm/page.h> 30#include <asm/page.h>
30#include <asm/pgtable.h> 31#include <asm/pgtable.h>
@@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
887 struct zonelist *zonelist; 888 struct zonelist *zonelist;
888 struct zone *zone; 889 struct zone *zone;
889 struct zoneref *z; 890 struct zoneref *z;
890 int node = -1; 891 int node = NUMA_NO_NODE;
891 892
892 zonelist = node_zonelist(nid, gfp_mask); 893 zonelist = node_zonelist(nid, gfp_mask);
893 894
@@ -919,7 +920,7 @@ retry_cpuset:
919/* Movability of hugepages depends on migration support. */ 920/* Movability of hugepages depends on migration support. */
920static inline gfp_t htlb_alloc_mask(struct hstate *h) 921static inline gfp_t htlb_alloc_mask(struct hstate *h)
921{ 922{
922 if (hugepage_migration_supported(h)) 923 if (hugepage_movable_supported(h))
923 return GFP_HIGHUSER_MOVABLE; 924 return GFP_HIGHUSER_MOVABLE;
924 else 925 else
925 return GFP_HIGHUSER; 926 return GFP_HIGHUSER;
@@ -1586,8 +1587,8 @@ out_unlock:
1586 return page; 1587 return page;
1587} 1588}
1588 1589
1589static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 1590struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
1590 int nid, nodemask_t *nmask) 1591 int nid, nodemask_t *nmask)
1591{ 1592{
1592 struct page *page; 1593 struct page *page;
1593 1594
@@ -4398,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4398 continue; 4399 continue;
4399 } 4400 }
4400 if (!huge_pte_none(pte)) { 4401 if (!huge_pte_none(pte)) {
4401 pte = huge_ptep_get_and_clear(mm, address, ptep); 4402 pte_t old_pte;
4402 pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 4403
4404 old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
4405 pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
4403 pte = arch_make_huge_pte(pte, vma, NULL, 0); 4406 pte = arch_make_huge_pte(pte, vma, NULL, 0);
4404 set_huge_pte_at(mm, address, ptep, pte); 4407 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
4405 pages++; 4408 pages++;
4406 } 4409 }
4407 spin_unlock(ptl); 4410 spin_unlock(ptl);
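
The hugetlb_change_protection() hunk replaces the bare huge_ptep_get_and_clear()/set_huge_pte_at() pair with a huge_ptep_modify_prot_start()/huge_ptep_modify_prot_commit() transaction, giving architectures a hook to optimise the intermediate state (the generic fallback still just clears and re-sets the PTE). The shape of the transaction, distilled from the hunk above into a standalone sketch; the page-table lock is assumed held and "newprot" comes from the caller.

/* Sketch of the start/commit protection-change sequence introduced above. */
static void demo_huge_change_prot(struct vm_area_struct *vma,
				  unsigned long address, pte_t *ptep,
				  pgprot_t newprot)
{
	pte_t old_pte, pte;

	old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
	pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
	pte = arch_make_huge_pte(pte, vma, NULL, 0);
	huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
}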
diff --git a/mm/internal.h b/mm/internal.h
index f4a7bb02decf..9eeaf2b95166 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
163extern int __isolate_free_page(struct page *page, unsigned int order); 163extern int __isolate_free_page(struct page *page, unsigned int order);
164extern void memblock_free_pages(struct page *page, unsigned long pfn, 164extern void memblock_free_pages(struct page *page, unsigned long pfn,
165 unsigned int order); 165 unsigned int order);
166extern void __free_pages_core(struct page *page, unsigned int order);
166extern void prep_compound_page(struct page *page, unsigned int order); 167extern void prep_compound_page(struct page *page, unsigned int order);
167extern void post_alloc_hook(struct page *page, unsigned int order, 168extern void post_alloc_hook(struct page *page, unsigned int order,
168 gfp_t gfp_flags); 169 gfp_t gfp_flags);
@@ -183,14 +184,16 @@ extern int user_min_free_kbytes;
183struct compact_control { 184struct compact_control {
184 struct list_head freepages; /* List of free pages to migrate to */ 185 struct list_head freepages; /* List of free pages to migrate to */
185 struct list_head migratepages; /* List of pages being migrated */ 186 struct list_head migratepages; /* List of pages being migrated */
187 unsigned int nr_freepages; /* Number of isolated free pages */
188 unsigned int nr_migratepages; /* Number of pages to migrate */
189 unsigned long free_pfn; /* isolate_freepages search base */
190 unsigned long migrate_pfn; /* isolate_migratepages search base */
191 unsigned long fast_start_pfn; /* a pfn to start linear scan from */
186 struct zone *zone; 192 struct zone *zone;
187 unsigned long nr_freepages; /* Number of isolated free pages */
188 unsigned long nr_migratepages; /* Number of pages to migrate */
189 unsigned long total_migrate_scanned; 193 unsigned long total_migrate_scanned;
190 unsigned long total_free_scanned; 194 unsigned long total_free_scanned;
191 unsigned long free_pfn; /* isolate_freepages search base */ 195 unsigned short fast_search_fail;/* failures to use free list searches */
192 unsigned long migrate_pfn; /* isolate_migratepages search base */ 196 short search_order; /* order to start a fast search at */
193 unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
194 const gfp_t gfp_mask; /* gfp mask of a direct compactor */ 197 const gfp_t gfp_mask; /* gfp mask of a direct compactor */
195 int order; /* order a direct compactor needs */ 198 int order; /* order a direct compactor needs */
196 int migratetype; /* migratetype of direct compactor */ 199 int migratetype; /* migratetype of direct compactor */
@@ -203,7 +206,16 @@ struct compact_control {
203 bool direct_compaction; /* False from kcompactd or /proc/... */ 206 bool direct_compaction; /* False from kcompactd or /proc/... */
204 bool whole_zone; /* Whole zone should/has been scanned */ 207 bool whole_zone; /* Whole zone should/has been scanned */
205 bool contended; /* Signal lock or sched contention */ 208 bool contended; /* Signal lock or sched contention */
206 bool finishing_block; /* Finishing current pageblock */ 209 bool rescan; /* Rescanning the same pageblock */
210};
211
212/*
213 * Used in direct compaction when a page should be taken from the freelists
214 * immediately when one is created during the free path.
215 */
216struct capture_control {
217 struct compact_control *cc;
218 struct page *page;
207}; 219};
208 220
209unsigned long 221unsigned long
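
The new struct capture_control in mm/internal.h ties the compaction changes together: compact_zone_order() publishes it through current->capture_control, and the page free path can then stash a freshly freed page of a suitable order in capc->page instead of merging it back into the buddy freelists. A heavily simplified sketch of the free-path check follows; demo_try_capture() is a made-up name and the order test is simplified (the real check lives in mm/page_alloc.c and also considers migratetype and zone).

/* Sketch only: roughly what the free path does with current->capture_control. */
static inline bool demo_try_capture(struct page *page, unsigned int order)
{
	struct capture_control *capc = current->capture_control;

	if (!capc || capc->page || order < capc->cc->order)
		return false;

	capc->page = page;	/* compact_finished() sees this and stops */
	return true;
}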
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 09b534fbba17..80bbe62b16cd 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -14,6 +14,8 @@
14 * 14 *
15 */ 15 */
16 16
17#define __KASAN_INTERNAL
18
17#include <linux/export.h> 19#include <linux/export.h>
18#include <linux/interrupt.h> 20#include <linux/interrupt.h>
19#include <linux/init.h> 21#include <linux/init.h>
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index ccb6207276e3..504c79363a34 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -275,25 +275,6 @@ EXPORT_SYMBOL(__asan_storeN_noabort);
275void __asan_handle_no_return(void) {} 275void __asan_handle_no_return(void) {}
276EXPORT_SYMBOL(__asan_handle_no_return); 276EXPORT_SYMBOL(__asan_handle_no_return);
277 277
278/* Emitted by compiler to poison large objects when they go out of scope. */
279void __asan_poison_stack_memory(const void *addr, size_t size)
280{
281 /*
282 * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
283 * by redzones, so we simply round up size to simplify logic.
284 */
285 kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
286 KASAN_USE_AFTER_SCOPE);
287}
288EXPORT_SYMBOL(__asan_poison_stack_memory);
289
290/* Emitted by compiler to unpoison large objects when they go into scope. */
291void __asan_unpoison_stack_memory(const void *addr, size_t size)
292{
293 kasan_unpoison_shadow(addr, size);
294}
295EXPORT_SYMBOL(__asan_unpoison_stack_memory);
296
297/* Emitted by compiler to poison alloca()ed objects. */ 278/* Emitted by compiler to poison alloca()ed objects. */
298void __asan_alloca_poison(unsigned long addr, size_t size) 279void __asan_alloca_poison(unsigned long addr, size_t size)
299{ 280{
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index 5e12035888f2..36c645939bc9 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -82,9 +82,6 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
82 case KASAN_KMALLOC_FREE: 82 case KASAN_KMALLOC_FREE:
83 bug_type = "use-after-free"; 83 bug_type = "use-after-free";
84 break; 84 break;
85 case KASAN_USE_AFTER_SCOPE:
86 bug_type = "use-after-scope";
87 break;
88 case KASAN_ALLOCA_LEFT: 85 case KASAN_ALLOCA_LEFT:
89 case KASAN_ALLOCA_RIGHT: 86 case KASAN_ALLOCA_RIGHT:
90 bug_type = "alloca-out-of-bounds"; 87 bug_type = "alloca-out-of-bounds";
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 45a1b5e38e1e..fcaa1ca03175 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -42,7 +42,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
42#else 42#else
43static inline bool kasan_p4d_table(pgd_t pgd) 43static inline bool kasan_p4d_table(pgd_t pgd)
44{ 44{
45 return 0; 45 return false;
46} 46}
47#endif 47#endif
48#if CONFIG_PGTABLE_LEVELS > 3 48#if CONFIG_PGTABLE_LEVELS > 3
@@ -54,7 +54,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
54#else 54#else
55static inline bool kasan_pud_table(p4d_t p4d) 55static inline bool kasan_pud_table(p4d_t p4d)
56{ 56{
57 return 0; 57 return false;
58} 58}
59#endif 59#endif
60#if CONFIG_PGTABLE_LEVELS > 2 60#if CONFIG_PGTABLE_LEVELS > 2
@@ -66,7 +66,7 @@ static inline bool kasan_pmd_table(pud_t pud)
66#else 66#else
67static inline bool kasan_pmd_table(pud_t pud) 67static inline bool kasan_pmd_table(pud_t pud)
68{ 68{
69 return 0; 69 return false;
70} 70}
71#endif 71#endif
72pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; 72pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea51b2d898ec..3e0c11f7d7a1 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -34,7 +34,6 @@
34#define KASAN_STACK_MID 0xF2 34#define KASAN_STACK_MID 0xF2
35#define KASAN_STACK_RIGHT 0xF3 35#define KASAN_STACK_RIGHT 0xF3
36#define KASAN_STACK_PARTIAL 0xF4 36#define KASAN_STACK_PARTIAL 0xF4
37#define KASAN_USE_AFTER_SCOPE 0xF8
38 37
39/* 38/*
40 * alloca redzone shadow values 39 * alloca redzone shadow values
@@ -187,8 +186,6 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size);
187void __asan_loadN(unsigned long addr, size_t size); 186void __asan_loadN(unsigned long addr, size_t size);
188void __asan_storeN(unsigned long addr, size_t size); 187void __asan_storeN(unsigned long addr, size_t size);
189void __asan_handle_no_return(void); 188void __asan_handle_no_return(void);
190void __asan_poison_stack_memory(const void *addr, size_t size);
191void __asan_unpoison_stack_memory(const void *addr, size_t size);
192void __asan_alloca_poison(unsigned long addr, size_t size); 189void __asan_alloca_poison(unsigned long addr, size_t size);
193void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); 190void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
194 191
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4f017339ddb2..449044378782 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1074,6 +1074,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1074 BUG_ON(!pmd_none(*pmd)); 1074 BUG_ON(!pmd_none(*pmd));
1075 page_add_new_anon_rmap(new_page, vma, address, true); 1075 page_add_new_anon_rmap(new_page, vma, address, true);
1076 mem_cgroup_commit_charge(new_page, memcg, false, true); 1076 mem_cgroup_commit_charge(new_page, memcg, false, true);
1077 count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
1077 lru_cache_add_active_or_unevictable(new_page, vma); 1078 lru_cache_add_active_or_unevictable(new_page, vma);
1078 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1079 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1079 set_pmd_at(mm, address, pmd, _pmd); 1080 set_pmd_at(mm, address, pmd, _pmd);
@@ -1502,6 +1503,7 @@ xa_unlocked:
1502 page_ref_add(new_page, HPAGE_PMD_NR - 1); 1503 page_ref_add(new_page, HPAGE_PMD_NR - 1);
1503 set_page_dirty(new_page); 1504 set_page_dirty(new_page);
1504 mem_cgroup_commit_charge(new_page, memcg, false, true); 1505 mem_cgroup_commit_charge(new_page, memcg, false, true);
1506 count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
1505 lru_cache_add_anon(new_page); 1507 lru_cache_add_anon(new_page);
1506 1508
1507 /* 1509 /*
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c48ad13b4c9..fc64874dc6f4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -598,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
598 chain->chain_prune_time = jiffies; 598 chain->chain_prune_time = jiffies;
599 chain->rmap_hlist_len = STABLE_NODE_CHAIN; 599 chain->rmap_hlist_len = STABLE_NODE_CHAIN;
600#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) 600#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
601 chain->nid = -1; /* debug */ 601 chain->nid = NUMA_NO_NODE; /* debug */
602#endif 602#endif
603 ksm_stable_node_chains++; 603 ksm_stable_node_chains++;
604 604
@@ -667,6 +667,12 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
667 free_stable_node(stable_node); 667 free_stable_node(stable_node);
668} 668}
669 669
670enum get_ksm_page_flags {
671 GET_KSM_PAGE_NOLOCK,
672 GET_KSM_PAGE_LOCK,
673 GET_KSM_PAGE_TRYLOCK
674};
675
670/* 676/*
671 * get_ksm_page: checks if the page indicated by the stable node 677 * get_ksm_page: checks if the page indicated by the stable node
672 * is still its ksm page, despite having held no reference to it. 678 * is still its ksm page, despite having held no reference to it.
@@ -686,7 +692,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
686 * a page to put something that might look like our key in page->mapping. 692 * a page to put something that might look like our key in page->mapping.
687 * is on its way to being freed; but it is an anomaly to bear in mind. 693 * is on its way to being freed; but it is an anomaly to bear in mind.
688 */ 694 */
689static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) 695static struct page *get_ksm_page(struct stable_node *stable_node,
696 enum get_ksm_page_flags flags)
690{ 697{
691 struct page *page; 698 struct page *page;
692 void *expected_mapping; 699 void *expected_mapping;
@@ -706,8 +713,9 @@ again:
706 * case this node is no longer referenced, and should be freed; 713 * case this node is no longer referenced, and should be freed;
707 * however, it might mean that the page is under page_ref_freeze(). 714 * however, it might mean that the page is under page_ref_freeze().
708 * The __remove_mapping() case is easy, again the node is now stale; 715 * The __remove_mapping() case is easy, again the node is now stale;
709 * but if page is swapcache in migrate_page_move_mapping(), it might 716 * the same is in reuse_ksm_page() case; but if page is swapcache
710 * still be our page, in which case it's essential to keep the node. 717 * in migrate_page_move_mapping(), it might still be our page,
718 * in which case it's essential to keep the node.
711 */ 719 */
712 while (!get_page_unless_zero(page)) { 720 while (!get_page_unless_zero(page)) {
713 /* 721 /*
@@ -728,8 +736,15 @@ again:
728 goto stale; 736 goto stale;
729 } 737 }
730 738
731 if (lock_it) { 739 if (flags == GET_KSM_PAGE_TRYLOCK) {
740 if (!trylock_page(page)) {
741 put_page(page);
742 return ERR_PTR(-EBUSY);
743 }
744 } else if (flags == GET_KSM_PAGE_LOCK)
732 lock_page(page); 745 lock_page(page);
746
747 if (flags != GET_KSM_PAGE_NOLOCK) {
733 if (READ_ONCE(page->mapping) != expected_mapping) { 748 if (READ_ONCE(page->mapping) != expected_mapping) {
734 unlock_page(page); 749 unlock_page(page);
735 put_page(page); 750 put_page(page);
@@ -763,7 +778,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
763 struct page *page; 778 struct page *page;
764 779
765 stable_node = rmap_item->head; 780 stable_node = rmap_item->head;
766 page = get_ksm_page(stable_node, true); 781 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
767 if (!page) 782 if (!page)
768 goto out; 783 goto out;
769 784
@@ -863,7 +878,7 @@ static int remove_stable_node(struct stable_node *stable_node)
863 struct page *page; 878 struct page *page;
864 int err; 879 int err;
865 880
866 page = get_ksm_page(stable_node, true); 881 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
867 if (!page) { 882 if (!page) {
868 /* 883 /*
869 * get_ksm_page did remove_node_from_stable_tree itself. 884 * get_ksm_page did remove_node_from_stable_tree itself.
@@ -1385,7 +1400,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1385 * stable_node parameter itself will be freed from 1400 * stable_node parameter itself will be freed from
1386 * under us if it returns NULL. 1401 * under us if it returns NULL.
1387 */ 1402 */
1388 _tree_page = get_ksm_page(dup, false); 1403 _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
1389 if (!_tree_page) 1404 if (!_tree_page)
1390 continue; 1405 continue;
1391 nr += 1; 1406 nr += 1;
@@ -1508,7 +1523,7 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1508 if (!is_stable_node_chain(stable_node)) { 1523 if (!is_stable_node_chain(stable_node)) {
1509 if (is_page_sharing_candidate(stable_node)) { 1524 if (is_page_sharing_candidate(stable_node)) {
1510 *_stable_node_dup = stable_node; 1525 *_stable_node_dup = stable_node;
1511 return get_ksm_page(stable_node, false); 1526 return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
1512 } 1527 }
1513 /* 1528 /*
1514 * _stable_node_dup set to NULL means the stable_node 1529 * _stable_node_dup set to NULL means the stable_node
@@ -1613,7 +1628,8 @@ again:
1613 * wrprotected at all times. Any will work 1628 * wrprotected at all times. Any will work
1614 * fine to continue the walk. 1629 * fine to continue the walk.
1615 */ 1630 */
1616 tree_page = get_ksm_page(stable_node_any, false); 1631 tree_page = get_ksm_page(stable_node_any,
1632 GET_KSM_PAGE_NOLOCK);
1617 } 1633 }
1618 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); 1634 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1619 if (!tree_page) { 1635 if (!tree_page) {
@@ -1673,7 +1689,12 @@ again:
1673 * It would be more elegant to return stable_node 1689 * It would be more elegant to return stable_node
1674 * than kpage, but that involves more changes. 1690 * than kpage, but that involves more changes.
1675 */ 1691 */
1676 tree_page = get_ksm_page(stable_node_dup, true); 1692 tree_page = get_ksm_page(stable_node_dup,
1693 GET_KSM_PAGE_TRYLOCK);
1694
1695 if (PTR_ERR(tree_page) == -EBUSY)
1696 return ERR_PTR(-EBUSY);
1697
1677 if (unlikely(!tree_page)) 1698 if (unlikely(!tree_page))
1678 /* 1699 /*
1679 * The tree may have been rebalanced, 1700 * The tree may have been rebalanced,
@@ -1842,7 +1863,8 @@ again:
1842 * wrprotected at all times. Any will work 1863 * wrprotected at all times. Any will work
1843 * fine to continue the walk. 1864 * fine to continue the walk.
1844 */ 1865 */
1845 tree_page = get_ksm_page(stable_node_any, false); 1866 tree_page = get_ksm_page(stable_node_any,
1867 GET_KSM_PAGE_NOLOCK);
1846 } 1868 }
1847 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); 1869 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1848 if (!tree_page) { 1870 if (!tree_page) {
@@ -2068,6 +2090,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2068 remove_rmap_item_from_tree(rmap_item); 2090 remove_rmap_item_from_tree(rmap_item);
2069 2091
2070 if (kpage) { 2092 if (kpage) {
2093 if (PTR_ERR(kpage) == -EBUSY)
2094 return;
2095
2071 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 2096 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2072 if (!err) { 2097 if (!err) {
2073 /* 2098 /*
@@ -2242,7 +2267,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2242 2267
2243 list_for_each_entry_safe(stable_node, next, 2268 list_for_each_entry_safe(stable_node, next,
2244 &migrate_nodes, list) { 2269 &migrate_nodes, list) {
2245 page = get_ksm_page(stable_node, false); 2270 page = get_ksm_page(stable_node,
2271 GET_KSM_PAGE_NOLOCK);
2246 if (page) 2272 if (page)
2247 put_page(page); 2273 put_page(page);
2248 cond_resched(); 2274 cond_resched();
@@ -2642,6 +2668,31 @@ again:
2642 goto again; 2668 goto again;
2643} 2669}
2644 2670
2671bool reuse_ksm_page(struct page *page,
2672 struct vm_area_struct *vma,
2673 unsigned long address)
2674{
2675#ifdef CONFIG_DEBUG_VM
2676 if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
2677 WARN_ON(!page_mapped(page)) ||
2678 WARN_ON(!PageLocked(page))) {
2679 dump_page(page, "reuse_ksm_page");
2680 return false;
2681 }
2682#endif
2683
2684 if (PageSwapCache(page) || !page_stable_node(page))
2685 return false;
2686 /* Prohibit parallel get_ksm_page() */
2687 if (!page_ref_freeze(page, 1))
2688 return false;
2689
2690 page_move_anon_rmap(page, vma);
2691 page->index = linear_page_index(vma, address);
2692 page_ref_unfreeze(page, 1);
2693
2694 return true;
2695}
2645#ifdef CONFIG_MIGRATION 2696#ifdef CONFIG_MIGRATION
2646void ksm_migrate_page(struct page *newpage, struct page *oldpage) 2697void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2647{ 2698{
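
The ksm.c hunks above replace get_ksm_page()'s bool lock_it parameter with an enum and let the stable-tree walk propagate -EBUSY when a trylock fails. A minimal illustrative caller, not taken from the patch (the function name is made up):

/*
 * Illustrative sketch only: how a ksm.c caller might use the new flags.
 * GET_KSM_PAGE_NOLOCK just takes a reference, GET_KSM_PAGE_LOCK also sleeps
 * on the page lock, and GET_KSM_PAGE_TRYLOCK returns ERR_PTR(-EBUSY) instead
 * of sleeping, which the stable-tree search above forwards to its caller.
 */
static void example_scan_stable_node(struct stable_node *stable_node)
{
	struct page *page;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_TRYLOCK);
	if (PTR_ERR(page) == -EBUSY)
		return;		/* lock contended: retry on a later pass */
	if (!page)
		return;		/* node was stale; get_ksm_page() removed it */

	/* here the page is referenced and locked */
	unlock_page(page);
	put_page(page);
}
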
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5b30625fd365..0730bf8ff39f 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -601,7 +601,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
601 struct lock_class_key *key, struct shrinker *shrinker) 601 struct lock_class_key *key, struct shrinker *shrinker)
602{ 602{
603 int i; 603 int i;
604 size_t size = sizeof(*lru->node) * nr_node_ids;
605 int err = -ENOMEM; 604 int err = -ENOMEM;
606 605
607#ifdef CONFIG_MEMCG_KMEM 606#ifdef CONFIG_MEMCG_KMEM
@@ -612,7 +611,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
612#endif 611#endif
613 memcg_get_cache_ids(); 612 memcg_get_cache_ids();
614 613
615 lru->node = kzalloc(size, GFP_KERNEL); 614 lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
616 if (!lru->node) 615 if (!lru->node)
617 goto out; 616 goto out;
618 617
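
The switch above from kzalloc(size * nr_node_ids, ...) to kcalloc() buys overflow checking on the multiplication: an overflowing n * size returns NULL rather than a silently truncated buffer. A hedged sketch of the idiom, with a hypothetical element type:

/* hypothetical element type; kcalloc() overflow-checks n * size and zeroes */
static struct foo *alloc_node_array(unsigned int nr)
{
	return kcalloc(nr, sizeof(struct foo), GFP_KERNEL);
}
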
diff --git a/mm/memblock.c b/mm/memblock.c
index ea31045ba704..470601115892 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2005,8 +2005,7 @@ DEFINE_SHOW_ATTRIBUTE(memblock_debug);
2005static int __init memblock_init_debugfs(void) 2005static int __init memblock_init_debugfs(void)
2006{ 2006{
2007 struct dentry *root = debugfs_create_dir("memblock", NULL); 2007 struct dentry *root = debugfs_create_dir("memblock", NULL);
2008 if (!root) 2008
2009 return -ENXIO;
2010 debugfs_create_file("memory", 0444, root, 2009 debugfs_create_file("memory", 0444, root,
2011 &memblock.memory, &memblock_debug_fops); 2010 &memblock.memory, &memblock_debug_fops);
2012 debugfs_create_file("reserved", 0444, root, 2011 debugfs_create_file("reserved", 0444, root,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index af7f18b32389..532e0e2a4817 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
39#include <linux/shmem_fs.h> 39#include <linux/shmem_fs.h>
40#include <linux/hugetlb.h> 40#include <linux/hugetlb.h>
41#include <linux/pagemap.h> 41#include <linux/pagemap.h>
42#include <linux/vm_event_item.h>
42#include <linux/smp.h> 43#include <linux/smp.h>
43#include <linux/page-flags.h> 44#include <linux/page-flags.h>
44#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
@@ -248,6 +249,12 @@ enum res_type {
248 iter != NULL; \ 249 iter != NULL; \
249 iter = mem_cgroup_iter(NULL, iter, NULL)) 250 iter = mem_cgroup_iter(NULL, iter, NULL))
250 251
252static inline bool should_force_charge(void)
253{
254 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
255 (current->flags & PF_EXITING);
256}
257
251/* Some nice accessors for the vmpressure. */ 258/* Some nice accessors for the vmpressure. */
252struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 259struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
253{ 260{
@@ -1389,8 +1396,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1389 }; 1396 };
1390 bool ret; 1397 bool ret;
1391 1398
1392 mutex_lock(&oom_lock); 1399 if (mutex_lock_killable(&oom_lock))
1393 ret = out_of_memory(&oc); 1400 return true;
1401 /*
1402 * A few threads which were not waiting at mutex_lock_killable() can
1403 * fail to bail out. Therefore, check again after holding oom_lock.
1404 */
1405 ret = should_force_charge() || out_of_memory(&oc);
1394 mutex_unlock(&oom_lock); 1406 mutex_unlock(&oom_lock);
1395 return ret; 1407 return ret;
1396} 1408}
@@ -2209,9 +2221,7 @@ retry:
2209 * bypass the last charges so that they can exit quickly and 2221 * bypass the last charges so that they can exit quickly and
2210 * free their memory. 2222 * free their memory.
2211 */ 2223 */
2212 if (unlikely(tsk_is_oom_victim(current) || 2224 if (unlikely(should_force_charge()))
2213 fatal_signal_pending(current) ||
2214 current->flags & PF_EXITING))
2215 goto force; 2225 goto force;
2216 2226
2217 /* 2227 /*
@@ -2352,13 +2362,13 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2352 2362
2353static void lock_page_lru(struct page *page, int *isolated) 2363static void lock_page_lru(struct page *page, int *isolated)
2354{ 2364{
2355 struct zone *zone = page_zone(page); 2365 pg_data_t *pgdat = page_pgdat(page);
2356 2366
2357 spin_lock_irq(zone_lru_lock(zone)); 2367 spin_lock_irq(&pgdat->lru_lock);
2358 if (PageLRU(page)) { 2368 if (PageLRU(page)) {
2359 struct lruvec *lruvec; 2369 struct lruvec *lruvec;
2360 2370
2361 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2371 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2362 ClearPageLRU(page); 2372 ClearPageLRU(page);
2363 del_page_from_lru_list(page, lruvec, page_lru(page)); 2373 del_page_from_lru_list(page, lruvec, page_lru(page));
2364 *isolated = 1; 2374 *isolated = 1;
@@ -2368,17 +2378,17 @@ static void lock_page_lru(struct page *page, int *isolated)
2368 2378
2369static void unlock_page_lru(struct page *page, int isolated) 2379static void unlock_page_lru(struct page *page, int isolated)
2370{ 2380{
2371 struct zone *zone = page_zone(page); 2381 pg_data_t *pgdat = page_pgdat(page);
2372 2382
2373 if (isolated) { 2383 if (isolated) {
2374 struct lruvec *lruvec; 2384 struct lruvec *lruvec;
2375 2385
2376 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2386 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2377 VM_BUG_ON_PAGE(PageLRU(page), page); 2387 VM_BUG_ON_PAGE(PageLRU(page), page);
2378 SetPageLRU(page); 2388 SetPageLRU(page);
2379 add_page_to_lru_list(page, lruvec, page_lru(page)); 2389 add_page_to_lru_list(page, lruvec, page_lru(page));
2380 } 2390 }
2381 spin_unlock_irq(zone_lru_lock(zone)); 2391 spin_unlock_irq(&pgdat->lru_lock);
2382} 2392}
2383 2393
2384static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2394static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2573,7 +2583,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
2573} 2583}
2574 2584
2575/** 2585/**
2576 * memcg_kmem_charge_memcg: charge a kmem page 2586 * __memcg_kmem_charge_memcg: charge a kmem page
2577 * @page: page to charge 2587 * @page: page to charge
2578 * @gfp: reclaim mode 2588 * @gfp: reclaim mode
2579 * @order: allocation order 2589 * @order: allocation order
@@ -2581,7 +2591,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
2581 * 2591 *
2582 * Returns 0 on success, an error code on failure. 2592 * Returns 0 on success, an error code on failure.
2583 */ 2593 */
2584int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 2594int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2585 struct mem_cgroup *memcg) 2595 struct mem_cgroup *memcg)
2586{ 2596{
2587 unsigned int nr_pages = 1 << order; 2597 unsigned int nr_pages = 1 << order;
@@ -2604,24 +2614,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2604} 2614}
2605 2615
2606/** 2616/**
2607 * memcg_kmem_charge: charge a kmem page to the current memory cgroup 2617 * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
2608 * @page: page to charge 2618 * @page: page to charge
2609 * @gfp: reclaim mode 2619 * @gfp: reclaim mode
2610 * @order: allocation order 2620 * @order: allocation order
2611 * 2621 *
2612 * Returns 0 on success, an error code on failure. 2622 * Returns 0 on success, an error code on failure.
2613 */ 2623 */
2614int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 2624int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2615{ 2625{
2616 struct mem_cgroup *memcg; 2626 struct mem_cgroup *memcg;
2617 int ret = 0; 2627 int ret = 0;
2618 2628
2619 if (mem_cgroup_disabled() || memcg_kmem_bypass()) 2629 if (memcg_kmem_bypass())
2620 return 0; 2630 return 0;
2621 2631
2622 memcg = get_mem_cgroup_from_current(); 2632 memcg = get_mem_cgroup_from_current();
2623 if (!mem_cgroup_is_root(memcg)) { 2633 if (!mem_cgroup_is_root(memcg)) {
2624 ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); 2634 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2625 if (!ret) 2635 if (!ret)
2626 __SetPageKmemcg(page); 2636 __SetPageKmemcg(page);
2627 } 2637 }
@@ -2629,11 +2639,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2629 return ret; 2639 return ret;
2630} 2640}
2631/** 2641/**
2632 * memcg_kmem_uncharge: uncharge a kmem page 2642 * __memcg_kmem_uncharge: uncharge a kmem page
2633 * @page: page to uncharge 2643 * @page: page to uncharge
2634 * @order: allocation order 2644 * @order: allocation order
2635 */ 2645 */
2636void memcg_kmem_uncharge(struct page *page, int order) 2646void __memcg_kmem_uncharge(struct page *page, int order)
2637{ 2647{
2638 struct mem_cgroup *memcg = page->mem_cgroup; 2648 struct mem_cgroup *memcg = page->mem_cgroup;
2639 unsigned int nr_pages = 1 << order; 2649 unsigned int nr_pages = 1 << order;
@@ -2664,7 +2674,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
2664 2674
2665/* 2675/*
2666 * Because tail pages are not marked as "used", set it. We're under 2676 * Because tail pages are not marked as "used", set it. We're under
2667 * zone_lru_lock and migration entries setup in all page mappings. 2677 * pgdat->lru_lock and migration entries setup in all page mappings.
2668 */ 2678 */
2669void mem_cgroup_split_huge_fixup(struct page *head) 2679void mem_cgroup_split_huge_fixup(struct page *head)
2670{ 2680{
@@ -3337,7 +3347,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
3337 const struct numa_stat *stat; 3347 const struct numa_stat *stat;
3338 int nid; 3348 int nid;
3339 unsigned long nr; 3349 unsigned long nr;
3340 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3350 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3341 3351
3342 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3352 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3343 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3353 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -3388,7 +3398,7 @@ static const char *const memcg1_event_names[] = {
3388 3398
3389static int memcg_stat_show(struct seq_file *m, void *v) 3399static int memcg_stat_show(struct seq_file *m, void *v)
3390{ 3400{
3391 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3401 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3392 unsigned long memory, memsw; 3402 unsigned long memory, memsw;
3393 struct mem_cgroup *mi; 3403 struct mem_cgroup *mi;
3394 unsigned int i; 3404 unsigned int i;
@@ -3626,8 +3636,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3626 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3636 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3627 3637
3628 /* Allocate memory for new array of thresholds */ 3638 /* Allocate memory for new array of thresholds */
3629 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3639 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
3630 GFP_KERNEL);
3631 if (!new) { 3640 if (!new) {
3632 ret = -ENOMEM; 3641 ret = -ENOMEM;
3633 goto unlock; 3642 goto unlock;
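
The hunk above replaces open-coded sizeof arithmetic with struct_size() from <linux/overflow.h>, which evaluates to sizeof(*p) + n * sizeof(*p->member) and saturates on overflow so the allocation fails cleanly. A sketch under the assumption of a struct ending in a flexible array; the layout below is illustrative, not the real mem_cgroup_threshold_ary:

/* made-up container for the example, not the kernel's definition */
struct thresholds_example {
	unsigned int size;
	struct mem_cgroup_threshold entries[];
};

static struct thresholds_example *alloc_thresholds(unsigned int size)
{
	struct thresholds_example *new;

	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
	return new;	/* NULL if size made the total overflow */
}
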
@@ -3821,7 +3830,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3821 3830
3822static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 3831static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3823{ 3832{
3824 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3833 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
3825 3834
3826 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3835 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3827 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 3836 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
@@ -4420,7 +4429,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
4420static struct mem_cgroup *mem_cgroup_alloc(void) 4429static struct mem_cgroup *mem_cgroup_alloc(void)
4421{ 4430{
4422 struct mem_cgroup *memcg; 4431 struct mem_cgroup *memcg;
4423 size_t size; 4432 unsigned int size;
4424 int node; 4433 int node;
4425 4434
4426 size = sizeof(struct mem_cgroup); 4435 size = sizeof(struct mem_cgroup);
@@ -5354,6 +5363,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5354 root_mem_cgroup->use_hierarchy = false; 5363 root_mem_cgroup->use_hierarchy = false;
5355} 5364}
5356 5365
5366static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
5367{
5368 if (value == PAGE_COUNTER_MAX)
5369 seq_puts(m, "max\n");
5370 else
5371 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
5372
5373 return 0;
5374}
5375
5357static u64 memory_current_read(struct cgroup_subsys_state *css, 5376static u64 memory_current_read(struct cgroup_subsys_state *css,
5358 struct cftype *cft) 5377 struct cftype *cft)
5359{ 5378{
@@ -5364,15 +5383,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
5364 5383
5365static int memory_min_show(struct seq_file *m, void *v) 5384static int memory_min_show(struct seq_file *m, void *v)
5366{ 5385{
5367 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5386 return seq_puts_memcg_tunable(m,
5368 unsigned long min = READ_ONCE(memcg->memory.min); 5387 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
5369
5370 if (min == PAGE_COUNTER_MAX)
5371 seq_puts(m, "max\n");
5372 else
5373 seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5374
5375 return 0;
5376} 5388}
5377 5389
5378static ssize_t memory_min_write(struct kernfs_open_file *of, 5390static ssize_t memory_min_write(struct kernfs_open_file *of,
@@ -5394,15 +5406,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of,
5394 5406
5395static int memory_low_show(struct seq_file *m, void *v) 5407static int memory_low_show(struct seq_file *m, void *v)
5396{ 5408{
5397 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5409 return seq_puts_memcg_tunable(m,
5398 unsigned long low = READ_ONCE(memcg->memory.low); 5410 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
5399
5400 if (low == PAGE_COUNTER_MAX)
5401 seq_puts(m, "max\n");
5402 else
5403 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5404
5405 return 0;
5406} 5411}
5407 5412
5408static ssize_t memory_low_write(struct kernfs_open_file *of, 5413static ssize_t memory_low_write(struct kernfs_open_file *of,
@@ -5424,15 +5429,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
5424 5429
5425static int memory_high_show(struct seq_file *m, void *v) 5430static int memory_high_show(struct seq_file *m, void *v)
5426{ 5431{
5427 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5432 return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
5428 unsigned long high = READ_ONCE(memcg->high);
5429
5430 if (high == PAGE_COUNTER_MAX)
5431 seq_puts(m, "max\n");
5432 else
5433 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5434
5435 return 0;
5436} 5433}
5437 5434
5438static ssize_t memory_high_write(struct kernfs_open_file *of, 5435static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -5461,15 +5458,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
5461 5458
5462static int memory_max_show(struct seq_file *m, void *v) 5459static int memory_max_show(struct seq_file *m, void *v)
5463{ 5460{
5464 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5461 return seq_puts_memcg_tunable(m,
5465 unsigned long max = READ_ONCE(memcg->memory.max); 5462 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
5466
5467 if (max == PAGE_COUNTER_MAX)
5468 seq_puts(m, "max\n");
5469 else
5470 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5471
5472 return 0;
5473} 5463}
5474 5464
5475static ssize_t memory_max_write(struct kernfs_open_file *of, 5465static ssize_t memory_max_write(struct kernfs_open_file *of,
@@ -5523,7 +5513,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
5523 5513
5524static int memory_events_show(struct seq_file *m, void *v) 5514static int memory_events_show(struct seq_file *m, void *v)
5525{ 5515{
5526 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5516 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5527 5517
5528 seq_printf(m, "low %lu\n", 5518 seq_printf(m, "low %lu\n",
5529 atomic_long_read(&memcg->memory_events[MEMCG_LOW])); 5519 atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
@@ -5541,7 +5531,7 @@ static int memory_events_show(struct seq_file *m, void *v)
5541 5531
5542static int memory_stat_show(struct seq_file *m, void *v) 5532static int memory_stat_show(struct seq_file *m, void *v)
5543{ 5533{
5544 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5534 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5545 struct accumulated_stats acc; 5535 struct accumulated_stats acc;
5546 int i; 5536 int i;
5547 5537
@@ -5582,6 +5572,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
5582 seq_printf(m, "file_writeback %llu\n", 5572 seq_printf(m, "file_writeback %llu\n",
5583 (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); 5573 (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5584 5574
5575 /*
5576 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
5577 * with the NR_ANON_THP vm counter, but right now it's a pain in the
5578 * arse because it requires migrating the work out of rmap to a place
5579 * where the page->mem_cgroup is set up and stable.
5580 */
5581 seq_printf(m, "anon_thp %llu\n",
5582 (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
5583
5585 for (i = 0; i < NR_LRU_LISTS; i++) 5584 for (i = 0; i < NR_LRU_LISTS; i++)
5586 seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], 5585 seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5587 (u64)acc.lru_pages[i] * PAGE_SIZE); 5586 (u64)acc.lru_pages[i] * PAGE_SIZE);
@@ -5613,12 +5612,18 @@ static int memory_stat_show(struct seq_file *m, void *v)
5613 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); 5612 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5614 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); 5613 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5615 5614
5615#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5616 seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
5617 seq_printf(m, "thp_collapse_alloc %lu\n",
5618 acc.events[THP_COLLAPSE_ALLOC]);
5619#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
5620
5616 return 0; 5621 return 0;
5617} 5622}
5618 5623
5619static int memory_oom_group_show(struct seq_file *m, void *v) 5624static int memory_oom_group_show(struct seq_file *m, void *v)
5620{ 5625{
5621 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5626 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5622 5627
5623 seq_printf(m, "%d\n", memcg->oom_group); 5628 seq_printf(m, "%d\n", memcg->oom_group);
5624 5629
@@ -5747,7 +5752,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
5747 * 5752 *
5748 * | memory.current, if memory.current < memory.low 5753 * | memory.current, if memory.current < memory.low
5749 * low_usage = | 5754 * low_usage = |
5750 | 0, otherwise. 5755 * | 0, otherwise.
5751 * 5756 *
5752 * 5757 *
5753 * Such definition of the effective memory.low provides the expected 5758 * Such definition of the effective memory.low provides the expected
@@ -6601,15 +6606,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
6601 6606
6602static int swap_max_show(struct seq_file *m, void *v) 6607static int swap_max_show(struct seq_file *m, void *v)
6603{ 6608{
6604 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 6609 return seq_puts_memcg_tunable(m,
6605 unsigned long max = READ_ONCE(memcg->swap.max); 6610 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
6606
6607 if (max == PAGE_COUNTER_MAX)
6608 seq_puts(m, "max\n");
6609 else
6610 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6611
6612 return 0;
6613} 6611}
6614 6612
6615static ssize_t swap_max_write(struct kernfs_open_file *of, 6613static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6631,7 +6629,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
6631 6629
6632static int swap_events_show(struct seq_file *m, void *v) 6630static int swap_events_show(struct seq_file *m, void *v)
6633{ 6631{
6634 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 6632 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6635 6633
6636 seq_printf(m, "max %lu\n", 6634 seq_printf(m, "max %lu\n",
6637 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 6635 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
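
Besides the __memcg_kmem_* renames and the lru_lock conversion, the memcontrol.c hunks above surface anon_thp, thp_fault_alloc and thp_collapse_alloc in cgroup v2 memory.stat (the latter two only with CONFIG_TRANSPARENT_HUGEPAGE). A hedged userspace sketch that merely reads those lines; the cgroup path is an example, not mandated by the patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/example/memory.stat", "r");
	char line[256];

	if (!f)
		return 1;
	/* print only the THP-related counters added by this series */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "anon_thp ", 9) || !strncmp(line, "thp_", 4))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
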
diff --git a/mm/memfd.c b/mm/memfd.c
index 97264c79d2cd..650e65a46b9c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
131#define F_ALL_SEALS (F_SEAL_SEAL | \ 131#define F_ALL_SEALS (F_SEAL_SEAL | \
132 F_SEAL_SHRINK | \ 132 F_SEAL_SHRINK | \
133 F_SEAL_GROW | \ 133 F_SEAL_GROW | \
134 F_SEAL_WRITE) 134 F_SEAL_WRITE | \
135 F_SEAL_FUTURE_WRITE)
135 136
136static int memfd_add_seals(struct file *file, unsigned int seals) 137static int memfd_add_seals(struct file *file, unsigned int seals)
137{ 138{
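
The memfd hunk above accepts the new F_SEAL_FUTURE_WRITE seal, which blocks future write access while leaving existing writable mappings intact. A hedged userspace sketch, assuming headers that already define the flag; error handling omitted:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("buf", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	char *map;

	ftruncate(fd, 4096);
	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);

	map[0] = 1;	/* the pre-existing mapping stays writable */
	/* new PROT_WRITE mmap()s and write() on fd now fail with EPERM */
	return 0;
}
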
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 831be5ff5f4d..fc8b51744579 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1825,19 +1825,17 @@ static int soft_offline_in_use_page(struct page *page, int flags)
1825 struct page *hpage = compound_head(page); 1825 struct page *hpage = compound_head(page);
1826 1826
1827 if (!PageHuge(page) && PageTransHuge(hpage)) { 1827 if (!PageHuge(page) && PageTransHuge(hpage)) {
1828 lock_page(hpage); 1828 lock_page(page);
1829 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { 1829 if (!PageAnon(page) || unlikely(split_huge_page(page))) {
1830 unlock_page(hpage); 1830 unlock_page(page);
1831 if (!PageAnon(hpage)) 1831 if (!PageAnon(page))
1832 pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); 1832 pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1833 else 1833 else
1834 pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); 1834 pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1835 put_hwpoison_page(hpage); 1835 put_hwpoison_page(page);
1836 return -EBUSY; 1836 return -EBUSY;
1837 } 1837 }
1838 unlock_page(hpage); 1838 unlock_page(page);
1839 get_hwpoison_page(page);
1840 put_hwpoison_page(hpage);
1841 } 1839 }
1842 1840
1843 /* 1841 /*
diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..47fe250307c7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,7 @@
69#include <linux/userfaultfd_k.h> 69#include <linux/userfaultfd_k.h>
70#include <linux/dax.h> 70#include <linux/dax.h>
71#include <linux/oom.h> 71#include <linux/oom.h>
72#include <linux/numa.h>
72 73
73#include <asm/io.h> 74#include <asm/io.h>
74#include <asm/mmu_context.h> 75#include <asm/mmu_context.h>
@@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1451 spinlock_t *ptl; 1452 spinlock_t *ptl;
1452 1453
1453 retval = -EINVAL; 1454 retval = -EINVAL;
1454 if (PageAnon(page)) 1455 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1455 goto out; 1456 goto out;
1456 retval = -ENOMEM; 1457 retval = -ENOMEM;
1457 flush_dcache_page(page); 1458 flush_dcache_page(page);
@@ -1503,6 +1504,8 @@ out:
1503 * under mm->mmap_sem write-lock, so it can change vma->vm_flags. 1504 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
1504 * Caller must set VM_MIXEDMAP on vma if it wants to call this 1505 * Caller must set VM_MIXEDMAP on vma if it wants to call this
1505 * function from other places, for example from page-fault handler. 1506 * function from other places, for example from page-fault handler.
1507 *
1508 * Return: %0 on success, negative error code otherwise.
1506 */ 1509 */
1507int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 1510int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1508 struct page *page) 1511 struct page *page)
@@ -1830,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
1830 * @size: size of map area 1833 * @size: size of map area
1831 * @prot: page protection flags for this mapping 1834 * @prot: page protection flags for this mapping
1832 * 1835 *
1833 * Note: this is only safe if the mm semaphore is held when called. 1836 * Note: this is only safe if the mm semaphore is held when called.
1837 *
1838 * Return: %0 on success, negative error code otherwise.
1834 */ 1839 */
1835int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1840int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1836 unsigned long pfn, unsigned long size, pgprot_t prot) 1841 unsigned long pfn, unsigned long size, pgprot_t prot)
@@ -1903,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range);
1903 * 1908 *
1904 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get 1909 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
1905 * whatever write-combining details or similar. 1910 * whatever write-combining details or similar.
1911 *
1912 * Return: %0 on success, negative error code otherwise.
1906 */ 1913 */
1907int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 1914int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1908{ 1915{
@@ -2381,12 +2388,13 @@ oom:
2381 * 2388 *
2382 * This function handles all that is needed to finish a write page fault in a 2389 * This function handles all that is needed to finish a write page fault in a
2383 * shared mapping due to PTE being read-only once the mapped page is prepared. 2390 * shared mapping due to PTE being read-only once the mapped page is prepared.
2384 * It handles locking of PTE and modifying it. The function returns 2391 * It handles locking of PTE and modifying it.
2385 * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
2386 * lock.
2387 * 2392 *
2388 * The function expects the page to be locked or other protection against 2393 * The function expects the page to be locked or other protection against
2389 * concurrent faults / writeback (such as DAX radix tree locks). 2394 * concurrent faults / writeback (such as DAX radix tree locks).
2395 *
2396 * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
2397 * we acquired PTE lock.
2390 */ 2398 */
2391vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) 2399vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2392{ 2400{
@@ -2504,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
2504 * Take out anonymous pages first, anonymous shared vmas are 2512 * Take out anonymous pages first, anonymous shared vmas are
2505 * not dirty accountable. 2513 * not dirty accountable.
2506 */ 2514 */
2507 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { 2515 if (PageAnon(vmf->page)) {
2508 int total_map_swapcount; 2516 int total_map_swapcount;
2517 if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
2518 page_count(vmf->page) != 1))
2519 goto copy;
2509 if (!trylock_page(vmf->page)) { 2520 if (!trylock_page(vmf->page)) {
2510 get_page(vmf->page); 2521 get_page(vmf->page);
2511 pte_unmap_unlock(vmf->pte, vmf->ptl); 2522 pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2520,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
2520 } 2531 }
2521 put_page(vmf->page); 2532 put_page(vmf->page);
2522 } 2533 }
2534 if (PageKsm(vmf->page)) {
2535 bool reused = reuse_ksm_page(vmf->page, vmf->vma,
2536 vmf->address);
2537 unlock_page(vmf->page);
2538 if (!reused)
2539 goto copy;
2540 wp_page_reuse(vmf);
2541 return VM_FAULT_WRITE;
2542 }
2523 if (reuse_swap_page(vmf->page, &total_map_swapcount)) { 2543 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2524 if (total_map_swapcount == 1) { 2544 if (total_map_swapcount == 1) {
2525 /* 2545 /*
@@ -2540,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
2540 (VM_WRITE|VM_SHARED))) { 2560 (VM_WRITE|VM_SHARED))) {
2541 return wp_page_shared(vmf); 2561 return wp_page_shared(vmf);
2542 } 2562 }
2543 2563copy:
2544 /* 2564 /*
2545 * Ok, we need to copy. Oh, well.. 2565 * Ok, we need to copy. Oh, well..
2546 */ 2566 */
@@ -3201,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3201 * 3221 *
3202 * Target users are page handler itself and implementations of 3222 * Target users are page handler itself and implementations of
3203 * vm_ops->map_pages. 3223 * vm_ops->map_pages.
3224 *
3225 * Return: %0 on success, %VM_FAULT_ code in case of error.
3204 */ 3226 */
3205vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, 3227vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3206 struct page *page) 3228 struct page *page)
@@ -3261,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3261 * This function handles all that is needed to finish a page fault once the 3283 * This function handles all that is needed to finish a page fault once the
3262 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for 3284 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
3263 * given page, adds reverse page mapping, handles memcg charges and LRU 3285 * given page, adds reverse page mapping, handles memcg charges and LRU
3264 * addition. The function returns 0 on success, VM_FAULT_ code in case of 3286 * addition.
3265 * error.
3266 * 3287 *
3267 * The function expects the page to be locked and on success it consumes a 3288 * The function expects the page to be locked and on success it consumes a
3268 * reference of a page being mapped (for the PTE which maps it). 3289 * reference of a page being mapped (for the PTE which maps it).
3290 *
3291 * Return: %0 on success, %VM_FAULT_ code in case of error.
3269 */ 3292 */
3270vm_fault_t finish_fault(struct vm_fault *vmf) 3293vm_fault_t finish_fault(struct vm_fault *vmf)
3271{ 3294{
@@ -3321,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3321 3344
3322static int __init fault_around_debugfs(void) 3345static int __init fault_around_debugfs(void)
3323{ 3346{
3324 void *ret; 3347 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3325 3348 &fault_around_bytes_fops);
3326 ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3327 &fault_around_bytes_fops);
3328 if (!ret)
3329 pr_warn("Failed to create fault_around_bytes in debugfs");
3330 return 0; 3349 return 0;
3331} 3350}
3332late_initcall(fault_around_debugfs); 3351late_initcall(fault_around_debugfs);
@@ -3517,10 +3536,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3517 * but allow concurrent faults). 3536 * but allow concurrent faults).
3518 * The mmap_sem may have been released depending on flags and our 3537 * The mmap_sem may have been released depending on flags and our
3519 * return value. See filemap_fault() and __lock_page_or_retry(). 3538 * return value. See filemap_fault() and __lock_page_or_retry().
3539 * If mmap_sem is released, vma may become invalid (for example
3540 * by other thread calling munmap()).
3520 */ 3541 */
3521static vm_fault_t do_fault(struct vm_fault *vmf) 3542static vm_fault_t do_fault(struct vm_fault *vmf)
3522{ 3543{
3523 struct vm_area_struct *vma = vmf->vma; 3544 struct vm_area_struct *vma = vmf->vma;
3545 struct mm_struct *vm_mm = vma->vm_mm;
3524 vm_fault_t ret; 3546 vm_fault_t ret;
3525 3547
3526 /* 3548 /*
@@ -3561,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
3561 3583
3562 /* preallocated pagetable is unused: free it */ 3584 /* preallocated pagetable is unused: free it */
3563 if (vmf->prealloc_pte) { 3585 if (vmf->prealloc_pte) {
3564 pte_free(vma->vm_mm, vmf->prealloc_pte); 3586 pte_free(vm_mm, vmf->prealloc_pte);
3565 vmf->prealloc_pte = NULL; 3587 vmf->prealloc_pte = NULL;
3566 } 3588 }
3567 return ret; 3589 return ret;
@@ -3586,11 +3608,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
3586{ 3608{
3587 struct vm_area_struct *vma = vmf->vma; 3609 struct vm_area_struct *vma = vmf->vma;
3588 struct page *page = NULL; 3610 struct page *page = NULL;
3589 int page_nid = -1; 3611 int page_nid = NUMA_NO_NODE;
3590 int last_cpupid; 3612 int last_cpupid;
3591 int target_nid; 3613 int target_nid;
3592 bool migrated = false; 3614 bool migrated = false;
3593 pte_t pte; 3615 pte_t pte, old_pte;
3594 bool was_writable = pte_savedwrite(vmf->orig_pte); 3616 bool was_writable = pte_savedwrite(vmf->orig_pte);
3595 int flags = 0; 3617 int flags = 0;
3596 3618
@@ -3610,12 +3632,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
3610 * Make it present again, Depending on how arch implementes non 3632 * Make it present again, Depending on how arch implementes non
3611 * accessible ptes, some can allow access by kernel mode. 3633 * accessible ptes, some can allow access by kernel mode.
3612 */ 3634 */
3613 pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); 3635 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
3614 pte = pte_modify(pte, vma->vm_page_prot); 3636 pte = pte_modify(old_pte, vma->vm_page_prot);
3615 pte = pte_mkyoung(pte); 3637 pte = pte_mkyoung(pte);
3616 if (was_writable) 3638 if (was_writable)
3617 pte = pte_mkwrite(pte); 3639 pte = pte_mkwrite(pte);
3618 ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); 3640 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
3619 update_mmu_cache(vma, vmf->address, vmf->pte); 3641 update_mmu_cache(vma, vmf->address, vmf->pte);
3620 3642
3621 page = vm_normal_page(vma, vmf->address, pte); 3643 page = vm_normal_page(vma, vmf->address, pte);
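
The do_numa_page() hunk above reflects the reworked ptep_modify_prot_start()/_commit() contract: both now take the vma, and the old pte returned by start() must be handed back to commit() so architectures can carry over hardware-managed bits from the original entry. An illustrative caller, not from the patch:

/* sketch only: mark a pte young under the new prot-modify API */
static void example_touch_pte(struct vm_area_struct *vma, unsigned long addr,
			      pte_t *ptep)
{
	pte_t old_pte, pte;

	old_pte = ptep_modify_prot_start(vma, addr, ptep);
	pte = pte_mkyoung(pte_modify(old_pte, vma->vm_page_prot));
	ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
}
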
@@ -3653,7 +3675,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
3653 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, 3675 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3654 &flags); 3676 &flags);
3655 pte_unmap_unlock(vmf->pte, vmf->ptl); 3677 pte_unmap_unlock(vmf->pte, vmf->ptl);
3656 if (target_nid == -1) { 3678 if (target_nid == NUMA_NO_NODE) {
3657 put_page(page); 3679 put_page(page);
3658 goto out; 3680 goto out;
3659 } 3681 }
@@ -3667,7 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
3667 flags |= TNF_MIGRATE_FAIL; 3689 flags |= TNF_MIGRATE_FAIL;
3668 3690
3669out: 3691out:
3670 if (page_nid != -1) 3692 if (page_nid != NUMA_NO_NODE)
3671 task_numa_fault(last_cpupid, page_nid, 1, flags); 3693 task_numa_fault(last_cpupid, page_nid, 1, flags);
3672 return 0; 3694 return 0;
3673} 3695}
@@ -4150,7 +4172,7 @@ EXPORT_SYMBOL(follow_pte_pmd);
4150 * 4172 *
4151 * Only IO mappings and raw PFN mappings are allowed. 4173 * Only IO mappings and raw PFN mappings are allowed.
4152 * 4174 *
4153 * Returns zero and the pfn at @pfn on success, -ve otherwise. 4175 * Return: zero and the pfn at @pfn on success, -ve otherwise.
4154 */ 4176 */
4155int follow_pfn(struct vm_area_struct *vma, unsigned long address, 4177int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4156 unsigned long *pfn) 4178 unsigned long *pfn)
@@ -4300,6 +4322,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4300 * @gup_flags: flags modifying lookup behaviour 4322 * @gup_flags: flags modifying lookup behaviour
4301 * 4323 *
4302 * The caller must hold a reference on @mm. 4324 * The caller must hold a reference on @mm.
4325 *
4326 * Return: number of bytes copied from source to destination.
4303 */ 4327 */
4304int access_remote_vm(struct mm_struct *mm, unsigned long addr, 4328int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4305 void *buf, int len, unsigned int gup_flags) 4329 void *buf, int len, unsigned int gup_flags)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1ad28323fb9f..6b05576fb4ec 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -47,7 +47,7 @@
47 * and restore_online_page_callback() for generic callback restore. 47 * and restore_online_page_callback() for generic callback restore.
48 */ 48 */
49 49
50static void generic_online_page(struct page *page); 50static void generic_online_page(struct page *page, unsigned int order);
51 51
52static online_page_callback_t online_page_callback = generic_online_page; 52static online_page_callback_t online_page_callback = generic_online_page;
53static DEFINE_MUTEX(online_page_callback_lock); 53static DEFINE_MUTEX(online_page_callback_lock);
@@ -656,26 +656,40 @@ void __online_page_free(struct page *page)
656} 656}
657EXPORT_SYMBOL_GPL(__online_page_free); 657EXPORT_SYMBOL_GPL(__online_page_free);
658 658
659static void generic_online_page(struct page *page) 659static void generic_online_page(struct page *page, unsigned int order)
660{ 660{
661 __online_page_set_limits(page); 661 kernel_map_pages(page, 1 << order, 1);
662 __online_page_increment_counters(page); 662 __free_pages_core(page, order);
663 __online_page_free(page); 663 totalram_pages_add(1UL << order);
664#ifdef CONFIG_HIGHMEM
665 if (PageHighMem(page))
666 totalhigh_pages_add(1UL << order);
667#endif
668}
669
670static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
671{
672 unsigned long end = start + nr_pages;
673 int order, onlined_pages = 0;
674
675 while (start < end) {
676 order = min(MAX_ORDER - 1,
677 get_order(PFN_PHYS(end) - PFN_PHYS(start)));
678 (*online_page_callback)(pfn_to_page(start), order);
679
680 onlined_pages += (1UL << order);
681 start += (1UL << order);
682 }
683 return onlined_pages;
664} 684}
665 685
666static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 686static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
667 void *arg) 687 void *arg)
668{ 688{
669 unsigned long i;
670 unsigned long onlined_pages = *(unsigned long *)arg; 689 unsigned long onlined_pages = *(unsigned long *)arg;
671 struct page *page;
672 690
673 if (PageReserved(pfn_to_page(start_pfn))) 691 if (PageReserved(pfn_to_page(start_pfn)))
674 for (i = 0; i < nr_pages; i++) { 692 onlined_pages += online_pages_blocks(start_pfn, nr_pages);
675 page = pfn_to_page(start_pfn + i);
676 (*online_page_callback)(page);
677 onlined_pages++;
678 }
679 693
680 online_mem_sections(start_pfn, start_pfn + nr_pages); 694 online_mem_sections(start_pfn, start_pfn + nr_pages);
681 695
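
The hunk above onlines hotplugged memory in max-order blocks instead of page by page. A hedged worked example with made-up numbers (a 128 MiB section, i.e. 0x8000 4 KiB pages, and MAX_ORDER - 1 == 10):

unsigned long start = section_start_pfn;	/* hypothetical start pfn */
unsigned long end = start + 0x8000;
unsigned int calls = 0;

while (start < end) {
	int order = min(MAX_ORDER - 1,
			get_order(PFN_PHYS(end) - PFN_PHYS(start)));

	calls++;			/* one (*online_page_callback)() per block */
	start += 1UL << order;
}
/* calls == 32 here: 1024-page blocks instead of 32768 per-page callbacks */
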
@@ -689,9 +703,9 @@ static void node_states_check_changes_online(unsigned long nr_pages,
689{ 703{
690 int nid = zone_to_nid(zone); 704 int nid = zone_to_nid(zone);
691 705
692 arg->status_change_nid = -1; 706 arg->status_change_nid = NUMA_NO_NODE;
693 arg->status_change_nid_normal = -1; 707 arg->status_change_nid_normal = NUMA_NO_NODE;
694 arg->status_change_nid_high = -1; 708 arg->status_change_nid_high = NUMA_NO_NODE;
695 709
696 if (!node_state(nid, N_MEMORY)) 710 if (!node_state(nid, N_MEMORY))
697 arg->status_change_nid = nid; 711 arg->status_change_nid = nid;
@@ -1365,12 +1379,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1365 1379
1366 if (PageHuge(page)) { 1380 if (PageHuge(page)) {
1367 struct page *head = compound_head(page); 1381 struct page *head = compound_head(page);
1368 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1369 if (compound_order(head) > PFN_SECTION_SHIFT) { 1382 if (compound_order(head) > PFN_SECTION_SHIFT) {
1370 ret = -EBUSY; 1383 ret = -EBUSY;
1371 break; 1384 break;
1372 } 1385 }
1373 isolate_huge_page(page, &source); 1386 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1387 isolate_huge_page(head, &source);
1374 continue; 1388 continue;
1375 } else if (PageTransHuge(page)) 1389 } else if (PageTransHuge(page))
1376 pfn = page_to_pfn(compound_head(page)) 1390 pfn = page_to_pfn(compound_head(page))
@@ -1496,9 +1510,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
1496 unsigned long present_pages = 0; 1510 unsigned long present_pages = 0;
1497 enum zone_type zt; 1511 enum zone_type zt;
1498 1512
1499 arg->status_change_nid = -1; 1513 arg->status_change_nid = NUMA_NO_NODE;
1500 arg->status_change_nid_normal = -1; 1514 arg->status_change_nid_normal = NUMA_NO_NODE;
1501 arg->status_change_nid_high = -1; 1515 arg->status_change_nid_high = NUMA_NO_NODE;
1502 1516
1503 /* 1517 /*
1504 * Check whether node_states[N_NORMAL_MEMORY] will be changed. 1518 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1612,7 +1626,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
1612 1626
1613 cond_resched(); 1627 cond_resched();
1614 lru_add_drain_all(); 1628 lru_add_drain_all();
1615 drain_all_pages(zone);
1616 1629
1617 pfn = scan_movable_pages(pfn, end_pfn); 1630 pfn = scan_movable_pages(pfn, end_pfn);
1618 if (pfn) { 1631 if (pfn) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ee2bce59d2bf..af171ccb56a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350{ 350{
351 if (!pol) 351 if (!pol)
352 return; 352 return;
353 if (!mpol_store_user_nodemask(pol) && 353 if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 354 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 return; 355 return;
356 356
@@ -2304,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2304 unsigned long pgoff; 2304 unsigned long pgoff;
2305 int thiscpu = raw_smp_processor_id(); 2305 int thiscpu = raw_smp_processor_id();
2306 int thisnid = cpu_to_node(thiscpu); 2306 int thisnid = cpu_to_node(thiscpu);
2307 int polnid = -1; 2307 int polnid = NUMA_NO_NODE;
2308 int ret = -1; 2308 int ret = -1;
2309 2309
2310 pol = get_vma_policy(vma, addr); 2310 pol = get_vma_policy(vma, addr);
diff --git a/mm/mempool.c b/mm/mempool.c
index 0ef8cc8d1602..85efab3da720 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -222,6 +222,8 @@ EXPORT_SYMBOL(mempool_init_node);
222 * 222 *
223 * Like mempool_create(), but initializes the pool in (i.e. embedded in another 223 * Like mempool_create(), but initializes the pool in (i.e. embedded in another
224 * structure). 224 * structure).
225 *
226 * Return: %0 on success, negative error code otherwise.
225 */ 227 */
226int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 228int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
227 mempool_free_t *free_fn, void *pool_data) 229 mempool_free_t *free_fn, void *pool_data)
@@ -245,6 +247,8 @@ EXPORT_SYMBOL(mempool_init);
245 * functions. This function might sleep. Both the alloc_fn() and the free_fn() 247 * functions. This function might sleep. Both the alloc_fn() and the free_fn()
246 * functions might sleep - as long as the mempool_alloc() function is not called 248 * functions might sleep - as long as the mempool_alloc() function is not called
247 * from IRQ contexts. 249 * from IRQ contexts.
250 *
251 * Return: pointer to the created memory pool object or %NULL on error.
248 */ 252 */
249mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 253mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
250 mempool_free_t *free_fn, void *pool_data) 254 mempool_free_t *free_fn, void *pool_data)
@@ -289,6 +293,8 @@ EXPORT_SYMBOL(mempool_create_node);
289 * Note, the caller must guarantee that no mempool_destroy is called 293 * Note, the caller must guarantee that no mempool_destroy is called
290 * while this function is running. mempool_alloc() & mempool_free() 294 * while this function is running. mempool_alloc() & mempool_free()
291 * might be called (eg. from IRQ contexts) while this function executes. 295 * might be called (eg. from IRQ contexts) while this function executes.
296 *
297 * Return: %0 on success, negative error code otherwise.
292 */ 298 */
293int mempool_resize(mempool_t *pool, int new_min_nr) 299int mempool_resize(mempool_t *pool, int new_min_nr)
294{ 300{
@@ -363,6 +369,8 @@ EXPORT_SYMBOL(mempool_resize);
363 * *never* fails when called from process contexts. (it might 369 * *never* fails when called from process contexts. (it might
364 * fail if called from an IRQ context.) 370 * fail if called from an IRQ context.)
365 * Note: using __GFP_ZERO is not supported. 371 * Note: using __GFP_ZERO is not supported.
372 *
373 * Return: pointer to the allocated element or %NULL on error.
366 */ 374 */
367void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) 375void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
368{ 376{
diff --git a/mm/migrate.c b/mm/migrate.c
index 181f5d2718a9..ac6f4939bb59 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
100 /* 100 /*
101 * Check PageMovable before holding a PG_lock because page's owner 101 * Check PageMovable before holding a PG_lock because page's owner
102 * assumes anybody doesn't touch PG_lock of newly allocated page 102 * assumes anybody doesn't touch PG_lock of newly allocated page
103 * so unconditionally grapping the lock ruins page's owner side. 103 * so unconditionally grabbing the lock ruins page's owner side.
104 */ 104 */
105 if (unlikely(!__PageMovable(page))) 105 if (unlikely(!__PageMovable(page)))
106 goto out_putpage; 106 goto out_putpage;
@@ -374,7 +374,7 @@ unlock:
374} 374}
375#endif 375#endif
376 376
377static int expected_page_refs(struct page *page) 377static int expected_page_refs(struct address_space *mapping, struct page *page)
378{ 378{
379 int expected_count = 1; 379 int expected_count = 1;
380 380
@@ -384,7 +384,7 @@ static int expected_page_refs(struct page *page)
384 */ 384 */
385 expected_count += is_device_private_page(page); 385 expected_count += is_device_private_page(page);
386 expected_count += is_device_public_page(page); 386 expected_count += is_device_public_page(page);
387 if (page_mapping(page)) 387 if (mapping)
388 expected_count += hpage_nr_pages(page) + page_has_private(page); 388 expected_count += hpage_nr_pages(page) + page_has_private(page);
389 389
390 return expected_count; 390 return expected_count;
@@ -405,7 +405,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
405 XA_STATE(xas, &mapping->i_pages, page_index(page)); 405 XA_STATE(xas, &mapping->i_pages, page_index(page));
406 struct zone *oldzone, *newzone; 406 struct zone *oldzone, *newzone;
407 int dirty; 407 int dirty;
408 int expected_count = expected_page_refs(page) + extra_count; 408 int expected_count = expected_page_refs(mapping, page) + extra_count;
409 409
410 if (!mapping) { 410 if (!mapping) {
411 /* Anonymous page without mapping */ 411 /* Anonymous page without mapping */
@@ -750,7 +750,7 @@ static int __buffer_migrate_page(struct address_space *mapping,
750 return migrate_page(mapping, newpage, page, mode); 750 return migrate_page(mapping, newpage, page, mode);
751 751
752 /* Check whether page does not have extra refs before we do more work */ 752 /* Check whether page does not have extra refs before we do more work */
753 expected_count = expected_page_refs(page); 753 expected_count = expected_page_refs(mapping, page);
754 if (page_count(page) != expected_count) 754 if (page_count(page) != expected_count)
755 return -EAGAIN; 755 return -EAGAIN;
756 756
@@ -911,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping,
911 */ 911 */
912 if (page_has_private(page) && 912 if (page_has_private(page) &&
913 !try_to_release_page(page, GFP_KERNEL)) 913 !try_to_release_page(page, GFP_KERNEL))
914 return -EAGAIN; 914 return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
915 915
916 return migrate_page(mapping, newpage, page, mode); 916 return migrate_page(mapping, newpage, page, mode);
917} 917}
@@ -1287,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1287 struct anon_vma *anon_vma = NULL; 1287 struct anon_vma *anon_vma = NULL;
1288 1288
1289 /* 1289 /*
1290 * Movability of hugepages depends on architectures and hugepage size. 1290 * Migratability of hugepages depends on architectures and their size.
1291 * This check is necessary because some callers of hugepage migration 1291 * This check is necessary because some callers of hugepage migration
1292 * like soft offline and memory hotremove don't walk through page 1292 * like soft offline and memory hotremove don't walk through page
1293 * tables or check whether the hugepage is pmd-based or not before 1293 * tables or check whether the hugepage is pmd-based or not before
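
As a worked example of the accounting expected_page_refs() performs above (an illustrative restatement, not part of this diff): for an ordinary 4KiB file-backed page with buffer heads attached, migration expects page_count() to be 1 (its own isolation reference) + 1 (hpage_nr_pages()) + 1 (page_has_private()) = 3; any additional reference indicates a concurrent user, and the callers above back off with -EAGAIN.
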
diff --git a/mm/mlock.c b/mm/mlock.c
index 41cc47e28ad6..080f3b36415b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -182,7 +182,7 @@ static void __munlock_isolation_failed(struct page *page)
182unsigned int munlock_vma_page(struct page *page) 182unsigned int munlock_vma_page(struct page *page)
183{ 183{
184 int nr_pages; 184 int nr_pages;
185 struct zone *zone = page_zone(page); 185 pg_data_t *pgdat = page_pgdat(page);
186 186
187 /* For try_to_munlock() and to serialize with page migration */ 187 /* For try_to_munlock() and to serialize with page migration */
188 BUG_ON(!PageLocked(page)); 188 BUG_ON(!PageLocked(page));
@@ -194,7 +194,7 @@ unsigned int munlock_vma_page(struct page *page)
194 * might otherwise copy PageMlocked to part of the tail pages before 194 * might otherwise copy PageMlocked to part of the tail pages before
195 * we clear it in the head page. It also stabilizes hpage_nr_pages(). 195 * we clear it in the head page. It also stabilizes hpage_nr_pages().
196 */ 196 */
197 spin_lock_irq(zone_lru_lock(zone)); 197 spin_lock_irq(&pgdat->lru_lock);
198 198
199 if (!TestClearPageMlocked(page)) { 199 if (!TestClearPageMlocked(page)) {
200 /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ 200 /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -203,17 +203,17 @@ unsigned int munlock_vma_page(struct page *page)
203 } 203 }
204 204
205 nr_pages = hpage_nr_pages(page); 205 nr_pages = hpage_nr_pages(page);
206 __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); 206 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
207 207
208 if (__munlock_isolate_lru_page(page, true)) { 208 if (__munlock_isolate_lru_page(page, true)) {
209 spin_unlock_irq(zone_lru_lock(zone)); 209 spin_unlock_irq(&pgdat->lru_lock);
210 __munlock_isolated_page(page); 210 __munlock_isolated_page(page);
211 goto out; 211 goto out;
212 } 212 }
213 __munlock_isolation_failed(page); 213 __munlock_isolation_failed(page);
214 214
215unlock_out: 215unlock_out:
216 spin_unlock_irq(zone_lru_lock(zone)); 216 spin_unlock_irq(&pgdat->lru_lock);
217 217
218out: 218out:
219 return nr_pages - 1; 219 return nr_pages - 1;
@@ -298,7 +298,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
298 pagevec_init(&pvec_putback); 298 pagevec_init(&pvec_putback);
299 299
300 /* Phase 1: page isolation */ 300 /* Phase 1: page isolation */
301 spin_lock_irq(zone_lru_lock(zone)); 301 spin_lock_irq(&zone->zone_pgdat->lru_lock);
302 for (i = 0; i < nr; i++) { 302 for (i = 0; i < nr; i++) {
303 struct page *page = pvec->pages[i]; 303 struct page *page = pvec->pages[i];
304 304
@@ -325,7 +325,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
325 pvec->pages[i] = NULL; 325 pvec->pages[i] = NULL;
326 } 326 }
327 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 327 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
328 spin_unlock_irq(zone_lru_lock(zone)); 328 spin_unlock_irq(&zone->zone_pgdat->lru_lock);
329 329
330 /* Now we can release pins of pages that we are not munlocking */ 330 /* Now we can release pins of pages that we are not munlocking */
331 pagevec_release(&pvec_putback); 331 pagevec_release(&pvec_putback);
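
The mlock.c changes above follow the conversion applied throughout this series (mm/page_idle.c and the mm/rmap.c lock-ordering comment further down get the same treatment): the LRU lock is taken straight from the node's pg_data_t instead of going through the zone_lru_lock() wrapper. A condensed before/after sketch, illustrative only:

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>

static void lru_lock_sketch(struct page *page)
{
	pg_data_t *pgdat = page_pgdat(page);

	/* old style: spin_lock_irq(zone_lru_lock(page_zone(page))); */
	spin_lock_irq(&pgdat->lru_lock);
	/* ... isolate the page from, or return it to, an LRU list ... */
	spin_unlock_irq(&pgdat->lru_lock);
}
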
diff --git a/mm/mmap.c b/mm/mmap.c
index fc1809b1bed6..41eb48d9b527 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -438,7 +438,7 @@ static void vma_gap_update(struct vm_area_struct *vma)
438{ 438{
439 /* 439 /*
440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback 440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
441 * function that does exacltly what we want. 441 * function that does exactly what we want.
442 */ 442 */
443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL); 443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
444} 444}
@@ -1012,7 +1012,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
1012 * VM_SOFTDIRTY should not prevent from VMA merging, if we 1012 * VM_SOFTDIRTY should not prevent from VMA merging, if we
1013 * match the flags but dirty bit -- the caller should mark 1013 * match the flags but dirty bit -- the caller should mark
1014 * merged VMA as dirty. If dirty bit won't be excluded from 1014 * merged VMA as dirty. If dirty bit won't be excluded from
1015 * comparison, we increase pressue on the memory system forcing 1015 * comparison, we increase pressure on the memory system forcing
1016 * the kernel to generate new VMAs when old one could be 1016 * the kernel to generate new VMAs when old one could be
1017 * extended instead. 1017 * extended instead.
1018 */ 1018 */
@@ -1115,7 +1115,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1115 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 1115 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
1116 * might become case 1 below case 2 below case 3 below 1116 * might become case 1 below case 2 below case 3 below
1117 * 1117 *
1118 * It is important for case 8 that the the vma NNNN overlapping the 1118 * It is important for case 8 that the vma NNNN overlapping the
1119 * region AAAA is never going to extended over XXXX. Instead XXXX must 1119 * region AAAA is never going to extended over XXXX. Instead XXXX must
1120 * be extended in region AAAA and NNNN must be removed. This way in 1120 * be extended in region AAAA and NNNN must be removed. This way in
1121 * all cases where vma_merge succeeds, the moment vma_adjust drops the 1121 * all cases where vma_merge succeeds, the moment vma_adjust drops the
@@ -1645,7 +1645,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1645#endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1645#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1646 1646
1647/* 1647/*
1648 * Some shared mappigns will want the pages marked read-only 1648 * Some shared mappings will want the pages marked read-only
1649 * to track write events. If so, we'll downgrade vm_page_prot 1649 * to track write events. If so, we'll downgrade vm_page_prot
1650 * to the private version (using protection_map[] without the 1650 * to the private version (using protection_map[] without the
1651 * VM_SHARED bit). 1651 * VM_SHARED bit).
@@ -2126,13 +2126,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
2126 */ 2126 */
2127#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 2127#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
2128unsigned long 2128unsigned long
2129arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 2129arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
2130 const unsigned long len, const unsigned long pgoff, 2130 unsigned long len, unsigned long pgoff,
2131 const unsigned long flags) 2131 unsigned long flags)
2132{ 2132{
2133 struct vm_area_struct *vma, *prev; 2133 struct vm_area_struct *vma, *prev;
2134 struct mm_struct *mm = current->mm; 2134 struct mm_struct *mm = current->mm;
2135 unsigned long addr = addr0;
2136 struct vm_unmapped_area_info info; 2135 struct vm_unmapped_area_info info;
2137 const unsigned long mmap_end = arch_get_mmap_end(addr); 2136 const unsigned long mmap_end = arch_get_mmap_end(addr);
2138 2137
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 36cb358db170..028c724dcb1a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
110 continue; 110 continue;
111 } 111 }
112 112
113 ptent = ptep_modify_prot_start(mm, addr, pte); 113 oldpte = ptep_modify_prot_start(vma, addr, pte);
114 ptent = pte_modify(ptent, newprot); 114 ptent = pte_modify(oldpte, newprot);
115 if (preserve_write) 115 if (preserve_write)
116 ptent = pte_mk_savedwrite(ptent); 116 ptent = pte_mk_savedwrite(ptent);
117 117
@@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
121 !(vma->vm_flags & VM_SOFTDIRTY))) { 121 !(vma->vm_flags & VM_SOFTDIRTY))) {
122 ptent = pte_mkwrite(ptent); 122 ptent = pte_mkwrite(ptent);
123 } 123 }
124 ptep_modify_prot_commit(mm, addr, pte, ptent); 124 ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
125 pages++; 125 pages++;
126 } else if (IS_ENABLED(CONFIG_MIGRATION)) { 126 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
127 swp_entry_t entry = pte_to_swp_entry(oldpte); 127 swp_entry_t entry = pte_to_swp_entry(oldpte);
diff --git a/mm/mremap.c b/mm/mremap.c
index 3320616ed93f..e3edef6b7a12 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -516,6 +516,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
516 if (addr + old_len > new_addr && new_addr + new_len > addr) 516 if (addr + old_len > new_addr && new_addr + new_len > addr)
517 goto out; 517 goto out;
518 518
519 /*
520 * move_vma() needs us to stay 4 maps below the threshold, otherwise
521 * it will bail out at the very beginning.
522 * That is a problem if we have already unmapped the regions here
523 * (new_addr and old_addr), because userspace will not know the
524 * state of the vmas after it gets -ENOMEM.
525 * So, to avoid such a scenario we can pre-compute whether the whole
526 * operation has a high chance to succeed map-wise.
527 * The worst case is when both vmas (new_addr and old_addr) get
528 * split in 3 before unmapping them.
529 * That means 2 more maps (1 for each) to the ones we already hold.
530 * Check whether current map count plus 2 still leads us to 4 maps below
531 * the threshold, otherwise return -ENOMEM here to be more safe.
532 */
533 if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
534 return -ENOMEM;
535
519 ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); 536 ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
520 if (ret) 537 if (ret)
521 goto out; 538 goto out;
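
A worked example of the guard added above (the numbers are illustrative, not from this diff): with the default sysctl_max_map_count of 65530, the new check refuses the move once map_count + 2 >= 65527, that is, once the task already holds 65525 or more mappings, because splitting both the old and the new vma could add two maps and move_vma() itself insists on staying 4 maps below the limit.
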
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 26ea8636758f..3a2484884cfd 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -843,7 +843,7 @@ static bool task_will_free_mem(struct task_struct *task)
843 return ret; 843 return ret;
844} 844}
845 845
846static void __oom_kill_process(struct task_struct *victim) 846static void __oom_kill_process(struct task_struct *victim, const char *message)
847{ 847{
848 struct task_struct *p; 848 struct task_struct *p;
849 struct mm_struct *mm; 849 struct mm_struct *mm;
@@ -874,8 +874,9 @@ static void __oom_kill_process(struct task_struct *victim)
874 */ 874 */
875 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); 875 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
876 mark_oom_victim(victim); 876 mark_oom_victim(victim);
877 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 877 pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
878 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 878 message, task_pid_nr(victim), victim->comm,
879 K(victim->mm->total_vm),
879 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 880 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
880 K(get_mm_counter(victim->mm, MM_FILEPAGES)), 881 K(get_mm_counter(victim->mm, MM_FILEPAGES)),
881 K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); 882 K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
@@ -926,24 +927,20 @@ static void __oom_kill_process(struct task_struct *victim)
926 * Kill provided task unless it's secured by setting 927 * Kill provided task unless it's secured by setting
927 * oom_score_adj to OOM_SCORE_ADJ_MIN. 928 * oom_score_adj to OOM_SCORE_ADJ_MIN.
928 */ 929 */
929static int oom_kill_memcg_member(struct task_struct *task, void *unused) 930static int oom_kill_memcg_member(struct task_struct *task, void *message)
930{ 931{
931 if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 932 if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
933 !is_global_init(task)) {
932 get_task_struct(task); 934 get_task_struct(task);
933 __oom_kill_process(task); 935 __oom_kill_process(task, message);
934 } 936 }
935 return 0; 937 return 0;
936} 938}
937 939
938static void oom_kill_process(struct oom_control *oc, const char *message) 940static void oom_kill_process(struct oom_control *oc, const char *message)
939{ 941{
940 struct task_struct *p = oc->chosen; 942 struct task_struct *victim = oc->chosen;
941 unsigned int points = oc->chosen_points;
942 struct task_struct *victim = p;
943 struct task_struct *child;
944 struct task_struct *t;
945 struct mem_cgroup *oom_group; 943 struct mem_cgroup *oom_group;
946 unsigned int victim_points = 0;
947 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 944 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
948 DEFAULT_RATELIMIT_BURST); 945 DEFAULT_RATELIMIT_BURST);
949 946
@@ -952,57 +949,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
952 * its children or threads, just give it access to memory reserves 949 * its children or threads, just give it access to memory reserves
953 * so it can die quickly 950 * so it can die quickly
954 */ 951 */
955 task_lock(p); 952 task_lock(victim);
956 if (task_will_free_mem(p)) { 953 if (task_will_free_mem(victim)) {
957 mark_oom_victim(p); 954 mark_oom_victim(victim);
958 wake_oom_reaper(p); 955 wake_oom_reaper(victim);
959 task_unlock(p); 956 task_unlock(victim);
960 put_task_struct(p); 957 put_task_struct(victim);
961 return; 958 return;
962 } 959 }
963 task_unlock(p); 960 task_unlock(victim);
964 961
965 if (__ratelimit(&oom_rs)) 962 if (__ratelimit(&oom_rs))
966 dump_header(oc, p); 963 dump_header(oc, victim);
967
968 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
969 message, task_pid_nr(p), p->comm, points);
970
971 /*
972 * If any of p's children has a different mm and is eligible for kill,
973 * the one with the highest oom_badness() score is sacrificed for its
974 * parent. This attempts to lose the minimal amount of work done while
975 * still freeing memory.
976 */
977 read_lock(&tasklist_lock);
978
979 /*
980 * The task 'p' might have already exited before reaching here. The
981 * put_task_struct() will free task_struct 'p' while the loop still try
982 * to access the field of 'p', so, get an extra reference.
983 */
984 get_task_struct(p);
985 for_each_thread(p, t) {
986 list_for_each_entry(child, &t->children, sibling) {
987 unsigned int child_points;
988
989 if (process_shares_mm(child, p->mm))
990 continue;
991 /*
992 * oom_badness() returns 0 if the thread is unkillable
993 */
994 child_points = oom_badness(child,
995 oc->memcg, oc->nodemask, oc->totalpages);
996 if (child_points > victim_points) {
997 put_task_struct(victim);
998 victim = child;
999 victim_points = child_points;
1000 get_task_struct(victim);
1001 }
1002 }
1003 }
1004 put_task_struct(p);
1005 read_unlock(&tasklist_lock);
1006 964
1007 /* 965 /*
1008 * Do we need to kill the entire memory cgroup? 966 * Do we need to kill the entire memory cgroup?
@@ -1011,14 +969,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
1011 */ 969 */
1012 oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); 970 oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
1013 971
1014 __oom_kill_process(victim); 972 __oom_kill_process(victim, message);
1015 973
1016 /* 974 /*
1017 * If necessary, kill all tasks in the selected memory cgroup. 975 * If necessary, kill all tasks in the selected memory cgroup.
1018 */ 976 */
1019 if (oom_group) { 977 if (oom_group) {
1020 mem_cgroup_print_oom_group(oom_group); 978 mem_cgroup_print_oom_group(oom_group);
1021 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); 979 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
980 (void*)message);
1022 mem_cgroup_put(oom_group); 981 mem_cgroup_put(oom_group);
1023 } 982 }
1024} 983}
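
The memcg path above now threads the human-readable kill reason through the opaque void * argument of mem_cgroup_scan_tasks() instead of printing it in one place. A minimal sketch of that callback-with-cookie pattern (illustrative only; report_task is made up and is not part of this diff):

#include <linux/printk.h>
#include <linux/sched.h>

static int report_task(struct task_struct *task, void *arg)
{
	const char *reason = arg;

	pr_info("%s: pid %d (%s)\n", reason, task_pid_nr(task), task->comm);
	return 0;	/* a non-zero return stops the iteration */
}

/* caller: mem_cgroup_scan_tasks(memcg, report_task, (void *)"Memory cgroup out of memory"); */
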
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7d1010453fb9..9f61dfec6a1f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -270,7 +270,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
270 * node_dirtyable_memory - number of dirtyable pages in a node 270 * node_dirtyable_memory - number of dirtyable pages in a node
271 * @pgdat: the node 271 * @pgdat: the node
272 * 272 *
273 * Returns the node's number of pages potentially available for dirty 273 * Return: the node's number of pages potentially available for dirty
274 * page cache. This is the base value for the per-node dirty limits. 274 * page cache. This is the base value for the per-node dirty limits.
275 */ 275 */
276static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) 276static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
@@ -355,7 +355,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
355/** 355/**
356 * global_dirtyable_memory - number of globally dirtyable pages 356 * global_dirtyable_memory - number of globally dirtyable pages
357 * 357 *
358 * Returns the global number of pages potentially available for dirty 358 * Return: the global number of pages potentially available for dirty
359 * page cache. This is the base value for the global dirty limits. 359 * page cache. This is the base value for the global dirty limits.
360 */ 360 */
361static unsigned long global_dirtyable_memory(void) 361static unsigned long global_dirtyable_memory(void)
@@ -470,7 +470,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
470 * node_dirty_limit - maximum number of dirty pages allowed in a node 470 * node_dirty_limit - maximum number of dirty pages allowed in a node
471 * @pgdat: the node 471 * @pgdat: the node
472 * 472 *
473 * Returns the maximum number of dirty pages allowed in a node, based 473 * Return: the maximum number of dirty pages allowed in a node, based
474 * on the node's dirtyable memory. 474 * on the node's dirtyable memory.
475 */ 475 */
476static unsigned long node_dirty_limit(struct pglist_data *pgdat) 476static unsigned long node_dirty_limit(struct pglist_data *pgdat)
@@ -495,7 +495,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
495 * node_dirty_ok - tells whether a node is within its dirty limits 495 * node_dirty_ok - tells whether a node is within its dirty limits
496 * @pgdat: the node to check 496 * @pgdat: the node to check
497 * 497 *
498 * Returns %true when the dirty pages in @pgdat are within the node's 498 * Return: %true when the dirty pages in @pgdat are within the node's
499 * dirty limit, %false if the limit is exceeded. 499 * dirty limit, %false if the limit is exceeded.
500 */ 500 */
501bool node_dirty_ok(struct pglist_data *pgdat) 501bool node_dirty_ok(struct pglist_data *pgdat)
@@ -743,9 +743,6 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
743 * __wb_calc_thresh - @wb's share of dirty throttling threshold 743 * __wb_calc_thresh - @wb's share of dirty throttling threshold
744 * @dtc: dirty_throttle_context of interest 744 * @dtc: dirty_throttle_context of interest
745 * 745 *
746 * Returns @wb's dirty limit in pages. The term "dirty" in the context of
747 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
748 *
749 * Note that balance_dirty_pages() will only seriously take it as a hard limit 746 * Note that balance_dirty_pages() will only seriously take it as a hard limit
750 * when sleeping max_pause per page is not enough to keep the dirty pages under 747 * when sleeping max_pause per page is not enough to keep the dirty pages under
751 * control. For example, when the device is completely stalled due to some error 748 * control. For example, when the device is completely stalled due to some error
@@ -759,6 +756,9 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
759 * 756 *
760 * The wb's share of dirty limit will be adapting to its throughput and 757 * The wb's share of dirty limit will be adapting to its throughput and
761 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. 758 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
759 *
760 * Return: @wb's dirty limit in pages. The term "dirty" in the context of
761 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
762 */ 762 */
763static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) 763static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
764{ 764{
@@ -1918,7 +1918,9 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1918 * @wb: bdi_writeback of interest 1918 * @wb: bdi_writeback of interest
1919 * 1919 *
1920 * Determines whether background writeback should keep writing @wb or it's 1920 * Determines whether background writeback should keep writing @wb or it's
1921 * clean enough. Returns %true if writeback should continue. 1921 * clean enough.
1922 *
1923 * Return: %true if writeback should continue.
1922 */ 1924 */
1923bool wb_over_bg_thresh(struct bdi_writeback *wb) 1925bool wb_over_bg_thresh(struct bdi_writeback *wb)
1924{ 1926{
@@ -2147,6 +2149,8 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
2147 * lock/page writeback access order inversion - we should only ever lock 2149 * lock/page writeback access order inversion - we should only ever lock
2148 * multiple pages in ascending page->index order, and looping back to the start 2150 * multiple pages in ascending page->index order, and looping back to the start
2149 * of the file violates that rule and causes deadlocks. 2151 * of the file violates that rule and causes deadlocks.
2152 *
2153 * Return: %0 on success, negative error code otherwise
2150 */ 2154 */
2151int write_cache_pages(struct address_space *mapping, 2155int write_cache_pages(struct address_space *mapping,
2152 struct writeback_control *wbc, writepage_t writepage, 2156 struct writeback_control *wbc, writepage_t writepage,
@@ -2305,6 +2309,8 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
2305 * 2309 *
2306 * This is a library function, which implements the writepages() 2310 * This is a library function, which implements the writepages()
2307 * address_space_operation. 2311 * address_space_operation.
2312 *
2313 * Return: %0 on success, negative error code otherwise
2308 */ 2314 */
2309int generic_writepages(struct address_space *mapping, 2315int generic_writepages(struct address_space *mapping,
2310 struct writeback_control *wbc) 2316 struct writeback_control *wbc)
@@ -2351,6 +2357,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
2351 * 2357 *
2352 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this 2358 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
2353 * function returns. 2359 * function returns.
2360 *
2361 * Return: %0 on success, negative error code otherwise
2354 */ 2362 */
2355int write_one_page(struct page *page) 2363int write_one_page(struct page *page)
2356{ 2364{
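
Several of the kernel-doc blocks updated above describe write_cache_pages() and generic_writepages() as library helpers for building an address_space_operations ->writepages(). A minimal sketch of how a filesystem typically plugs into write_cache_pages() (illustrative only; the myfs_* names are made up):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

/* Called once for every dirty page selected by write_cache_pages(). */
static int myfs_writepage(struct page *page, struct writeback_control *wbc,
			  void *data)
{
	/*
	 * The page arrives locked with its dirty bit already cleared; a real
	 * implementation would map it to disk blocks and queue the I/O here.
	 * It must be unlocked before returning.
	 */
	unlock_page(page);
	return 0;	/* a negative error code is propagated to the caller */
}

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	/* %0 on success, negative error code otherwise, as documented above. */
	return write_cache_pages(mapping, wbc, myfs_writepage, NULL);
}
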
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b9f577b1a2a..3eb01dedfb50 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -289,8 +289,8 @@ EXPORT_SYMBOL(movable_zone);
289#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 289#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
290 290
291#if MAX_NUMNODES > 1 291#if MAX_NUMNODES > 1
292int nr_node_ids __read_mostly = MAX_NUMNODES; 292unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
293int nr_online_nodes __read_mostly = 1; 293unsigned int nr_online_nodes __read_mostly = 1;
294EXPORT_SYMBOL(nr_node_ids); 294EXPORT_SYMBOL(nr_node_ids);
295EXPORT_SYMBOL(nr_online_nodes); 295EXPORT_SYMBOL(nr_online_nodes);
296#endif 296#endif
@@ -789,6 +789,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
789 return 0; 789 return 0;
790} 790}
791 791
792#ifdef CONFIG_COMPACTION
793static inline struct capture_control *task_capc(struct zone *zone)
794{
795 struct capture_control *capc = current->capture_control;
796
797 return capc &&
798 !(current->flags & PF_KTHREAD) &&
799 !capc->page &&
800 capc->cc->zone == zone &&
801 capc->cc->direct_compaction ? capc : NULL;
802}
803
804static inline bool
805compaction_capture(struct capture_control *capc, struct page *page,
806 int order, int migratetype)
807{
808 if (!capc || order != capc->cc->order)
809 return false;
810
811 /* Do not accidentally pollute CMA or isolated regions */
812 if (is_migrate_cma(migratetype) ||
813 is_migrate_isolate(migratetype))
814 return false;
815
816 /*
817 * Do not let lower order allocations pollute a movable pageblock.
818 * This might let an unmovable request use a reclaimable pageblock
819 * and vice-versa but no more than normal fallback logic which can
820 * have trouble finding a high-order free page.
821 */
822 if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
823 return false;
824
825 capc->page = page;
826 return true;
827}
828
829#else
830static inline struct capture_control *task_capc(struct zone *zone)
831{
832 return NULL;
833}
834
835static inline bool
836compaction_capture(struct capture_control *capc, struct page *page,
837 int order, int migratetype)
838{
839 return false;
840}
841#endif /* CONFIG_COMPACTION */
842
792/* 843/*
793 * Freeing function for a buddy system allocator. 844 * Freeing function for a buddy system allocator.
794 * 845 *
@@ -822,6 +873,7 @@ static inline void __free_one_page(struct page *page,
822 unsigned long uninitialized_var(buddy_pfn); 873 unsigned long uninitialized_var(buddy_pfn);
823 struct page *buddy; 874 struct page *buddy;
824 unsigned int max_order; 875 unsigned int max_order;
876 struct capture_control *capc = task_capc(zone);
825 877
826 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); 878 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
827 879
@@ -837,6 +889,11 @@ static inline void __free_one_page(struct page *page,
837 889
838continue_merging: 890continue_merging:
839 while (order < max_order - 1) { 891 while (order < max_order - 1) {
892 if (compaction_capture(capc, page, order, migratetype)) {
893 __mod_zone_freepage_state(zone, -(1 << order),
894 migratetype);
895 return;
896 }
840 buddy_pfn = __find_buddy_pfn(pfn, order); 897 buddy_pfn = __find_buddy_pfn(pfn, order);
841 buddy = page + (buddy_pfn - pfn); 898 buddy = page + (buddy_pfn - pfn);
842 899
@@ -1056,7 +1113,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
1056 if (PageMappingFlags(page)) 1113 if (PageMappingFlags(page))
1057 page->mapping = NULL; 1114 page->mapping = NULL;
1058 if (memcg_kmem_enabled() && PageKmemcg(page)) 1115 if (memcg_kmem_enabled() && PageKmemcg(page))
1059 memcg_kmem_uncharge(page, order); 1116 __memcg_kmem_uncharge(page, order);
1060 if (check_free) 1117 if (check_free)
1061 bad += free_pages_check(page); 1118 bad += free_pages_check(page);
1062 if (bad) 1119 if (bad)
@@ -1303,7 +1360,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
1303 local_irq_restore(flags); 1360 local_irq_restore(flags);
1304} 1361}
1305 1362
1306static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1363void __free_pages_core(struct page *page, unsigned int order)
1307{ 1364{
1308 unsigned int nr_pages = 1 << order; 1365 unsigned int nr_pages = 1 << order;
1309 struct page *p = page; 1366 struct page *p = page;
@@ -1382,7 +1439,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
1382{ 1439{
1383 if (early_page_uninitialised(pfn)) 1440 if (early_page_uninitialised(pfn))
1384 return; 1441 return;
1385 return __free_pages_boot_core(page, order); 1442 __free_pages_core(page, order);
1386} 1443}
1387 1444
1388/* 1445/*
@@ -1472,14 +1529,14 @@ static void __init deferred_free_range(unsigned long pfn,
1472 if (nr_pages == pageblock_nr_pages && 1529 if (nr_pages == pageblock_nr_pages &&
1473 (pfn & (pageblock_nr_pages - 1)) == 0) { 1530 (pfn & (pageblock_nr_pages - 1)) == 0) {
1474 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1531 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1475 __free_pages_boot_core(page, pageblock_order); 1532 __free_pages_core(page, pageblock_order);
1476 return; 1533 return;
1477 } 1534 }
1478 1535
1479 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1536 for (i = 0; i < nr_pages; i++, page++, pfn++) {
1480 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1537 if ((pfn & (pageblock_nr_pages - 1)) == 0)
1481 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1538 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1482 __free_pages_boot_core(page, 0); 1539 __free_pages_core(page, 0);
1483 } 1540 }
1484} 1541}
1485 1542
@@ -1945,8 +2002,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
1945 2002
1946 arch_alloc_page(page, order); 2003 arch_alloc_page(page, order);
1947 kernel_map_pages(page, 1 << order, 1); 2004 kernel_map_pages(page, 1 << order, 1);
1948 kernel_poison_pages(page, 1 << order, 1);
1949 kasan_alloc_pages(page, order); 2005 kasan_alloc_pages(page, order);
2006 kernel_poison_pages(page, 1 << order, 1);
1950 set_page_owner(page, order, gfp_flags); 2007 set_page_owner(page, order, gfp_flags);
1951} 2008}
1952 2009
@@ -2962,7 +3019,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
2962 * watermark, because we already know our high-order page 3019 * watermark, because we already know our high-order page
2963 * exists. 3020 * exists.
2964 */ 3021 */
2965 watermark = min_wmark_pages(zone) + (1UL << order); 3022 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
2966 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 3023 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2967 return 0; 3024 return 0;
2968 3025
@@ -3173,24 +3230,14 @@ static int __init fail_page_alloc_debugfs(void)
3173 3230
3174 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 3231 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3175 &fail_page_alloc.attr); 3232 &fail_page_alloc.attr);
3176 if (IS_ERR(dir))
3177 return PTR_ERR(dir);
3178
3179 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
3180 &fail_page_alloc.ignore_gfp_reclaim))
3181 goto fail;
3182 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3183 &fail_page_alloc.ignore_gfp_highmem))
3184 goto fail;
3185 if (!debugfs_create_u32("min-order", mode, dir,
3186 &fail_page_alloc.min_order))
3187 goto fail;
3188 3233
3189 return 0; 3234 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3190fail: 3235 &fail_page_alloc.ignore_gfp_reclaim);
3191 debugfs_remove_recursive(dir); 3236 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3237 &fail_page_alloc.ignore_gfp_highmem);
3238 debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
3192 3239
3193 return -ENOMEM; 3240 return 0;
3194} 3241}
3195 3242
3196late_initcall(fail_page_alloc_debugfs); 3243late_initcall(fail_page_alloc_debugfs);
@@ -3710,7 +3757,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3710 unsigned int alloc_flags, const struct alloc_context *ac, 3757 unsigned int alloc_flags, const struct alloc_context *ac,
3711 enum compact_priority prio, enum compact_result *compact_result) 3758 enum compact_priority prio, enum compact_result *compact_result)
3712{ 3759{
3713 struct page *page; 3760 struct page *page = NULL;
3714 unsigned long pflags; 3761 unsigned long pflags;
3715 unsigned int noreclaim_flag; 3762 unsigned int noreclaim_flag;
3716 3763
@@ -3721,13 +3768,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3721 noreclaim_flag = memalloc_noreclaim_save(); 3768 noreclaim_flag = memalloc_noreclaim_save();
3722 3769
3723 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3770 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3724 prio); 3771 prio, &page);
3725 3772
3726 memalloc_noreclaim_restore(noreclaim_flag); 3773 memalloc_noreclaim_restore(noreclaim_flag);
3727 psi_memstall_leave(&pflags); 3774 psi_memstall_leave(&pflags);
3728 3775
3729 if (*compact_result <= COMPACT_INACTIVE) 3776 if (*compact_result <= COMPACT_INACTIVE) {
3777 WARN_ON_ONCE(page);
3730 return NULL; 3778 return NULL;
3779 }
3731 3780
3732 /* 3781 /*
3733 * At least in one zone compaction wasn't deferred or skipped, so let's 3782 * At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3735,7 +3784,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3735 */ 3784 */
3736 count_vm_event(COMPACTSTALL); 3785 count_vm_event(COMPACTSTALL);
3737 3786
3738 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3787 /* Prep a captured page if available */
3788 if (page)
3789 prep_new_page(page, order, gfp_mask, alloc_flags);
3790
3791 /* Try get a page from the freelist if available */
3792 if (!page)
3793 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3739 3794
3740 if (page) { 3795 if (page) {
3741 struct zone *zone = page_zone(page); 3796 struct zone *zone = page_zone(page);
@@ -4568,7 +4623,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4568 4623
4569out: 4624out:
4570 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && 4625 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4571 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { 4626 unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
4572 __free_pages(page, order); 4627 __free_pages(page, order);
4573 page = NULL; 4628 page = NULL;
4574 } 4629 }
@@ -4761,6 +4816,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
4761 * This function is also limited by MAX_ORDER. 4816 * This function is also limited by MAX_ORDER.
4762 * 4817 *
4763 * Memory allocated by this function must be released by free_pages_exact(). 4818 * Memory allocated by this function must be released by free_pages_exact().
4819 *
4820 * Return: pointer to the allocated area or %NULL in case of error.
4764 */ 4821 */
4765void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 4822void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4766{ 4823{
@@ -4781,6 +4838,8 @@ EXPORT_SYMBOL(alloc_pages_exact);
4781 * 4838 *
4782 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 4839 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4783 * back. 4840 * back.
4841 *
4842 * Return: pointer to the allocated area or %NULL in case of error.
4784 */ 4843 */
4785void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 4844void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4786{ 4845{
@@ -4814,11 +4873,13 @@ EXPORT_SYMBOL(free_pages_exact);
4814 * nr_free_zone_pages - count number of pages beyond high watermark 4873 * nr_free_zone_pages - count number of pages beyond high watermark
4815 * @offset: The zone index of the highest zone 4874 * @offset: The zone index of the highest zone
4816 * 4875 *
4817 * nr_free_zone_pages() counts the number of counts pages which are beyond the 4876 * nr_free_zone_pages() counts the number of pages which are beyond the
4818 * high watermark within all zones at or below a given zone index. For each 4877 * high watermark within all zones at or below a given zone index. For each
4819 * zone, the number of pages is calculated as: 4878 * zone, the number of pages is calculated as:
4820 * 4879 *
4821 * nr_free_zone_pages = managed_pages - high_pages 4880 * nr_free_zone_pages = managed_pages - high_pages
4881 *
4882 * Return: number of pages beyond high watermark.
4822 */ 4883 */
4823static unsigned long nr_free_zone_pages(int offset) 4884static unsigned long nr_free_zone_pages(int offset)
4824{ 4885{
@@ -4845,6 +4906,9 @@ static unsigned long nr_free_zone_pages(int offset)
4845 * 4906 *
4846 * nr_free_buffer_pages() counts the number of pages which are beyond the high 4907 * nr_free_buffer_pages() counts the number of pages which are beyond the high
4847 * watermark within ZONE_DMA and ZONE_NORMAL. 4908 * watermark within ZONE_DMA and ZONE_NORMAL.
4909 *
4910 * Return: number of pages beyond high watermark within ZONE_DMA and
4911 * ZONE_NORMAL.
4848 */ 4912 */
4849unsigned long nr_free_buffer_pages(void) 4913unsigned long nr_free_buffer_pages(void)
4850{ 4914{
@@ -4857,6 +4921,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4857 * 4921 *
4858 * nr_free_pagecache_pages() counts the number of pages which are beyond the 4922 * nr_free_pagecache_pages() counts the number of pages which are beyond the
4859 * high watermark within all zones. 4923 * high watermark within all zones.
4924 *
4925 * Return: number of pages beyond high watermark within all zones.
4860 */ 4926 */
4861unsigned long nr_free_pagecache_pages(void) 4927unsigned long nr_free_pagecache_pages(void)
4862{ 4928{
@@ -5303,7 +5369,8 @@ static int node_load[MAX_NUMNODES];
5303 * from each node to each node in the system), and should also prefer nodes 5369 * from each node to each node in the system), and should also prefer nodes
5304 * with no CPUs, since presumably they'll have very little allocation pressure 5370 * with no CPUs, since presumably they'll have very little allocation pressure
5305 * on them otherwise. 5371 * on them otherwise.
5306 * It returns -1 if no node is found. 5372 *
5373 * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
5307 */ 5374 */
5308static int find_next_best_node(int node, nodemask_t *used_node_mask) 5375static int find_next_best_node(int node, nodemask_t *used_node_mask)
5309{ 5376{
@@ -5609,7 +5676,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
5609 else 5676 else
5610 page_group_by_mobility_disabled = 0; 5677 page_group_by_mobility_disabled = 0;
5611 5678
5612 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", 5679 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
5613 nr_online_nodes, 5680 nr_online_nodes,
5614 page_group_by_mobility_disabled ? "off" : "on", 5681 page_group_by_mobility_disabled ? "off" : "on",
5615 vm_total_pages); 5682 vm_total_pages);
@@ -6016,7 +6083,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
6016 return state->last_nid; 6083 return state->last_nid;
6017 6084
6018 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 6085 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
6019 if (nid != -1) { 6086 if (nid != NUMA_NO_NODE) {
6020 state->last_start = start_pfn; 6087 state->last_start = start_pfn;
6021 state->last_end = end_pfn; 6088 state->last_end = end_pfn;
6022 state->last_nid = nid; 6089 state->last_nid = nid;
@@ -6214,7 +6281,7 @@ unsigned long __init __absent_pages_in_range(int nid,
6214 * @start_pfn: The start PFN to start searching for holes 6281 * @start_pfn: The start PFN to start searching for holes
6215 * @end_pfn: The end PFN to stop searching for holes 6282 * @end_pfn: The end PFN to stop searching for holes
6216 * 6283 *
6217 * It returns the number of page frames in memory holes within a range. 6284 * Return: the number of page frames in memory holes within a range.
6218 */ 6285 */
6219unsigned long __init absent_pages_in_range(unsigned long start_pfn, 6286unsigned long __init absent_pages_in_range(unsigned long start_pfn,
6220 unsigned long end_pfn) 6287 unsigned long end_pfn)
@@ -6376,10 +6443,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
6376{ 6443{
6377 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 6444 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
6378 zone->pageblock_flags = NULL; 6445 zone->pageblock_flags = NULL;
6379 if (usemapsize) 6446 if (usemapsize) {
6380 zone->pageblock_flags = 6447 zone->pageblock_flags =
6381 memblock_alloc_node_nopanic(usemapsize, 6448 memblock_alloc_node_nopanic(usemapsize,
6382 pgdat->node_id); 6449 pgdat->node_id);
6450 if (!zone->pageblock_flags)
6451 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
6452 usemapsize, zone->name, pgdat->node_id);
6453 }
6383} 6454}
6384#else 6455#else
6385static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 6456static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -6609,6 +6680,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6609 end = ALIGN(end, MAX_ORDER_NR_PAGES); 6680 end = ALIGN(end, MAX_ORDER_NR_PAGES);
6610 size = (end - start) * sizeof(struct page); 6681 size = (end - start) * sizeof(struct page);
6611 map = memblock_alloc_node_nopanic(size, pgdat->node_id); 6682 map = memblock_alloc_node_nopanic(size, pgdat->node_id);
6683 if (!map)
6684 panic("Failed to allocate %ld bytes for node %d memory map\n",
6685 size, pgdat->node_id);
6612 pgdat->node_mem_map = map + offset; 6686 pgdat->node_mem_map = map + offset;
6613 } 6687 }
6614 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", 6688 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6764,14 +6838,14 @@ void __init setup_nr_node_ids(void)
6764 * model has fine enough granularity to avoid incorrect mapping for the 6838 * model has fine enough granularity to avoid incorrect mapping for the
6765 * populated node map. 6839 * populated node map.
6766 * 6840 *
6767 * Returns the determined alignment in pfn's. 0 if there is no alignment 6841 * Return: the determined alignment in pfn's. 0 if there is no alignment
6768 * requirement (single node). 6842 * requirement (single node).
6769 */ 6843 */
6770unsigned long __init node_map_pfn_alignment(void) 6844unsigned long __init node_map_pfn_alignment(void)
6771{ 6845{
6772 unsigned long accl_mask = 0, last_end = 0; 6846 unsigned long accl_mask = 0, last_end = 0;
6773 unsigned long start, end, mask; 6847 unsigned long start, end, mask;
6774 int last_nid = -1; 6848 int last_nid = NUMA_NO_NODE;
6775 int i, nid; 6849 int i, nid;
6776 6850
6777 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 6851 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -6819,7 +6893,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
6819/** 6893/**
6820 * find_min_pfn_with_active_regions - Find the minimum PFN registered 6894 * find_min_pfn_with_active_regions - Find the minimum PFN registered
6821 * 6895 *
6822 * It returns the minimum PFN based on information provided via 6896 * Return: the minimum PFN based on information provided via
6823 * memblock_set_node(). 6897 * memblock_set_node().
6824 */ 6898 */
6825unsigned long __init find_min_pfn_with_active_regions(void) 6899unsigned long __init find_min_pfn_with_active_regions(void)
@@ -7267,7 +7341,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
7267 7341
7268 return pages; 7342 return pages;
7269} 7343}
7270EXPORT_SYMBOL(free_reserved_area);
7271 7344
7272#ifdef CONFIG_HIGHMEM 7345#ifdef CONFIG_HIGHMEM
7273void free_highmem_page(struct page *page) 7346void free_highmem_page(struct page *page)
@@ -7496,7 +7569,7 @@ static void __setup_per_zone_wmarks(void)
7496 * value here. 7569 * value here.
7497 * 7570 *
7498 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 7571 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7499 * deltas control asynch page reclaim, and so should 7572 * deltas control async page reclaim, and so should
7500 * not be capped for highmem. 7573 * not be capped for highmem.
7501 */ 7574 */
7502 unsigned long min_pages; 7575 unsigned long min_pages;
@@ -7973,7 +8046,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7973 8046
7974 /* 8047 /*
7975 * Hugepages are not in LRU lists, but they're movable. 8048 * Hugepages are not in LRU lists, but they're movable.
7976 * We need not scan over tail pages bacause we don't 8049 * We need not scan over tail pages because we don't
7977 * handle each tail page individually in migration. 8050 * handle each tail page individually in migration.
7978 */ 8051 */
7979 if (PageHuge(page)) { 8052 if (PageHuge(page)) {
@@ -8112,7 +8185,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
8112 * pageblocks in the range. Once isolated, the pageblocks should not 8185 * pageblocks in the range. Once isolated, the pageblocks should not
8113 * be modified by others. 8186 * be modified by others.
8114 * 8187 *
8115 * Returns zero on success or negative error code. On success all 8188 * Return: zero on success or negative error code. On success all
8116 * pages which PFN is in [start, end) are allocated for the caller and 8189 * pages which PFN is in [start, end) are allocated for the caller and
8117 * need to be freed with free_contig_range(). 8190 * need to be freed with free_contig_range().
8118 */ 8191 */
@@ -8196,7 +8269,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
8196 */ 8269 */
8197 8270
8198 lru_add_drain_all(); 8271 lru_add_drain_all();
8199 drain_all_pages(cc.zone);
8200 8272
8201 order = 0; 8273 order = 0;
8202 outer_start = start; 8274 outer_start = start;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 8c78b8d45117..ab4244920e0f 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -273,6 +273,7 @@ static void free_page_ext(void *addr)
273 table_size = get_entry_size() * PAGES_PER_SECTION; 273 table_size = get_entry_size() * PAGES_PER_SECTION;
274 274
275 BUG_ON(PageReserved(page)); 275 BUG_ON(PageReserved(page));
276 kmemleak_free(addr);
276 free_pages_exact(addr, table_size); 277 free_pages_exact(addr, table_size);
277 } 278 }
278} 279}
@@ -300,7 +301,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
300 start = SECTION_ALIGN_DOWN(start_pfn); 301 start = SECTION_ALIGN_DOWN(start_pfn);
301 end = SECTION_ALIGN_UP(start_pfn + nr_pages); 302 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
302 303
303 if (nid == -1) { 304 if (nid == NUMA_NO_NODE) {
304 /* 305 /*
305 * In this case, "nid" already exists and contains valid memory. 306 * In this case, "nid" already exists and contains valid memory.
306 * "start_pfn" passed to us is a pfn which is an arg for 307 * "start_pfn" passed to us is a pfn which is an arg for
diff --git a/mm/page_idle.c b/mm/page_idle.c
index b9e4b42b33ab..0b39ec0c945c 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -31,7 +31,7 @@
31static struct page *page_idle_get_page(unsigned long pfn) 31static struct page *page_idle_get_page(unsigned long pfn)
32{ 32{
33 struct page *page; 33 struct page *page;
34 struct zone *zone; 34 pg_data_t *pgdat;
35 35
36 if (!pfn_valid(pfn)) 36 if (!pfn_valid(pfn))
37 return NULL; 37 return NULL;
@@ -41,13 +41,13 @@ static struct page *page_idle_get_page(unsigned long pfn)
41 !get_page_unless_zero(page)) 41 !get_page_unless_zero(page))
42 return NULL; 42 return NULL;
43 43
44 zone = page_zone(page); 44 pgdat = page_pgdat(page);
45 spin_lock_irq(zone_lru_lock(zone)); 45 spin_lock_irq(&pgdat->lru_lock);
46 if (unlikely(!PageLRU(page))) { 46 if (unlikely(!PageLRU(page))) {
47 put_page(page); 47 put_page(page);
48 page = NULL; 48 page = NULL;
49 } 49 }
50 spin_unlock_irq(zone_lru_lock(zone)); 50 spin_unlock_irq(&pgdat->lru_lock);
51 return page; 51 return page;
52} 52}
53 53
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 28b06524939f..925b6f44a444 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -625,16 +625,14 @@ static const struct file_operations proc_page_owner_operations = {
625 625
626static int __init pageowner_init(void) 626static int __init pageowner_init(void)
627{ 627{
628 struct dentry *dentry;
629
630 if (!static_branch_unlikely(&page_owner_inited)) { 628 if (!static_branch_unlikely(&page_owner_inited)) {
631 pr_info("page_owner is disabled\n"); 629 pr_info("page_owner is disabled\n");
632 return 0; 630 return 0;
633 } 631 }
634 632
635 dentry = debugfs_create_file("page_owner", 0400, NULL, 633 debugfs_create_file("page_owner", 0400, NULL, NULL,
636 NULL, &proc_page_owner_operations); 634 &proc_page_owner_operations);
637 635
638 return PTR_ERR_OR_ZERO(dentry); 636 return 0;
639} 637}
640late_initcall(pageowner_init) 638late_initcall(pageowner_init)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index f0c15e9017c0..21d4f97cb49b 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,6 +6,7 @@
6#include <linux/page_ext.h> 6#include <linux/page_ext.h>
7#include <linux/poison.h> 7#include <linux/poison.h>
8#include <linux/ratelimit.h> 8#include <linux/ratelimit.h>
9#include <linux/kasan.h>
9 10
10static bool want_page_poisoning __read_mostly; 11static bool want_page_poisoning __read_mostly;
11 12
@@ -40,7 +41,10 @@ static void poison_page(struct page *page)
40{ 41{
41 void *addr = kmap_atomic(page); 42 void *addr = kmap_atomic(page);
42 43
44 /* KASAN still thinks the page is in use, so skip it. */
45 kasan_disable_current();
43 memset(addr, PAGE_POISON, PAGE_SIZE); 46 memset(addr, PAGE_POISON, PAGE_SIZE);
47 kasan_enable_current();
44 kunmap_atomic(addr); 48 kunmap_atomic(addr);
45} 49}
46 50
diff --git a/mm/readahead.c b/mm/readahead.c
index 1ae16522412a..a4593654a26c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -81,6 +81,8 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
81 * @data: private data for the callback routine. 81 * @data: private data for the callback routine.
82 * 82 *
83 * Hides the details of the LRU cache etc from the filesystems. 83 * Hides the details of the LRU cache etc from the filesystems.
84 *
85 * Returns: %0 on success, or the error returned by @filler otherwise
84 */ 86 */
85int read_cache_pages(struct address_space *mapping, struct list_head *pages, 87int read_cache_pages(struct address_space *mapping, struct list_head *pages,
86 int (*filler)(void *, struct page *), void *data) 88 int (*filler)(void *, struct page *), void *data)
diff --git a/mm/rmap.c b/mm/rmap.c
index 0454ecc29537..b30c7c71d1d9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -27,7 +27,7 @@
27 * mapping->i_mmap_rwsem 27 * mapping->i_mmap_rwsem
28 * anon_vma->rwsem 28 * anon_vma->rwsem
29 * mm->page_table_lock or pte_lock 29 * mm->page_table_lock or pte_lock
30 * zone_lru_lock (in mark_page_accessed, isolate_lru_page) 30 * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mmlist_lock (in mmput, drain_mmlist and others) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 33 * mapping->private_lock (in __set_page_dirty_buffers)
diff --git a/mm/shmem.c b/mm/shmem.c
index 2c012eee133d..b3db3779a30a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -36,6 +36,7 @@
36#include <linux/uio.h> 36#include <linux/uio.h>
37#include <linux/khugepaged.h> 37#include <linux/khugepaged.h>
38#include <linux/hugetlb.h> 38#include <linux/hugetlb.h>
39#include <linux/frontswap.h>
39 40
40#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ 41#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
41 42
@@ -123,6 +124,10 @@ static unsigned long shmem_default_max_inodes(void)
123static bool shmem_should_replace_page(struct page *page, gfp_t gfp); 124static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
124static int shmem_replace_page(struct page **pagep, gfp_t gfp, 125static int shmem_replace_page(struct page **pagep, gfp_t gfp,
125 struct shmem_inode_info *info, pgoff_t index); 126 struct shmem_inode_info *info, pgoff_t index);
127static int shmem_swapin_page(struct inode *inode, pgoff_t index,
128 struct page **pagep, enum sgp_type sgp,
129 gfp_t gfp, struct vm_area_struct *vma,
130 vm_fault_t *fault_type);
126static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 131static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
127 struct page **pagep, enum sgp_type sgp, 132 struct page **pagep, enum sgp_type sgp,
128 gfp_t gfp, struct vm_area_struct *vma, 133 gfp_t gfp, struct vm_area_struct *vma,
@@ -1089,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode)
1089 clear_inode(inode); 1094 clear_inode(inode);
1090} 1095}
1091 1096
1092static unsigned long find_swap_entry(struct xarray *xa, void *item) 1097extern struct swap_info_struct *swap_info[];
1098
1099static int shmem_find_swap_entries(struct address_space *mapping,
1100 pgoff_t start, unsigned int nr_entries,
1101 struct page **entries, pgoff_t *indices,
1102 bool frontswap)
1093{ 1103{
1094 XA_STATE(xas, xa, 0); 1104 XA_STATE(xas, &mapping->i_pages, start);
1095 unsigned int checked = 0; 1105 struct page *page;
1096 void *entry; 1106 unsigned int ret = 0;
1107
1108 if (!nr_entries)
1109 return 0;
1097 1110
1098 rcu_read_lock(); 1111 rcu_read_lock();
1099 xas_for_each(&xas, entry, ULONG_MAX) { 1112 xas_for_each(&xas, page, ULONG_MAX) {
1100 if (xas_retry(&xas, entry)) 1113 if (xas_retry(&xas, page))
1101 continue; 1114 continue;
1102 if (entry == item) 1115
1103 break; 1116 if (!xa_is_value(page))
1104 checked++;
1105 if ((checked % XA_CHECK_SCHED) != 0)
1106 continue; 1117 continue;
1107 xas_pause(&xas); 1118
1108 cond_resched_rcu(); 1119 if (frontswap) {
1120 swp_entry_t entry = radix_to_swp_entry(page);
1121
1122 if (!frontswap_test(swap_info[swp_type(entry)],
1123 swp_offset(entry)))
1124 continue;
1125 }
1126
1127 indices[ret] = xas.xa_index;
1128 entries[ret] = page;
1129
1130 if (need_resched()) {
1131 xas_pause(&xas);
1132 cond_resched_rcu();
1133 }
1134 if (++ret == nr_entries)
1135 break;
1109 } 1136 }
1110 rcu_read_unlock(); 1137 rcu_read_unlock();
1111 1138
1112 return entry ? xas.xa_index : -1; 1139 return ret;
1113} 1140}
1114 1141
1115/* 1142/*
1116 * If swap found in inode, free it and move page from swapcache to filecache. 1143 * Move the swapped pages for an inode to page cache. Returns the count
1144 * of pages swapped in, or the error in case of failure.
1117 */ 1145 */
1118static int shmem_unuse_inode(struct shmem_inode_info *info, 1146static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1119 swp_entry_t swap, struct page **pagep) 1147 pgoff_t *indices)
1120{ 1148{
1121 struct address_space *mapping = info->vfs_inode.i_mapping; 1149 int i = 0;
1122 void *radswap; 1150 int ret = 0;
1123 pgoff_t index;
1124 gfp_t gfp;
1125 int error = 0; 1151 int error = 0;
1152 struct address_space *mapping = inode->i_mapping;
1126 1153
1127 radswap = swp_to_radix_entry(swap); 1154 for (i = 0; i < pvec.nr; i++) {
1128 index = find_swap_entry(&mapping->i_pages, radswap); 1155 struct page *page = pvec.pages[i];
1129 if (index == -1)
1130 return -EAGAIN; /* tell shmem_unuse we found nothing */
1131 1156
1132 /* 1157 if (!xa_is_value(page))
1133 * Move _head_ to start search for next from here. 1158 continue;
1134 * But be careful: shmem_evict_inode checks list_empty without taking 1159 error = shmem_swapin_page(inode, indices[i],
1135 * mutex, and there's an instant in list_move_tail when info->swaplist 1160 &page, SGP_CACHE,
1136 * would appear empty, if it were the only one on shmem_swaplist. 1161 mapping_gfp_mask(mapping),
1137 */ 1162 NULL, NULL);
1138 if (shmem_swaplist.next != &info->swaplist) 1163 if (error == 0) {
1139 list_move_tail(&shmem_swaplist, &info->swaplist); 1164 unlock_page(page);
1140 1165 put_page(page);
1141 gfp = mapping_gfp_mask(mapping); 1166 ret++;
1142 if (shmem_should_replace_page(*pagep, gfp)) { 1167 }
1143 mutex_unlock(&shmem_swaplist_mutex); 1168 if (error == -ENOMEM)
1144 error = shmem_replace_page(pagep, gfp, info, index); 1169 break;
1145 mutex_lock(&shmem_swaplist_mutex); 1170 error = 0;
1146 /*
1147 * We needed to drop mutex to make that restrictive page
1148 * allocation, but the inode might have been freed while we
1149 * dropped it: although a racing shmem_evict_inode() cannot
1150 * complete without emptying the page cache, our page lock
1151 * on this swapcache page is not enough to prevent that -
1152 * free_swap_and_cache() of our swap entry will only
1153 * trylock_page(), removing swap from page cache whatever.
1154 *
1155 * We must not proceed to shmem_add_to_page_cache() if the
1156 * inode has been freed, but of course we cannot rely on
1157 * inode or mapping or info to check that. However, we can
1158 * safely check if our swap entry is still in use (and here
1159 * it can't have got reused for another page): if it's still
1160 * in use, then the inode cannot have been freed yet, and we
1161 * can safely proceed (if it's no longer in use, that tells
1162 * nothing about the inode, but we don't need to unuse swap).
1163 */
1164 if (!page_swapcount(*pagep))
1165 error = -ENOENT;
1166 } 1171 }
1172 return error ? error : ret;
1173}
1167 1174
1168 /* 1175/*
1169 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 1176 * If swap found in inode, free it and move page from swapcache to filecache.
1170 * but also to hold up shmem_evict_inode(): so inode cannot be freed 1177 */
1171 * beneath us (pagelock doesn't help until the page is in pagecache). 1178static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1172 */ 1179 bool frontswap, unsigned long *fs_pages_to_unuse)
1173 if (!error) 1180{
1174 error = shmem_add_to_page_cache(*pagep, mapping, index, 1181 struct address_space *mapping = inode->i_mapping;
1175 radswap, gfp); 1182 pgoff_t start = 0;
1176 if (error != -ENOMEM) { 1183 struct pagevec pvec;
1177 /* 1184 pgoff_t indices[PAGEVEC_SIZE];
1178 * Truncation and eviction use free_swap_and_cache(), which 1185 bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1179 * only does trylock page: if we raced, best clean up here. 1186 int ret = 0;
1180 */ 1187
1181 delete_from_swap_cache(*pagep); 1188 pagevec_init(&pvec);
1182 set_page_dirty(*pagep); 1189 do {
1183 if (!error) { 1190 unsigned int nr_entries = PAGEVEC_SIZE;
1184 spin_lock_irq(&info->lock); 1191
1185 info->swapped--; 1192 if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1186 spin_unlock_irq(&info->lock); 1193 nr_entries = *fs_pages_to_unuse;
1187 swap_free(swap); 1194
1195 pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1196 pvec.pages, indices,
1197 frontswap);
1198 if (pvec.nr == 0) {
1199 ret = 0;
1200 break;
1188 } 1201 }
1189 } 1202
1190 return error; 1203 ret = shmem_unuse_swap_entries(inode, pvec, indices);
1204 if (ret < 0)
1205 break;
1206
1207 if (frontswap_partial) {
1208 *fs_pages_to_unuse -= ret;
1209 if (*fs_pages_to_unuse == 0) {
1210 ret = FRONTSWAP_PAGES_UNUSED;
1211 break;
1212 }
1213 }
1214
1215 start = indices[pvec.nr - 1];
1216 } while (true);
1217
1218 return ret;
1191} 1219}
1192 1220
1193/* 1221/*
1194 * Search through swapped inodes to find and replace swap by page. 1222 * Read all the shared memory data that resides in the swap
1223 * device 'type' back into memory, so the swap device can be
1224 * unused.
1195 */ 1225 */
1196int shmem_unuse(swp_entry_t swap, struct page *page) 1226int shmem_unuse(unsigned int type, bool frontswap,
1227 unsigned long *fs_pages_to_unuse)
1197{ 1228{
1198 struct list_head *this, *next; 1229 struct shmem_inode_info *info, *next;
1199 struct shmem_inode_info *info; 1230 struct inode *inode;
1200 struct mem_cgroup *memcg; 1231 struct inode *prev_inode = NULL;
1201 int error = 0; 1232 int error = 0;
1202 1233
1203 /* 1234 if (list_empty(&shmem_swaplist))
1204 * There's a faint possibility that swap page was replaced before 1235 return 0;
1205 * caller locked it: caller will come back later with the right page. 1236
1206 */ 1237 mutex_lock(&shmem_swaplist_mutex);
1207 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
1208 goto out;
1209 1238
1210 /* 1239 /*
1211 * Charge page using GFP_KERNEL while we can wait, before taking 1240 * The extra refcount on the inode is necessary to safely dereference
1212 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 1241 * p->next after re-acquiring the lock. New shmem inodes with swap
1213 * Charged back to the user (not to caller) when swap account is used. 1242 * get added to the end of the list and we will scan them all.
1214 */ 1243 */
1215 error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, 1244 list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1216 &memcg, false); 1245 if (!info->swapped) {
1217 if (error)
1218 goto out;
1219 /* No memory allocation: swap entry occupies the slot for the page */
1220 error = -EAGAIN;
1221
1222 mutex_lock(&shmem_swaplist_mutex);
1223 list_for_each_safe(this, next, &shmem_swaplist) {
1224 info = list_entry(this, struct shmem_inode_info, swaplist);
1225 if (info->swapped)
1226 error = shmem_unuse_inode(info, swap, &page);
1227 else
1228 list_del_init(&info->swaplist); 1246 list_del_init(&info->swaplist);
1247 continue;
1248 }
1249
1250 inode = igrab(&info->vfs_inode);
1251 if (!inode)
1252 continue;
1253
1254 mutex_unlock(&shmem_swaplist_mutex);
1255 if (prev_inode)
1256 iput(prev_inode);
1257 prev_inode = inode;
1258
1259 error = shmem_unuse_inode(inode, type, frontswap,
1260 fs_pages_to_unuse);
1229 cond_resched(); 1261 cond_resched();
1230 if (error != -EAGAIN) 1262
1263 mutex_lock(&shmem_swaplist_mutex);
1264 next = list_next_entry(info, swaplist);
1265 if (!info->swapped)
1266 list_del_init(&info->swaplist);
1267 if (error)
1231 break; 1268 break;
1232 /* found nothing in this: move on to search the next */
1233 } 1269 }
1234 mutex_unlock(&shmem_swaplist_mutex); 1270 mutex_unlock(&shmem_swaplist_mutex);
1235 1271
1236 if (error) { 1272 if (prev_inode)
1237 if (error != -ENOMEM) 1273 iput(prev_inode);
1238 error = 0; 1274
1239 mem_cgroup_cancel_charge(page, memcg, false);
1240 } else
1241 mem_cgroup_commit_charge(page, memcg, true, false);
1242out:
1243 unlock_page(page);
1244 put_page(page);
1245 return error; 1275 return error;
1246} 1276}
1247 1277
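The rework above replaces the old one-entry-at-a-time find_swap_entry()/shmem_unuse_inode() pair with a batched walk: shmem_find_swap_entries() collects up to a pagevec's worth of swap entries from the inode's page cache, shmem_unuse_swap_entries() swaps each one back in, and shmem_unuse_inode() resumes from the last index returned until nothing is found. A minimal, self-contained userspace model of that resume-from-last-index batching is sketched below; find_batch() and the slots[] array are invented for illustration and are not kernel APIs.

    #include <stdio.h>
    #include <stddef.h>

    #define BATCH 16                        /* stands in for PAGEVEC_SIZE */

    /* Toy "mapping": non-zero slots play the role of swap entries. */
    static int slots[100];

    /* Stand-in for shmem_find_swap_entries(): collect up to 'max'
     * indices of interesting entries at or after 'start'. */
    static size_t find_batch(size_t start, size_t max, size_t *indices)
    {
        size_t n = 0;

        for (size_t i = start; i < sizeof(slots) / sizeof(slots[0]); i++) {
            if (!slots[i])
                continue;
            indices[n++] = i;
            if (n == max)
                break;
        }
        return n;
    }

    int main(void)
    {
        size_t indices[BATCH];
        size_t start = 0, n, total = 0;

        for (int i = 0; i < 100; i += 3)    /* mark some entries "swapped" */
            slots[i] = 1;

        do {
            n = find_batch(start, BATCH, indices);
            if (n == 0)
                break;
            for (size_t i = 0; i < n; i++) {
                slots[indices[i]] = 0;      /* "swap the entry back in" */
                total++;
            }
            /* Handled entries no longer match, so resuming at the last
             * index (as shmem_unuse_inode() does) cannot loop forever. */
            start = indices[n - 1];
        } while (1);

        printf("processed %zu entries\n", total);
        return 0;
    }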
@@ -1325,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1325 */ 1355 */
1326 mutex_lock(&shmem_swaplist_mutex); 1356 mutex_lock(&shmem_swaplist_mutex);
1327 if (list_empty(&info->swaplist)) 1357 if (list_empty(&info->swaplist))
1328 list_add_tail(&info->swaplist, &shmem_swaplist); 1358 list_add(&info->swaplist, &shmem_swaplist);
1329 1359
1330 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1360 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1331 spin_lock_irq(&info->lock); 1361 spin_lock_irq(&info->lock);
@@ -1576,6 +1606,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1576} 1606}
1577 1607
1578/* 1608/*
1609 * Swap in the page pointed to by *pagep.
1610 * Caller has to make sure that *pagep contains a valid swapped page.
 1611 * Returns 0 and the page in pagep on success. On failure, returns the
 1612 * error code and NULL in *pagep.
1613 */
1614static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1615 struct page **pagep, enum sgp_type sgp,
1616 gfp_t gfp, struct vm_area_struct *vma,
1617 vm_fault_t *fault_type)
1618{
1619 struct address_space *mapping = inode->i_mapping;
1620 struct shmem_inode_info *info = SHMEM_I(inode);
1621 struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
1622 struct mem_cgroup *memcg;
1623 struct page *page;
1624 swp_entry_t swap;
1625 int error;
1626
1627 VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1628 swap = radix_to_swp_entry(*pagep);
1629 *pagep = NULL;
1630
1631 /* Look it up and read it in.. */
1632 page = lookup_swap_cache(swap, NULL, 0);
1633 if (!page) {
1634 /* Or update major stats only when swapin succeeds?? */
1635 if (fault_type) {
1636 *fault_type |= VM_FAULT_MAJOR;
1637 count_vm_event(PGMAJFAULT);
1638 count_memcg_event_mm(charge_mm, PGMAJFAULT);
1639 }
1640 /* Here we actually start the io */
1641 page = shmem_swapin(swap, gfp, info, index);
1642 if (!page) {
1643 error = -ENOMEM;
1644 goto failed;
1645 }
1646 }
1647
1648 /* We have to do this with page locked to prevent races */
1649 lock_page(page);
1650 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1651 !shmem_confirm_swap(mapping, index, swap)) {
1652 error = -EEXIST;
1653 goto unlock;
1654 }
1655 if (!PageUptodate(page)) {
1656 error = -EIO;
1657 goto failed;
1658 }
1659 wait_on_page_writeback(page);
1660
1661 if (shmem_should_replace_page(page, gfp)) {
1662 error = shmem_replace_page(&page, gfp, info, index);
1663 if (error)
1664 goto failed;
1665 }
1666
1667 error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1668 false);
1669 if (!error) {
1670 error = shmem_add_to_page_cache(page, mapping, index,
1671 swp_to_radix_entry(swap), gfp);
1672 /*
1673 * We already confirmed swap under page lock, and make
1674 * no memory allocation here, so usually no possibility
1675 * of error; but free_swap_and_cache() only trylocks a
1676 * page, so it is just possible that the entry has been
1677 * truncated or holepunched since swap was confirmed.
1678 * shmem_undo_range() will have done some of the
1679 * unaccounting, now delete_from_swap_cache() will do
1680 * the rest.
1681 */
1682 if (error) {
1683 mem_cgroup_cancel_charge(page, memcg, false);
1684 delete_from_swap_cache(page);
1685 }
1686 }
1687 if (error)
1688 goto failed;
1689
1690 mem_cgroup_commit_charge(page, memcg, true, false);
1691
1692 spin_lock_irq(&info->lock);
1693 info->swapped--;
1694 shmem_recalc_inode(inode);
1695 spin_unlock_irq(&info->lock);
1696
1697 if (sgp == SGP_WRITE)
1698 mark_page_accessed(page);
1699
1700 delete_from_swap_cache(page);
1701 set_page_dirty(page);
1702 swap_free(swap);
1703
1704 *pagep = page;
1705 return 0;
1706failed:
1707 if (!shmem_confirm_swap(mapping, index, swap))
1708 error = -EEXIST;
1709unlock:
1710 if (page) {
1711 unlock_page(page);
1712 put_page(page);
1713 }
1714
1715 return error;
1716}
1717
1718/*
1579 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1719 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1580 * 1720 *
1581 * If we allocate a new one we do not mark it dirty. That's up to the 1721 * If we allocate a new one we do not mark it dirty. That's up to the
@@ -1596,7 +1736,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1596 struct mm_struct *charge_mm; 1736 struct mm_struct *charge_mm;
1597 struct mem_cgroup *memcg; 1737 struct mem_cgroup *memcg;
1598 struct page *page; 1738 struct page *page;
1599 swp_entry_t swap;
1600 enum sgp_type sgp_huge = sgp; 1739 enum sgp_type sgp_huge = sgp;
1601 pgoff_t hindex = index; 1740 pgoff_t hindex = index;
1602 int error; 1741 int error;
@@ -1608,17 +1747,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1608 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) 1747 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
1609 sgp = SGP_CACHE; 1748 sgp = SGP_CACHE;
1610repeat: 1749repeat:
1611 swap.val = 0; 1750 if (sgp <= SGP_CACHE &&
1751 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1752 return -EINVAL;
1753 }
1754
1755 sbinfo = SHMEM_SB(inode->i_sb);
1756 charge_mm = vma ? vma->vm_mm : current->mm;
1757
1612 page = find_lock_entry(mapping, index); 1758 page = find_lock_entry(mapping, index);
1613 if (xa_is_value(page)) { 1759 if (xa_is_value(page)) {
1614 swap = radix_to_swp_entry(page); 1760 error = shmem_swapin_page(inode, index, &page,
1615 page = NULL; 1761 sgp, gfp, vma, fault_type);
1616 } 1762 if (error == -EEXIST)
1763 goto repeat;
1617 1764
1618 if (sgp <= SGP_CACHE && 1765 *pagep = page;
1619 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1766 return error;
1620 error = -EINVAL;
1621 goto unlock;
1622 } 1767 }
1623 1768
1624 if (page && sgp == SGP_WRITE) 1769 if (page && sgp == SGP_WRITE)
@@ -1632,7 +1777,7 @@ repeat:
1632 put_page(page); 1777 put_page(page);
1633 page = NULL; 1778 page = NULL;
1634 } 1779 }
1635 if (page || (sgp == SGP_READ && !swap.val)) { 1780 if (page || sgp == SGP_READ) {
1636 *pagep = page; 1781 *pagep = page;
1637 return 0; 1782 return 0;
1638 } 1783 }
@@ -1641,215 +1786,138 @@ repeat:
1641 * Fast cache lookup did not find it: 1786 * Fast cache lookup did not find it:
1642 * bring it back from swap or allocate. 1787 * bring it back from swap or allocate.
1643 */ 1788 */
1644 sbinfo = SHMEM_SB(inode->i_sb);
1645 charge_mm = vma ? vma->vm_mm : current->mm;
1646
1647 if (swap.val) {
1648 /* Look it up and read it in.. */
1649 page = lookup_swap_cache(swap, NULL, 0);
1650 if (!page) {
1651 /* Or update major stats only when swapin succeeds?? */
1652 if (fault_type) {
1653 *fault_type |= VM_FAULT_MAJOR;
1654 count_vm_event(PGMAJFAULT);
1655 count_memcg_event_mm(charge_mm, PGMAJFAULT);
1656 }
1657 /* Here we actually start the io */
1658 page = shmem_swapin(swap, gfp, info, index);
1659 if (!page) {
1660 error = -ENOMEM;
1661 goto failed;
1662 }
1663 }
1664
1665 /* We have to do this with page locked to prevent races */
1666 lock_page(page);
1667 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1668 !shmem_confirm_swap(mapping, index, swap)) {
1669 error = -EEXIST; /* try again */
1670 goto unlock;
1671 }
1672 if (!PageUptodate(page)) {
1673 error = -EIO;
1674 goto failed;
1675 }
1676 wait_on_page_writeback(page);
1677
1678 if (shmem_should_replace_page(page, gfp)) {
1679 error = shmem_replace_page(&page, gfp, info, index);
1680 if (error)
1681 goto failed;
1682 }
1683
1684 error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1685 false);
1686 if (!error) {
1687 error = shmem_add_to_page_cache(page, mapping, index,
1688 swp_to_radix_entry(swap), gfp);
1689 /*
1690 * We already confirmed swap under page lock, and make
1691 * no memory allocation here, so usually no possibility
1692 * of error; but free_swap_and_cache() only trylocks a
1693 * page, so it is just possible that the entry has been
1694 * truncated or holepunched since swap was confirmed.
1695 * shmem_undo_range() will have done some of the
1696 * unaccounting, now delete_from_swap_cache() will do
1697 * the rest.
1698 * Reset swap.val? No, leave it so "failed" goes back to
1699 * "repeat": reading a hole and writing should succeed.
1700 */
1701 if (error) {
1702 mem_cgroup_cancel_charge(page, memcg, false);
1703 delete_from_swap_cache(page);
1704 }
1705 }
1706 if (error)
1707 goto failed;
1708
1709 mem_cgroup_commit_charge(page, memcg, true, false);
1710
1711 spin_lock_irq(&info->lock);
1712 info->swapped--;
1713 shmem_recalc_inode(inode);
1714 spin_unlock_irq(&info->lock);
1715
1716 if (sgp == SGP_WRITE)
1717 mark_page_accessed(page);
1718 1789
1719 delete_from_swap_cache(page); 1790 if (vma && userfaultfd_missing(vma)) {
1720 set_page_dirty(page); 1791 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1721 swap_free(swap); 1792 return 0;
1722 1793 }
1723 } else {
1724 if (vma && userfaultfd_missing(vma)) {
1725 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1726 return 0;
1727 }
1728 1794
1729 /* shmem_symlink() */ 1795 /* shmem_symlink() */
1730 if (mapping->a_ops != &shmem_aops) 1796 if (mapping->a_ops != &shmem_aops)
1731 goto alloc_nohuge; 1797 goto alloc_nohuge;
1732 if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) 1798 if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1733 goto alloc_nohuge; 1799 goto alloc_nohuge;
1734 if (shmem_huge == SHMEM_HUGE_FORCE) 1800 if (shmem_huge == SHMEM_HUGE_FORCE)
1801 goto alloc_huge;
1802 switch (sbinfo->huge) {
1803 loff_t i_size;
1804 pgoff_t off;
1805 case SHMEM_HUGE_NEVER:
1806 goto alloc_nohuge;
1807 case SHMEM_HUGE_WITHIN_SIZE:
1808 off = round_up(index, HPAGE_PMD_NR);
1809 i_size = round_up(i_size_read(inode), PAGE_SIZE);
1810 if (i_size >= HPAGE_PMD_SIZE &&
1811 i_size >> PAGE_SHIFT >= off)
1735 goto alloc_huge; 1812 goto alloc_huge;
1736 switch (sbinfo->huge) { 1813 /* fallthrough */
1737 loff_t i_size; 1814 case SHMEM_HUGE_ADVISE:
1738 pgoff_t off; 1815 if (sgp_huge == SGP_HUGE)
1739 case SHMEM_HUGE_NEVER: 1816 goto alloc_huge;
1740 goto alloc_nohuge; 1817 /* TODO: implement fadvise() hints */
1741 case SHMEM_HUGE_WITHIN_SIZE: 1818 goto alloc_nohuge;
1742 off = round_up(index, HPAGE_PMD_NR); 1819 }
1743 i_size = round_up(i_size_read(inode), PAGE_SIZE);
1744 if (i_size >= HPAGE_PMD_SIZE &&
1745 i_size >> PAGE_SHIFT >= off)
1746 goto alloc_huge;
1747 /* fallthrough */
1748 case SHMEM_HUGE_ADVISE:
1749 if (sgp_huge == SGP_HUGE)
1750 goto alloc_huge;
1751 /* TODO: implement fadvise() hints */
1752 goto alloc_nohuge;
1753 }
1754 1820
1755alloc_huge: 1821alloc_huge:
1756 page = shmem_alloc_and_acct_page(gfp, inode, index, true); 1822 page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1757 if (IS_ERR(page)) { 1823 if (IS_ERR(page)) {
1758alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, 1824alloc_nohuge:
1759 index, false); 1825 page = shmem_alloc_and_acct_page(gfp, inode,
1760 } 1826 index, false);
1761 if (IS_ERR(page)) { 1827 }
1762 int retry = 5; 1828 if (IS_ERR(page)) {
1763 error = PTR_ERR(page); 1829 int retry = 5;
1764 page = NULL;
1765 if (error != -ENOSPC)
1766 goto failed;
1767 /*
1768 * Try to reclaim some spece by splitting a huge page
1769 * beyond i_size on the filesystem.
1770 */
1771 while (retry--) {
1772 int ret;
1773 ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1774 if (ret == SHRINK_STOP)
1775 break;
1776 if (ret)
1777 goto alloc_nohuge;
1778 }
1779 goto failed;
1780 }
1781
1782 if (PageTransHuge(page))
1783 hindex = round_down(index, HPAGE_PMD_NR);
1784 else
1785 hindex = index;
1786 1830
1787 if (sgp == SGP_WRITE) 1831 error = PTR_ERR(page);
1788 __SetPageReferenced(page); 1832 page = NULL;
1833 if (error != -ENOSPC)
1834 goto unlock;
1835 /*
1836 * Try to reclaim some space by splitting a huge page
1837 * beyond i_size on the filesystem.
1838 */
1839 while (retry--) {
1840 int ret;
1789 1841
1790 error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, 1842 ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1791 PageTransHuge(page)); 1843 if (ret == SHRINK_STOP)
1792 if (error) 1844 break;
1793 goto unacct; 1845 if (ret)
1794 error = shmem_add_to_page_cache(page, mapping, hindex, 1846 goto alloc_nohuge;
1795 NULL, gfp & GFP_RECLAIM_MASK);
1796 if (error) {
1797 mem_cgroup_cancel_charge(page, memcg,
1798 PageTransHuge(page));
1799 goto unacct;
1800 } 1847 }
1801 mem_cgroup_commit_charge(page, memcg, false, 1848 goto unlock;
1802 PageTransHuge(page)); 1849 }
1803 lru_cache_add_anon(page);
1804 1850
1805 spin_lock_irq(&info->lock); 1851 if (PageTransHuge(page))
1806 info->alloced += 1 << compound_order(page); 1852 hindex = round_down(index, HPAGE_PMD_NR);
1807 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); 1853 else
1808 shmem_recalc_inode(inode); 1854 hindex = index;
1809 spin_unlock_irq(&info->lock);
1810 alloced = true;
1811 1855
1812 if (PageTransHuge(page) && 1856 if (sgp == SGP_WRITE)
1813 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 1857 __SetPageReferenced(page);
1814 hindex + HPAGE_PMD_NR - 1) { 1858
1815 /* 1859 error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1816 * Part of the huge page is beyond i_size: subject 1860 PageTransHuge(page));
1817 * to shrink under memory pressure. 1861 if (error)
1818 */ 1862 goto unacct;
1819 spin_lock(&sbinfo->shrinklist_lock); 1863 error = shmem_add_to_page_cache(page, mapping, hindex,
1820 /* 1864 NULL, gfp & GFP_RECLAIM_MASK);
1821 * _careful to defend against unlocked access to 1865 if (error) {
1822 * ->shrink_list in shmem_unused_huge_shrink() 1866 mem_cgroup_cancel_charge(page, memcg,
1823 */ 1867 PageTransHuge(page));
1824 if (list_empty_careful(&info->shrinklist)) { 1868 goto unacct;
1825 list_add_tail(&info->shrinklist, 1869 }
1826 &sbinfo->shrinklist); 1870 mem_cgroup_commit_charge(page, memcg, false,
1827 sbinfo->shrinklist_len++; 1871 PageTransHuge(page));
1828 } 1872 lru_cache_add_anon(page);
1829 spin_unlock(&sbinfo->shrinklist_lock);
1830 }
1831 1873
1874 spin_lock_irq(&info->lock);
1875 info->alloced += 1 << compound_order(page);
1876 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1877 shmem_recalc_inode(inode);
1878 spin_unlock_irq(&info->lock);
1879 alloced = true;
1880
1881 if (PageTransHuge(page) &&
1882 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1883 hindex + HPAGE_PMD_NR - 1) {
1832 /* 1884 /*
1833 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1885 * Part of the huge page is beyond i_size: subject
1886 * to shrink under memory pressure.
1834 */ 1887 */
1835 if (sgp == SGP_FALLOC) 1888 spin_lock(&sbinfo->shrinklist_lock);
1836 sgp = SGP_WRITE;
1837clear:
1838 /* 1889 /*
1839 * Let SGP_WRITE caller clear ends if write does not fill page; 1890 * _careful to defend against unlocked access to
1840 * but SGP_FALLOC on a page fallocated earlier must initialize 1891 * ->shrink_list in shmem_unused_huge_shrink()
1841 * it now, lest undo on failure cancel our earlier guarantee.
1842 */ 1892 */
1843 if (sgp != SGP_WRITE && !PageUptodate(page)) { 1893 if (list_empty_careful(&info->shrinklist)) {
1844 struct page *head = compound_head(page); 1894 list_add_tail(&info->shrinklist,
1845 int i; 1895 &sbinfo->shrinklist);
1896 sbinfo->shrinklist_len++;
1897 }
1898 spin_unlock(&sbinfo->shrinklist_lock);
1899 }
1846 1900
1847 for (i = 0; i < (1 << compound_order(head)); i++) { 1901 /*
1848 clear_highpage(head + i); 1902 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1849 flush_dcache_page(head + i); 1903 */
1850 } 1904 if (sgp == SGP_FALLOC)
1851 SetPageUptodate(head); 1905 sgp = SGP_WRITE;
1906clear:
1907 /*
1908 * Let SGP_WRITE caller clear ends if write does not fill page;
1909 * but SGP_FALLOC on a page fallocated earlier must initialize
1910 * it now, lest undo on failure cancel our earlier guarantee.
1911 */
1912 if (sgp != SGP_WRITE && !PageUptodate(page)) {
1913 struct page *head = compound_head(page);
1914 int i;
1915
1916 for (i = 0; i < (1 << compound_order(head)); i++) {
1917 clear_highpage(head + i);
1918 flush_dcache_page(head + i);
1852 } 1919 }
1920 SetPageUptodate(head);
1853 } 1921 }
1854 1922
1855 /* Perhaps the file has been truncated since we checked */ 1923 /* Perhaps the file has been truncated since we checked */
@@ -1879,9 +1947,6 @@ unacct:
1879 put_page(page); 1947 put_page(page);
1880 goto alloc_nohuge; 1948 goto alloc_nohuge;
1881 } 1949 }
1882failed:
1883 if (swap.val && !shmem_confirm_swap(mapping, index, swap))
1884 error = -EEXIST;
1885unlock: 1950unlock:
1886 if (page) { 1951 if (page) {
1887 unlock_page(page); 1952 unlock_page(page);
@@ -2125,6 +2190,24 @@ out_nomem:
2125 2190
2126static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2191static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2127{ 2192{
2193 struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2194
2195 if (info->seals & F_SEAL_FUTURE_WRITE) {
2196 /*
2197 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2198 * "future write" seal active.
2199 */
2200 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2201 return -EPERM;
2202
2203 /*
2204 * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
2205 * read-only mapping, take care to not allow mprotect to revert
2206 * protections.
2207 */
2208 vma->vm_flags &= ~(VM_MAYWRITE);
2209 }
2210
2128 file_accessed(file); 2211 file_accessed(file);
2129 vma->vm_ops = &shmem_vm_ops; 2212 vma->vm_ops = &shmem_vm_ops;
2130 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 2213 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
@@ -2375,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
2375 pgoff_t index = pos >> PAGE_SHIFT; 2458 pgoff_t index = pos >> PAGE_SHIFT;
2376 2459
2377 /* i_mutex is held by caller */ 2460 /* i_mutex is held by caller */
2378 if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { 2461 if (unlikely(info->seals & (F_SEAL_GROW |
2379 if (info->seals & F_SEAL_WRITE) 2462 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2463 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2380 return -EPERM; 2464 return -EPERM;
2381 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2465 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2382 return -EPERM; 2466 return -EPERM;
@@ -2639,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2639 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 2723 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
2640 2724
2641 /* protected by i_mutex */ 2725 /* protected by i_mutex */
2642 if (info->seals & F_SEAL_WRITE) { 2726 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
2643 error = -EPERM; 2727 error = -EPERM;
2644 goto out; 2728 goto out;
2645 } 2729 }
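The three seal hunks above (shmem_mmap(), shmem_write_begin() and shmem_fallocate()) are the shmem side of the new F_SEAL_FUTURE_WRITE memfd seal: mappings that already exist keep working, but any new shared writable mapping, write(2), or hole-punching fallocate(2) is refused. A small userspace sketch of that behaviour is below; it assumes a libc with memfd_create() and defines F_SEAL_FUTURE_WRITE locally in case the installed uapi headers predate this series.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef F_SEAL_FUTURE_WRITE
    #define F_SEAL_FUTURE_WRITE 0x0010      /* added by this patch series */
    #endif

    int main(void)
    {
        int fd = memfd_create("demo", MFD_ALLOW_SEALING);
        if (fd < 0 || ftruncate(fd, 4096) < 0) {
            perror("memfd setup");
            return 1;
        }

        /* A shared writable mapping taken before sealing ... */
        char *old = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
            perror("F_ADD_SEALS");

        /* ... is still writable afterwards (unlike with F_SEAL_WRITE). */
        if (old != MAP_FAILED)
            strcpy(old, "still writable");

        /* New shared writable mappings are refused with EPERM ... */
        char *again = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (again == MAP_FAILED)
            perror("mmap after F_SEAL_FUTURE_WRITE");

        /* ... and so is write(2). */
        if (write(fd, "x", 1) < 0)
            perror("write after F_SEAL_FUTURE_WRITE");

        return 0;
    }

The VM_MAYWRITE clearing in the shmem_mmap() hunk additionally keeps mprotect(PROT_WRITE) from upgrading a read-only shared mapping created after the seal was set.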
@@ -3847,7 +3931,8 @@ int __init shmem_init(void)
3847 return 0; 3931 return 0;
3848} 3932}
3849 3933
3850int shmem_unuse(swp_entry_t swap, struct page *page) 3934int shmem_unuse(unsigned int type, bool frontswap,
3935 unsigned long *fs_pages_to_unuse)
3851{ 3936{
3852 return 0; 3937 return 0;
3853} 3938}
diff --git a/mm/slab.c b/mm/slab.c
index 91c1863df93d..28652e4218e0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -550,14 +550,6 @@ static void start_cpu_timer(int cpu)
550 550
551static void init_arraycache(struct array_cache *ac, int limit, int batch) 551static void init_arraycache(struct array_cache *ac, int limit, int batch)
552{ 552{
553 /*
554 * The array_cache structures contain pointers to free object.
555 * However, when such objects are allocated or transferred to another
556 * cache the pointers are not cleared and they could be counted as
557 * valid references during a kmemleak scan. Therefore, kmemleak must
558 * not scan such objects.
559 */
560 kmemleak_no_scan(ac);
561 if (ac) { 553 if (ac) {
562 ac->avail = 0; 554 ac->avail = 0;
563 ac->limit = limit; 555 ac->limit = limit;
@@ -573,6 +565,14 @@ static struct array_cache *alloc_arraycache(int node, int entries,
573 struct array_cache *ac = NULL; 565 struct array_cache *ac = NULL;
574 566
575 ac = kmalloc_node(memsize, gfp, node); 567 ac = kmalloc_node(memsize, gfp, node);
568 /*
569 * The array_cache structures contain pointers to free object.
570 * However, when such objects are allocated or transferred to another
571 * cache the pointers are not cleared and they could be counted as
572 * valid references during a kmemleak scan. Therefore, kmemleak must
573 * not scan such objects.
574 */
575 kmemleak_no_scan(ac);
576 init_arraycache(ac, entries, batchcount); 576 init_arraycache(ac, entries, batchcount);
577 return ac; 577 return ac;
578} 578}
@@ -667,6 +667,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
667 667
668 alc = kmalloc_node(memsize, gfp, node); 668 alc = kmalloc_node(memsize, gfp, node);
669 if (alc) { 669 if (alc) {
670 kmemleak_no_scan(alc);
670 init_arraycache(&alc->ac, entries, batch); 671 init_arraycache(&alc->ac, entries, batch);
671 spin_lock_init(&alc->lock); 672 spin_lock_init(&alc->lock);
672 } 673 }
@@ -676,12 +677,11 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
676static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 677static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
677{ 678{
678 struct alien_cache **alc_ptr; 679 struct alien_cache **alc_ptr;
679 size_t memsize = sizeof(void *) * nr_node_ids;
680 int i; 680 int i;
681 681
682 if (limit > 1) 682 if (limit > 1)
683 limit = 12; 683 limit = 12;
684 alc_ptr = kzalloc_node(memsize, gfp, node); 684 alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
685 if (!alc_ptr) 685 if (!alc_ptr)
686 return NULL; 686 return NULL;
687 687
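The alloc_alien_cache() hunk above replaces an open-coded sizeof(void *) * nr_node_ids allocation with kcalloc_node(), which checks the element-count multiplication for overflow before allocating. A userspace analogue of why that matters is below (calloc() performs the same check, while a bare malloc(n * size) silently wraps); the sizes are only chosen to force the wraparound.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    int main(void)
    {
        size_t n = SIZE_MAX / 2 + 2;    /* deliberately absurd element count */
        size_t sz = 2;

        /* n * sz wraps to a tiny value: malloc() "succeeds", but with a
         * buffer far smaller than the caller believes it asked for. */
        void *bad = malloc(n * sz);
        printf("malloc(n * sz) -> %p (n * sz wrapped to %zu bytes)\n",
               bad, n * sz);

        /* calloc() spots the overflow and refuses instead. */
        void *good = calloc(n, sz);
        printf("calloc(n, sz) -> %p\n", good);      /* NULL */

        free(bad);
        free(good);
        return 0;
    }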
@@ -1727,6 +1727,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1727 * This could be made much more intelligent. For now, try to avoid using 1727 * This could be made much more intelligent. For now, try to avoid using
1728 * high order pages for slabs. When the gfp() functions are more friendly 1728 * high order pages for slabs. When the gfp() functions are more friendly
1729 * towards high-order requests, this should be changed. 1729 * towards high-order requests, this should be changed.
1730 *
1731 * Return: number of left-over bytes in a slab
1730 */ 1732 */
1731static size_t calculate_slab_order(struct kmem_cache *cachep, 1733static size_t calculate_slab_order(struct kmem_cache *cachep,
1732 size_t size, slab_flags_t flags) 1734 size_t size, slab_flags_t flags)
@@ -1975,6 +1977,8 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
1975 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1977 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1976 * cacheline. This can be beneficial if you're counting cycles as closely 1978 * cacheline. This can be beneficial if you're counting cycles as closely
1977 * as davem. 1979 * as davem.
1980 *
1981 * Return: a pointer to the created cache or %NULL in case of error
1978 */ 1982 */
1979int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) 1983int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
1980{ 1984{
@@ -3542,6 +3546,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
3542 * 3546 *
3543 * Allocate an object from this cache. The flags are only relevant 3547 * Allocate an object from this cache. The flags are only relevant
3544 * if the cache has no available objects. 3548 * if the cache has no available objects.
3549 *
3550 * Return: pointer to the new object or %NULL in case of error
3545 */ 3551 */
3546void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3552void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3547{ 3553{
@@ -3631,6 +3637,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
3631 * node, which can improve the performance for cpu bound structures. 3637 * node, which can improve the performance for cpu bound structures.
3632 * 3638 *
3633 * Fallback to other node is possible if __GFP_THISNODE is not set. 3639 * Fallback to other node is possible if __GFP_THISNODE is not set.
3640 *
3641 * Return: pointer to the new object or %NULL in case of error
3634 */ 3642 */
3635void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3643void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3636{ 3644{
@@ -3699,6 +3707,8 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
3699 * @size: how many bytes of memory are required. 3707 * @size: how many bytes of memory are required.
3700 * @flags: the type of memory to allocate (see kmalloc). 3708 * @flags: the type of memory to allocate (see kmalloc).
3701 * @caller: function caller for debug tracking of the caller 3709 * @caller: function caller for debug tracking of the caller
3710 *
3711 * Return: pointer to the allocated memory or %NULL in case of error
3702 */ 3712 */
3703static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3713static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3704 unsigned long caller) 3714 unsigned long caller)
@@ -4164,6 +4174,8 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4164 * @buffer: user buffer 4174 * @buffer: user buffer
4165 * @count: data length 4175 * @count: data length
4166 * @ppos: unused 4176 * @ppos: unused
4177 *
4178 * Return: %0 on success, negative error code otherwise.
4167 */ 4179 */
4168ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4180ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4169 size_t count, loff_t *ppos) 4181 size_t count, loff_t *ppos)
@@ -4457,6 +4469,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
4457 * The caller must guarantee that objp points to a valid object previously 4469 * The caller must guarantee that objp points to a valid object previously
4458 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4470 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4459 * must not be freed during the duration of the call. 4471 * must not be freed during the duration of the call.
4472 *
4473 * Return: size of the actual memory used by @objp in bytes
4460 */ 4474 */
4461size_t ksize(const void *objp) 4475size_t ksize(const void *objp)
4462{ 4476{
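Most of the remaining hunks in this file (and the similar ones in mm/slab_common.c further down) only add "Return:" sections to existing kernel-doc comments. For reference, a kernel-doc block with that section has roughly the following shape; the function and struct below are made up purely to show the layout.

    struct frobnicator {
        int nr_widgets;                 /* made-up example type */
    };

    /**
     * widget_count - count the widgets attached to a frobnicator
     * @frob: the frobnicator to inspect
     *
     * Purely illustrative; only the comment layout matters here.
     *
     * Return: number of widgets, or %0 if @frob has none.
     */
    static inline int widget_count(const struct frobnicator *frob)
    {
        return frob ? frob->nr_widgets : 0;
    }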
diff --git a/mm/slab.h b/mm/slab.h
index 384105318779..e5e6658eeacc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
276 gfp_t gfp, int order, 276 gfp_t gfp, int order,
277 struct kmem_cache *s) 277 struct kmem_cache *s)
278{ 278{
279 if (!memcg_kmem_enabled())
280 return 0;
281 if (is_root_cache(s)) 279 if (is_root_cache(s))
282 return 0; 280 return 0;
283 return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); 281 return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
@@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
286static __always_inline void memcg_uncharge_slab(struct page *page, int order, 284static __always_inline void memcg_uncharge_slab(struct page *page, int order,
287 struct kmem_cache *s) 285 struct kmem_cache *s)
288{ 286{
289 if (!memcg_kmem_enabled())
290 return;
291 memcg_kmem_uncharge(page, order); 287 memcg_kmem_uncharge(page, order);
292} 288}
293 289
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f9d89c1b5977..03eeb8b7b4b1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -939,6 +939,8 @@ EXPORT_SYMBOL(kmem_cache_destroy);
939 * 939 *
940 * Releases as many slabs as possible for a cache. 940 * Releases as many slabs as possible for a cache.
941 * To help debugging, a zero exit status indicates all slabs were released. 941 * To help debugging, a zero exit status indicates all slabs were released.
942 *
943 * Return: %0 if all slabs were released, non-zero otherwise
942 */ 944 */
943int kmem_cache_shrink(struct kmem_cache *cachep) 945int kmem_cache_shrink(struct kmem_cache *cachep)
944{ 946{
@@ -1425,7 +1427,7 @@ void dump_unreclaimable_slab(void)
1425#if defined(CONFIG_MEMCG) 1427#if defined(CONFIG_MEMCG)
1426void *memcg_slab_start(struct seq_file *m, loff_t *pos) 1428void *memcg_slab_start(struct seq_file *m, loff_t *pos)
1427{ 1429{
1428 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1430 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1429 1431
1430 mutex_lock(&slab_mutex); 1432 mutex_lock(&slab_mutex);
1431 return seq_list_start(&memcg->kmem_caches, *pos); 1433 return seq_list_start(&memcg->kmem_caches, *pos);
@@ -1433,7 +1435,7 @@ void *memcg_slab_start(struct seq_file *m, loff_t *pos)
1433 1435
1434void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) 1436void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
1435{ 1437{
1436 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1438 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1437 1439
1438 return seq_list_next(p, &memcg->kmem_caches, pos); 1440 return seq_list_next(p, &memcg->kmem_caches, pos);
1439} 1441}
@@ -1447,7 +1449,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
1447{ 1449{
1448 struct kmem_cache *s = list_entry(p, struct kmem_cache, 1450 struct kmem_cache *s = list_entry(p, struct kmem_cache,
1449 memcg_params.kmem_caches_node); 1451 memcg_params.kmem_caches_node);
1450 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1452 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1451 1453
1452 if (p == memcg->kmem_caches.next) 1454 if (p == memcg->kmem_caches.next)
1453 print_slabinfo_header(m); 1455 print_slabinfo_header(m);
@@ -1528,6 +1530,8 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
1528 * This function is like krealloc() except it never frees the originally 1530 * This function is like krealloc() except it never frees the originally
1529 * allocated buffer. Use this if you don't want to free the buffer immediately 1531 * allocated buffer. Use this if you don't want to free the buffer immediately
1530 * like, for example, with RCU. 1532 * like, for example, with RCU.
1533 *
1534 * Return: pointer to the allocated memory or %NULL in case of error
1531 */ 1535 */
1532void *__krealloc(const void *p, size_t new_size, gfp_t flags) 1536void *__krealloc(const void *p, size_t new_size, gfp_t flags)
1533{ 1537{
@@ -1549,6 +1553,8 @@ EXPORT_SYMBOL(__krealloc);
1549 * lesser of the new and old sizes. If @p is %NULL, krealloc() 1553 * lesser of the new and old sizes. If @p is %NULL, krealloc()
1550 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a 1554 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
1551 * %NULL pointer, the object pointed to is freed. 1555 * %NULL pointer, the object pointed to is freed.
1556 *
1557 * Return: pointer to the allocated memory or %NULL in case of error
1552 */ 1558 */
1553void *krealloc(const void *p, size_t new_size, gfp_t flags) 1559void *krealloc(const void *p, size_t new_size, gfp_t flags)
1554{ 1560{
diff --git a/mm/slub.c b/mm/slub.c
index dc777761b6b7..1b08fbcb7e61 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1093,8 +1093,7 @@ static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
1093} 1093}
1094 1094
1095static inline int alloc_consistency_checks(struct kmem_cache *s, 1095static inline int alloc_consistency_checks(struct kmem_cache *s,
1096 struct page *page, 1096 struct page *page, void *object)
1097 void *object, unsigned long addr)
1098{ 1097{
1099 if (!check_slab(s, page)) 1098 if (!check_slab(s, page))
1100 return 0; 1099 return 0;
@@ -1115,7 +1114,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
1115 void *object, unsigned long addr) 1114 void *object, unsigned long addr)
1116{ 1115{
1117 if (s->flags & SLAB_CONSISTENCY_CHECKS) { 1116 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1118 if (!alloc_consistency_checks(s, page, object, addr)) 1117 if (!alloc_consistency_checks(s, page, object))
1119 goto bad; 1118 goto bad;
1120 } 1119 }
1121 1120
@@ -2130,7 +2129,7 @@ redo:
2130 if (!lock) { 2129 if (!lock) {
2131 lock = 1; 2130 lock = 1;
2132 /* 2131 /*
2133 * Taking the spinlock removes the possiblity 2132 * Taking the spinlock removes the possibility
2134 * that acquire_slab() will see a slab page that 2133 * that acquire_slab() will see a slab page that
2135 * is frozen 2134 * is frozen
2136 */ 2135 */
@@ -2254,8 +2253,8 @@ static void unfreeze_partials(struct kmem_cache *s,
2254} 2253}
2255 2254
2256/* 2255/*
2257 * Put a page that was just frozen (in __slab_free) into a partial page 2256 * Put a page that was just frozen (in __slab_free|get_partial_node) into a
2258 * slot if available. 2257 * partial page slot if available.
2259 * 2258 *
2260 * If we did not find a slot then simply move all the partials to the 2259 * If we did not find a slot then simply move all the partials to the
2261 * per node partial list. 2260 * per node partial list.
@@ -2482,8 +2481,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2482 stat(s, ALLOC_SLAB); 2481 stat(s, ALLOC_SLAB);
2483 c->page = page; 2482 c->page = page;
2484 *pc = c; 2483 *pc = c;
2485 } else 2484 }
2486 freelist = NULL;
2487 2485
2488 return freelist; 2486 return freelist;
2489} 2487}
@@ -4264,7 +4262,7 @@ void __init kmem_cache_init(void)
4264 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, 4262 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
4265 slub_cpu_dead); 4263 slub_cpu_dead);
4266 4264
4267 pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", 4265 pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
4268 cache_line_size(), 4266 cache_line_size(),
4269 slub_min_order, slub_max_order, slub_min_objects, 4267 slub_min_order, slub_max_order, slub_min_objects,
4270 nr_cpu_ids, nr_node_ids); 4268 nr_cpu_ids, nr_node_ids);
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ea5dc6c6b19..77a0554fa5bd 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -197,7 +197,7 @@ static inline int next_present_section_nr(int section_nr)
197} 197}
198#define for_each_present_section_nr(start, section_nr) \ 198#define for_each_present_section_nr(start, section_nr) \
199 for (section_nr = next_present_section_nr(start-1); \ 199 for (section_nr = next_present_section_nr(start-1); \
200 ((section_nr >= 0) && \ 200 ((section_nr != -1) && \
201 (section_nr <= __highest_present_section_nr)); \ 201 (section_nr <= __highest_present_section_nr)); \
202 section_nr = next_present_section_nr(section_nr)) 202 section_nr = next_present_section_nr(section_nr))
203 203
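The mm/sparse.c hunk above replaces "section_nr >= 0" with "section_nr != -1". The call sites appear to pass unsigned section numbers, and for an unsigned variable ">= 0" is always true (gcc points this out with -Wtype-limits), so that half of the condition could never catch the -1 "no more sections" return. Comparing against -1 works for either signedness, because the -1 is converted to the variable's type. A tiny standalone demonstration, with invented names:

    #include <stdio.h>

    /* Toy stand-in for next_present_section_nr(): -1 means "no more". */
    static int next_nr(int prev)
    {
        return prev < 3 ? prev + 1 : -1;
    }

    int main(void)
    {
        unsigned long nr;

        /* With "nr >= 0" as the guard, the loop below would never be
         * stopped by it, since the test is vacuously true for unsigned
         * types.  "nr != -1UL" matches the sentinel exactly instead. */
        for (nr = next_nr(-1); nr != -1UL; nr = next_nr(nr))
            printf("section %lu\n", nr);    /* prints 0..3, then stops */

        return 0;
    }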
diff --git a/mm/swap.c b/mm/swap.c
index 4d7d37eb3c40..301ed4e04320 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -58,16 +58,16 @@ static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
58static void __page_cache_release(struct page *page) 58static void __page_cache_release(struct page *page)
59{ 59{
60 if (PageLRU(page)) { 60 if (PageLRU(page)) {
61 struct zone *zone = page_zone(page); 61 pg_data_t *pgdat = page_pgdat(page);
62 struct lruvec *lruvec; 62 struct lruvec *lruvec;
63 unsigned long flags; 63 unsigned long flags;
64 64
65 spin_lock_irqsave(zone_lru_lock(zone), flags); 65 spin_lock_irqsave(&pgdat->lru_lock, flags);
66 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 66 lruvec = mem_cgroup_page_lruvec(page, pgdat);
67 VM_BUG_ON_PAGE(!PageLRU(page), page); 67 VM_BUG_ON_PAGE(!PageLRU(page), page);
68 __ClearPageLRU(page); 68 __ClearPageLRU(page);
69 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 69 del_page_from_lru_list(page, lruvec, page_off_lru(page));
70 spin_unlock_irqrestore(zone_lru_lock(zone), flags); 70 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
71 } 71 }
72 __ClearPageWaiters(page); 72 __ClearPageWaiters(page);
73 mem_cgroup_uncharge(page); 73 mem_cgroup_uncharge(page);
@@ -322,12 +322,12 @@ static inline void activate_page_drain(int cpu)
322 322
323void activate_page(struct page *page) 323void activate_page(struct page *page)
324{ 324{
325 struct zone *zone = page_zone(page); 325 pg_data_t *pgdat = page_pgdat(page);
326 326
327 page = compound_head(page); 327 page = compound_head(page);
328 spin_lock_irq(zone_lru_lock(zone)); 328 spin_lock_irq(&pgdat->lru_lock);
329 __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); 329 __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
330 spin_unlock_irq(zone_lru_lock(zone)); 330 spin_unlock_irq(&pgdat->lru_lock);
331} 331}
332#endif 332#endif
333 333
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fd2f21e1c60a..85245fdec8d9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -523,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
523 * This has been extended to use the NUMA policies from the mm triggering 523 * This has been extended to use the NUMA policies from the mm triggering
524 * the readahead. 524 * the readahead.
525 * 525 *
526 * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. 526 * Caller must hold read mmap_sem if vmf->vma is not NULL.
527 */ 527 */
528struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, 528struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
529 struct vm_fault *vmf) 529 struct vm_fault *vmf)
@@ -543,6 +543,13 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
543 if (!mask) 543 if (!mask)
544 goto skip; 544 goto skip;
545 545
546 /* Test swap type to make sure the dereference is safe */
547 if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
548 struct inode *inode = si->swap_file->f_mapping->host;
549 if (inode_read_congested(inode))
550 goto skip;
551 }
552
546 do_poll = false; 553 do_poll = false;
547 /* Read a page_cluster sized and aligned cluster around offset. */ 554 /* Read a page_cluster sized and aligned cluster around offset. */
548 start_offset = offset & ~mask; 555 start_offset = offset & ~mask;
@@ -691,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf,
691 pte_unmap(orig_pte); 698 pte_unmap(orig_pte);
692} 699}
693 700
701/**
702 * swap_vma_readahead - swap in pages in hope we need them soon
703 * @entry: swap entry of this memory
704 * @gfp_mask: memory allocation flags
705 * @vmf: fault information
706 *
707 * Returns the struct page for entry and addr, after queueing swapin.
708 *
 709 * Primitive swap readahead code. We simply read in a few pages whose
710 * virtual addresses are around the fault address in the same vma.
711 *
712 * Caller must hold read mmap_sem if vmf->vma is not NULL.
713 *
714 */
694static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, 715static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
695 struct vm_fault *vmf) 716 struct vm_fault *vmf)
696{ 717{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbac1d49469d..2b8d9c3fbb47 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
98 98
99atomic_t nr_rotate_swap = ATOMIC_INIT(0); 99atomic_t nr_rotate_swap = ATOMIC_INIT(0);
100 100
101static struct swap_info_struct *swap_type_to_swap_info(int type)
102{
103 if (type >= READ_ONCE(nr_swapfiles))
104 return NULL;
105
106 smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
107 return READ_ONCE(swap_info[type]);
108}
109
101static inline unsigned char swap_count(unsigned char ent) 110static inline unsigned char swap_count(unsigned char ent)
102{ 111{
103 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ 112 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
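swap_type_to_swap_info() above pairs READ_ONCE() plus smp_rmb() on the reader side with the smp_wmb() in alloc_swap_info() (as its comment says), so that anyone who sees the raised nr_swapfiles is also guaranteed to see the initialised swap_info[type] pointer, without taking swap_lock. The same publication pattern, modelled in userspace with C11 release/acquire atomics (names and structure are illustrative, not the kernel's; build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define MAX_SLOTS 8

    static int *slots[MAX_SLOTS];       /* plays the role of swap_info[] */
    static atomic_int nr_slots;         /* plays the role of nr_swapfiles */

    /* Reader: an acquire load of the count (the analogue of READ_ONCE()
     * followed by smp_rmb()); only if 'type' is below it may slots[type]
     * be dereferenced. */
    static int *slot_lookup(int type)
    {
        if (type >= atomic_load_explicit(&nr_slots, memory_order_acquire))
            return NULL;
        return slots[type];
    }

    static void *reader(void *arg)
    {
        int *p;

        (void)arg;
        while (!(p = slot_lookup(0)))
            ;                           /* spin until slot 0 is published */
        printf("slot 0 holds %d\n", *p);
        return NULL;
    }

    int main(void)
    {
        static int payload = 42;
        pthread_t t;

        pthread_create(&t, NULL, reader, NULL);

        /* Writer: initialise the slot *before* publishing the new count;
         * the release store is the analogue of smp_wmb(). */
        slots[0] = &payload;
        atomic_store_explicit(&nr_slots, 1, memory_order_release);

        pthread_join(t, NULL);
        return 0;
    }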
@@ -1044,12 +1053,14 @@ noswap:
1044/* The only caller of this function is now suspend routine */ 1053/* The only caller of this function is now suspend routine */
1045swp_entry_t get_swap_page_of_type(int type) 1054swp_entry_t get_swap_page_of_type(int type)
1046{ 1055{
1047 struct swap_info_struct *si; 1056 struct swap_info_struct *si = swap_type_to_swap_info(type);
1048 pgoff_t offset; 1057 pgoff_t offset;
1049 1058
1050 si = swap_info[type]; 1059 if (!si)
1060 goto fail;
1061
1051 spin_lock(&si->lock); 1062 spin_lock(&si->lock);
1052 if (si && (si->flags & SWP_WRITEOK)) { 1063 if (si->flags & SWP_WRITEOK) {
1053 atomic_long_dec(&nr_swap_pages); 1064 atomic_long_dec(&nr_swap_pages);
1054 /* This is called for allocating swap entry, not cache */ 1065 /* This is called for allocating swap entry, not cache */
1055 offset = scan_swap_map(si, 1); 1066 offset = scan_swap_map(si, 1);
@@ -1060,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type)
1060 atomic_long_inc(&nr_swap_pages); 1071 atomic_long_inc(&nr_swap_pages);
1061 } 1072 }
1062 spin_unlock(&si->lock); 1073 spin_unlock(&si->lock);
1074fail:
1063 return (swp_entry_t) {0}; 1075 return (swp_entry_t) {0};
1064} 1076}
1065 1077
@@ -1071,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1071 if (!entry.val) 1083 if (!entry.val)
1072 goto out; 1084 goto out;
1073 type = swp_type(entry); 1085 type = swp_type(entry);
1074 if (type >= nr_swapfiles) 1086 p = swap_type_to_swap_info(type);
1087 if (!p)
1075 goto bad_nofile; 1088 goto bad_nofile;
1076 p = swap_info[type];
1077 if (!(p->flags & SWP_USED)) 1089 if (!(p->flags & SWP_USED))
1078 goto bad_device; 1090 goto bad_device;
1079 offset = swp_offset(entry); 1091 offset = swp_offset(entry);
@@ -1697,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1697sector_t swapdev_block(int type, pgoff_t offset) 1709sector_t swapdev_block(int type, pgoff_t offset)
1698{ 1710{
1699 struct block_device *bdev; 1711 struct block_device *bdev;
1712 struct swap_info_struct *si = swap_type_to_swap_info(type);
1700 1713
1701 if ((unsigned int)type >= nr_swapfiles) 1714 if (!si || !(si->flags & SWP_WRITEOK))
1702 return 0;
1703 if (!(swap_info[type]->flags & SWP_WRITEOK))
1704 return 0; 1715 return 0;
1705 return map_swap_entry(swp_entry(type, offset), &bdev); 1716 return map_swap_entry(swp_entry(type, offset), &bdev);
1706} 1717}
@@ -1799,44 +1810,77 @@ out_nolock:
1799} 1810}
1800 1811
1801static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 1812static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1802 unsigned long addr, unsigned long end, 1813 unsigned long addr, unsigned long end,
1803 swp_entry_t entry, struct page *page) 1814 unsigned int type, bool frontswap,
1815 unsigned long *fs_pages_to_unuse)
1804{ 1816{
1805 pte_t swp_pte = swp_entry_to_pte(entry); 1817 struct page *page;
1818 swp_entry_t entry;
1806 pte_t *pte; 1819 pte_t *pte;
1820 struct swap_info_struct *si;
1821 unsigned long offset;
1807 int ret = 0; 1822 int ret = 0;
1823 volatile unsigned char *swap_map;
1808 1824
1809 /* 1825 si = swap_info[type];
1810 * We don't actually need pte lock while scanning for swp_pte: since
1811 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
1812 * page table while we're scanning; though it could get zapped, and on
1813 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1814 * of unmatched parts which look like swp_pte, so unuse_pte must
1815 * recheck under pte lock. Scanning without pte lock lets it be
1816 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1817 */
1818 pte = pte_offset_map(pmd, addr); 1826 pte = pte_offset_map(pmd, addr);
1819 do { 1827 do {
1820 /* 1828 struct vm_fault vmf;
1821 * swapoff spends a _lot_ of time in this loop! 1829
1822 * Test inline before going to call unuse_pte. 1830 if (!is_swap_pte(*pte))
1823 */ 1831 continue;
1824 if (unlikely(pte_same_as_swp(*pte, swp_pte))) { 1832
1825 pte_unmap(pte); 1833 entry = pte_to_swp_entry(*pte);
1826 ret = unuse_pte(vma, pmd, addr, entry, page); 1834 if (swp_type(entry) != type)
1827 if (ret) 1835 continue;
1828 goto out; 1836
1829 pte = pte_offset_map(pmd, addr); 1837 offset = swp_offset(entry);
1838 if (frontswap && !frontswap_test(si, offset))
1839 continue;
1840
1841 pte_unmap(pte);
1842 swap_map = &si->swap_map[offset];
1843 vmf.vma = vma;
1844 vmf.address = addr;
1845 vmf.pmd = pmd;
1846 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
1847 if (!page) {
1848 if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
1849 goto try_next;
1850 return -ENOMEM;
1851 }
1852
1853 lock_page(page);
1854 wait_on_page_writeback(page);
1855 ret = unuse_pte(vma, pmd, addr, entry, page);
1856 if (ret < 0) {
1857 unlock_page(page);
1858 put_page(page);
1859 goto out;
1830 } 1860 }
1861
1862 try_to_free_swap(page);
1863 unlock_page(page);
1864 put_page(page);
1865
1866 if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
1867 ret = FRONTSWAP_PAGES_UNUSED;
1868 goto out;
1869 }
1870try_next:
1871 pte = pte_offset_map(pmd, addr);
1831 } while (pte++, addr += PAGE_SIZE, addr != end); 1872 } while (pte++, addr += PAGE_SIZE, addr != end);
1832 pte_unmap(pte - 1); 1873 pte_unmap(pte - 1);
1874
1875 ret = 0;
1833out: 1876out:
1834 return ret; 1877 return ret;
1835} 1878}
1836 1879
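unuse_pte_range() above now walks the page table itself, skipping PTEs whose swp_type() is not the device being swapped off and, for frontswap, testing swp_offset() against the frontswap map. As a reminder of what those accessors extract, here is a simplified standalone model of a swap entry packing an area index ("type") and a page offset into one word; the field split is illustrative and does not match the kernel's arch-dependent layout.

    #include <stdio.h>

    typedef struct { unsigned long val; } swp_entry_model;

    #define TYPE_SHIFT 26UL     /* illustrative split, not the kernel's */

    static swp_entry_model make_entry(unsigned int type, unsigned long offset)
    {
        return (swp_entry_model){ ((unsigned long)type << TYPE_SHIFT) | offset };
    }

    static unsigned int entry_type(swp_entry_model e)
    {
        return e.val >> TYPE_SHIFT;
    }

    static unsigned long entry_offset(swp_entry_model e)
    {
        return e.val & ((1UL << TYPE_SHIFT) - 1);
    }

    int main(void)
    {
        swp_entry_model e = make_entry(2, 12345);

        /* The loop above skips entries whose type does not match the
         * device being swapped off, and hands the offset to
         * frontswap_test() when only frontswap pages are wanted. */
        printf("type=%u offset=%lu\n", entry_type(e), entry_offset(e));
        return 0;
    }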
1837static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 1880static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1838 unsigned long addr, unsigned long end, 1881 unsigned long addr, unsigned long end,
1839 swp_entry_t entry, struct page *page) 1882 unsigned int type, bool frontswap,
1883 unsigned long *fs_pages_to_unuse)
1840{ 1884{
1841 pmd_t *pmd; 1885 pmd_t *pmd;
1842 unsigned long next; 1886 unsigned long next;
@@ -1848,7 +1892,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1848 next = pmd_addr_end(addr, end); 1892 next = pmd_addr_end(addr, end);
1849 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1893 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1850 continue; 1894 continue;
1851 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 1895 ret = unuse_pte_range(vma, pmd, addr, next, type,
1896 frontswap, fs_pages_to_unuse);
1852 if (ret) 1897 if (ret)
1853 return ret; 1898 return ret;
1854 } while (pmd++, addr = next, addr != end); 1899 } while (pmd++, addr = next, addr != end);
@@ -1857,7 +1902,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1857 1902
1858static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, 1903static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1859 unsigned long addr, unsigned long end, 1904 unsigned long addr, unsigned long end,
1860 swp_entry_t entry, struct page *page) 1905 unsigned int type, bool frontswap,
1906 unsigned long *fs_pages_to_unuse)
1861{ 1907{
1862 pud_t *pud; 1908 pud_t *pud;
1863 unsigned long next; 1909 unsigned long next;
@@ -1868,7 +1914,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1868 next = pud_addr_end(addr, end); 1914 next = pud_addr_end(addr, end);
1869 if (pud_none_or_clear_bad(pud)) 1915 if (pud_none_or_clear_bad(pud))
1870 continue; 1916 continue;
1871 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 1917 ret = unuse_pmd_range(vma, pud, addr, next, type,
1918 frontswap, fs_pages_to_unuse);
1872 if (ret) 1919 if (ret)
1873 return ret; 1920 return ret;
1874 } while (pud++, addr = next, addr != end); 1921 } while (pud++, addr = next, addr != end);
@@ -1877,7 +1924,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1877 1924
1878static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, 1925static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
1879 unsigned long addr, unsigned long end, 1926 unsigned long addr, unsigned long end,
1880 swp_entry_t entry, struct page *page) 1927 unsigned int type, bool frontswap,
1928 unsigned long *fs_pages_to_unuse)
1881{ 1929{
1882 p4d_t *p4d; 1930 p4d_t *p4d;
1883 unsigned long next; 1931 unsigned long next;
@@ -1888,78 +1936,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
1888 next = p4d_addr_end(addr, end); 1936 next = p4d_addr_end(addr, end);
1889 if (p4d_none_or_clear_bad(p4d)) 1937 if (p4d_none_or_clear_bad(p4d))
1890 continue; 1938 continue;
1891 ret = unuse_pud_range(vma, p4d, addr, next, entry, page); 1939 ret = unuse_pud_range(vma, p4d, addr, next, type,
1940 frontswap, fs_pages_to_unuse);
1892 if (ret) 1941 if (ret)
1893 return ret; 1942 return ret;
1894 } while (p4d++, addr = next, addr != end); 1943 } while (p4d++, addr = next, addr != end);
1895 return 0; 1944 return 0;
1896} 1945}
1897 1946
1898static int unuse_vma(struct vm_area_struct *vma, 1947static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
1899 swp_entry_t entry, struct page *page) 1948 bool frontswap, unsigned long *fs_pages_to_unuse)
1900{ 1949{
1901 pgd_t *pgd; 1950 pgd_t *pgd;
1902 unsigned long addr, end, next; 1951 unsigned long addr, end, next;
1903 int ret; 1952 int ret;
1904 1953
1905 if (page_anon_vma(page)) { 1954 addr = vma->vm_start;
1906 addr = page_address_in_vma(page, vma); 1955 end = vma->vm_end;
1907 if (addr == -EFAULT)
1908 return 0;
1909 else
1910 end = addr + PAGE_SIZE;
1911 } else {
1912 addr = vma->vm_start;
1913 end = vma->vm_end;
1914 }
1915 1956
1916 pgd = pgd_offset(vma->vm_mm, addr); 1957 pgd = pgd_offset(vma->vm_mm, addr);
1917 do { 1958 do {
1918 next = pgd_addr_end(addr, end); 1959 next = pgd_addr_end(addr, end);
1919 if (pgd_none_or_clear_bad(pgd)) 1960 if (pgd_none_or_clear_bad(pgd))
1920 continue; 1961 continue;
1921 ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); 1962 ret = unuse_p4d_range(vma, pgd, addr, next, type,
1963 frontswap, fs_pages_to_unuse);
1922 if (ret) 1964 if (ret)
1923 return ret; 1965 return ret;
1924 } while (pgd++, addr = next, addr != end); 1966 } while (pgd++, addr = next, addr != end);
1925 return 0; 1967 return 0;
1926} 1968}
1927 1969
1928static int unuse_mm(struct mm_struct *mm, 1970static int unuse_mm(struct mm_struct *mm, unsigned int type,
1929 swp_entry_t entry, struct page *page) 1971 bool frontswap, unsigned long *fs_pages_to_unuse)
1930{ 1972{
1931 struct vm_area_struct *vma; 1973 struct vm_area_struct *vma;
1932 int ret = 0; 1974 int ret = 0;
1933 1975
1934 if (!down_read_trylock(&mm->mmap_sem)) { 1976 down_read(&mm->mmap_sem);
1935 /*
1936 * Activate page so shrink_inactive_list is unlikely to unmap
1937 * its ptes while lock is dropped, so swapoff can make progress.
1938 */
1939 activate_page(page);
1940 unlock_page(page);
1941 down_read(&mm->mmap_sem);
1942 lock_page(page);
1943 }
1944 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1977 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1945 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 1978 if (vma->anon_vma) {
1946 break; 1979 ret = unuse_vma(vma, type, frontswap,
1980 fs_pages_to_unuse);
1981 if (ret)
1982 break;
1983 }
1947 cond_resched(); 1984 cond_resched();
1948 } 1985 }
1949 up_read(&mm->mmap_sem); 1986 up_read(&mm->mmap_sem);
1950 return (ret < 0)? ret: 0; 1987 return ret;
1951} 1988}
1952 1989
1953/* 1990/*
1954 * Scan swap_map (or frontswap_map if frontswap parameter is true) 1991 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1955 * from current position to next entry still in use. 1992 * from current position to next entry still in use. Return 0
1956 * Recycle to start on reaching the end, returning 0 when empty. 1993 * if there are no inuse entries after prev till end of the map.
1957 */ 1994 */
1958static unsigned int find_next_to_unuse(struct swap_info_struct *si, 1995static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1959 unsigned int prev, bool frontswap) 1996 unsigned int prev, bool frontswap)
1960{ 1997{
1961 unsigned int max = si->max; 1998 unsigned int i;
1962 unsigned int i = prev;
1963 unsigned char count; 1999 unsigned char count;
1964 2000
1965 /* 2001 /*
@@ -1968,20 +2004,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1968 * hits are okay, and sys_swapoff() has already prevented new 2004 * hits are okay, and sys_swapoff() has already prevented new
1969 * allocations from this area (while holding swap_lock). 2005 * allocations from this area (while holding swap_lock).
1970 */ 2006 */
1971 for (;;) { 2007 for (i = prev + 1; i < si->max; i++) {
1972 if (++i >= max) {
1973 if (!prev) {
1974 i = 0;
1975 break;
1976 }
1977 /*
1978 * No entries in use at top of swap_map,
1979 * loop back to start and recheck there.
1980 */
1981 max = prev + 1;
1982 prev = 0;
1983 i = 1;
1984 }
1985 count = READ_ONCE(si->swap_map[i]); 2008 count = READ_ONCE(si->swap_map[i]);
1986 if (count && swap_count(count) != SWAP_MAP_BAD) 2009 if (count && swap_count(count) != SWAP_MAP_BAD)
1987 if (!frontswap || frontswap_test(si, i)) 2010 if (!frontswap || frontswap_test(si, i))
@@ -1989,240 +2012,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1989 if ((i % LATENCY_LIMIT) == 0) 2012 if ((i % LATENCY_LIMIT) == 0)
1990 cond_resched(); 2013 cond_resched();
1991 } 2014 }
2015
2016 if (i == si->max)
2017 i = 0;
2018
1992 return i; 2019 return i;
1993} 2020}
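Note: the rewritten find_next_to_unuse() above drops the wrap-around bookkeeping entirely; it scans strictly forward from prev + 1 and reports 0 when nothing past prev is still in use, leaving any restart to the caller. A minimal userspace sketch of that scan shape (next_in_use and the plain map array are invented for illustration, not the kernel code):

    static unsigned int next_in_use(const unsigned char *map, unsigned int max,
                                    unsigned int prev)
    {
            unsigned int i;

            /* Scan strictly after prev; 0 means "nothing left in this pass". */
            for (i = prev + 1; i < max; i++)
                    if (map[i])
                            return i;
            return 0;
    }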
1994 2021
1995/* 2022/*
1996 * We completely avoid races by reading each swap page in advance, 2023 * If the boolean frontswap is true, only unuse pages_to_unuse pages;
1997 * and then search for the process using it. All the necessary
1998 * page table adjustments can then be made atomically.
1999 *
2000 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
2001 * pages_to_unuse==0 means all pages; ignored if frontswap is false 2024 * pages_to_unuse==0 means all pages; ignored if frontswap is false
2002 */ 2025 */
2026#define SWAP_UNUSE_MAX_TRIES 3
2003int try_to_unuse(unsigned int type, bool frontswap, 2027int try_to_unuse(unsigned int type, bool frontswap,
2004 unsigned long pages_to_unuse) 2028 unsigned long pages_to_unuse)
2005{ 2029{
2030 struct mm_struct *prev_mm;
2031 struct mm_struct *mm;
2032 struct list_head *p;
2033 int retval = 0;
2006 struct swap_info_struct *si = swap_info[type]; 2034 struct swap_info_struct *si = swap_info[type];
2007 struct mm_struct *start_mm;
2008 volatile unsigned char *swap_map; /* swap_map is accessed without
2009 * locking. Mark it as volatile
2010 * to prevent compiler doing
2011 * something odd.
2012 */
2013 unsigned char swcount;
2014 struct page *page; 2035 struct page *page;
2015 swp_entry_t entry; 2036 swp_entry_t entry;
2016 unsigned int i = 0; 2037 unsigned int i;
2017 int retval = 0; 2038 int retries = 0;
2018 2039
2019 /* 2040 if (!si->inuse_pages)
2020 * When searching mms for an entry, a good strategy is to 2041 return 0;
2021 * start at the first mm we freed the previous entry from
2022 * (though actually we don't notice whether we or coincidence
2023 * freed the entry). Initialize this start_mm with a hold.
2024 *
2025 * A simpler strategy would be to start at the last mm we
2026 * freed the previous entry from; but that would take less
2027 * advantage of mmlist ordering, which clusters forked mms
2028 * together, child after parent. If we race with dup_mmap(), we
2029 * prefer to resolve parent before child, lest we miss entries
2030 * duplicated after we scanned child: using last mm would invert
2031 * that.
2032 */
2033 start_mm = &init_mm;
2034 mmget(&init_mm);
2035 2042
2036 /* 2043 if (!frontswap)
2037 * Keep on scanning until all entries have gone. Usually, 2044 pages_to_unuse = 0;
2038 * one pass through swap_map is enough, but not necessarily: 2045
2039 * there are races when an instance of an entry might be missed. 2046retry:
2040 */ 2047 retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2041 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { 2048 if (retval)
2049 goto out;
2050
2051 prev_mm = &init_mm;
2052 mmget(prev_mm);
2053
2054 spin_lock(&mmlist_lock);
2055 p = &init_mm.mmlist;
2056 while ((p = p->next) != &init_mm.mmlist) {
2042 if (signal_pending(current)) { 2057 if (signal_pending(current)) {
2043 retval = -EINTR; 2058 retval = -EINTR;
2044 break; 2059 break;
2045 } 2060 }
2046 2061
2047 /* 2062 mm = list_entry(p, struct mm_struct, mmlist);
2048 * Get a page for the entry, using the existing swap 2063 if (!mmget_not_zero(mm))
2049 * cache page if there is one. Otherwise, get a clean 2064 continue;
2050 * page and read the swap into it. 2065 spin_unlock(&mmlist_lock);
2051 */ 2066 mmput(prev_mm);
2052 swap_map = &si->swap_map[i]; 2067 prev_mm = mm;
2053 entry = swp_entry(type, i); 2068 retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
2054 page = read_swap_cache_async(entry,
2055 GFP_HIGHUSER_MOVABLE, NULL, 0, false);
2056 if (!page) {
2057 /*
2058 * Either swap_duplicate() failed because entry
2059 * has been freed independently, and will not be
2060 * reused since sys_swapoff() already disabled
2061 * allocation from here, or alloc_page() failed.
2062 */
2063 swcount = *swap_map;
2064 /*
2065 * We don't hold lock here, so the swap entry could be
2066 * SWAP_MAP_BAD (when the cluster is discarding).
2067 * Instead of fail out, We can just skip the swap
2068 * entry because swapoff will wait for discarding
2069 * finish anyway.
2070 */
2071 if (!swcount || swcount == SWAP_MAP_BAD)
2072 continue;
2073 retval = -ENOMEM;
2074 break;
2075 }
2076 2069
2077 /* 2070 if (retval) {
2078 * Don't hold on to start_mm if it looks like exiting. 2071 mmput(prev_mm);
2079 */ 2072 goto out;
2080 if (atomic_read(&start_mm->mm_users) == 1) {
2081 mmput(start_mm);
2082 start_mm = &init_mm;
2083 mmget(&init_mm);
2084 } 2073 }
2085 2074
2086 /* 2075 /*
2087 * Wait for and lock page. When do_swap_page races with 2076 * Make sure that we aren't completely killing
2088 * try_to_unuse, do_swap_page can handle the fault much 2077 * interactive performance.
2089 * faster than try_to_unuse can locate the entry. This
2090 * apparently redundant "wait_on_page_locked" lets try_to_unuse
2091 * defer to do_swap_page in such a case - in some tests,
2092 * do_swap_page and try_to_unuse repeatedly compete.
2093 */
2094 wait_on_page_locked(page);
2095 wait_on_page_writeback(page);
2096 lock_page(page);
2097 wait_on_page_writeback(page);
2098
2099 /*
2100 * Remove all references to entry.
2101 */ 2078 */
2102 swcount = *swap_map; 2079 cond_resched();
2103 if (swap_count(swcount) == SWAP_MAP_SHMEM) { 2080 spin_lock(&mmlist_lock);
2104 retval = shmem_unuse(entry, page); 2081 }
2105 /* page has already been unlocked and released */ 2082 spin_unlock(&mmlist_lock);
2106 if (retval < 0)
2107 break;
2108 continue;
2109 }
2110 if (swap_count(swcount) && start_mm != &init_mm)
2111 retval = unuse_mm(start_mm, entry, page);
2112
2113 if (swap_count(*swap_map)) {
2114 int set_start_mm = (*swap_map >= swcount);
2115 struct list_head *p = &start_mm->mmlist;
2116 struct mm_struct *new_start_mm = start_mm;
2117 struct mm_struct *prev_mm = start_mm;
2118 struct mm_struct *mm;
2119
2120 mmget(new_start_mm);
2121 mmget(prev_mm);
2122 spin_lock(&mmlist_lock);
2123 while (swap_count(*swap_map) && !retval &&
2124 (p = p->next) != &start_mm->mmlist) {
2125 mm = list_entry(p, struct mm_struct, mmlist);
2126 if (!mmget_not_zero(mm))
2127 continue;
2128 spin_unlock(&mmlist_lock);
2129 mmput(prev_mm);
2130 prev_mm = mm;
2131 2083
2132 cond_resched(); 2084 mmput(prev_mm);
2133 2085
2134 swcount = *swap_map; 2086 i = 0;
2135 if (!swap_count(swcount)) /* any usage ? */ 2087 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
2136 ;
2137 else if (mm == &init_mm)
2138 set_start_mm = 1;
2139 else
2140 retval = unuse_mm(mm, entry, page);
2141
2142 if (set_start_mm && *swap_map < swcount) {
2143 mmput(new_start_mm);
2144 mmget(mm);
2145 new_start_mm = mm;
2146 set_start_mm = 0;
2147 }
2148 spin_lock(&mmlist_lock);
2149 }
2150 spin_unlock(&mmlist_lock);
2151 mmput(prev_mm);
2152 mmput(start_mm);
2153 start_mm = new_start_mm;
2154 }
2155 if (retval) {
2156 unlock_page(page);
2157 put_page(page);
2158 break;
2159 }
2160 2088
2161 /* 2089 entry = swp_entry(type, i);
2162 * If a reference remains (rare), we would like to leave 2090 page = find_get_page(swap_address_space(entry), i);
2163 * the page in the swap cache; but try_to_unmap could 2091 if (!page)
2164 * then re-duplicate the entry once we drop page lock, 2092 continue;
2165 * so we might loop indefinitely; also, that page could
2166 * not be swapped out to other storage meanwhile. So:
2167 * delete from cache even if there's another reference,
2168 * after ensuring that the data has been saved to disk -
2169 * since if the reference remains (rarer), it will be
2170 * read from disk into another page. Splitting into two
2171 * pages would be incorrect if swap supported "shared
2172 * private" pages, but they are handled by tmpfs files.
2173 *
2174 * Given how unuse_vma() targets one particular offset
2175 * in an anon_vma, once the anon_vma has been determined,
2176 * this splitting happens to be just what is needed to
2177 * handle where KSM pages have been swapped out: re-reading
2178 * is unnecessarily slow, but we can fix that later on.
2179 */
2180 if (swap_count(*swap_map) &&
2181 PageDirty(page) && PageSwapCache(page)) {
2182 struct writeback_control wbc = {
2183 .sync_mode = WB_SYNC_NONE,
2184 };
2185
2186 swap_writepage(compound_head(page), &wbc);
2187 lock_page(page);
2188 wait_on_page_writeback(page);
2189 }
2190 2093
2191 /* 2094 /*
2192 * It is conceivable that a racing task removed this page from 2095 * It is conceivable that a racing task removed this page from
2193 * swap cache just before we acquired the page lock at the top, 2096 * swap cache just before we acquired the page lock. The page
2194 * or while we dropped it in unuse_mm(). The page might even 2097 * might even be back in swap cache on another swap area. But
2195 * be back in swap cache on another swap area: that we must not 2098 * that is okay, try_to_free_swap() only removes stale pages.
2196 * delete, since it may not have been written out to swap yet.
2197 */
2198 if (PageSwapCache(page) &&
2199 likely(page_private(page) == entry.val) &&
2200 (!PageTransCompound(page) ||
2201 !swap_page_trans_huge_swapped(si, entry)))
2202 delete_from_swap_cache(compound_head(page));
2203
2204 /*
2205 * So we could skip searching mms once swap count went
2206 * to 1, we did not mark any present ptes as dirty: must
2207 * mark page dirty so shrink_page_list will preserve it.
2208 */ 2099 */
2209 SetPageDirty(page); 2100 lock_page(page);
2101 wait_on_page_writeback(page);
2102 try_to_free_swap(page);
2210 unlock_page(page); 2103 unlock_page(page);
2211 put_page(page); 2104 put_page(page);
2212 2105
2213 /* 2106 /*
2214 * Make sure that we aren't completely killing 2107 * For frontswap, we just need to unuse pages_to_unuse, if
2215 * interactive performance. 2108 * it was specified. Need not check frontswap again here as
2109 * we already zeroed out pages_to_unuse if not frontswap.
2216 */ 2110 */
2217 cond_resched(); 2111 if (pages_to_unuse && --pages_to_unuse == 0)
2218 if (frontswap && pages_to_unuse > 0) { 2112 goto out;
2219 if (!--pages_to_unuse)
2220 break;
2221 }
2222 } 2113 }
2223 2114
2224 mmput(start_mm); 2115 /*
2225 return retval; 2116 * Let's check again to see if there are still swap entries in the map.
 2117 * If yes, we need to retry the unuse logic again.
2118 * Under global memory pressure, swap entries can be reinserted back
2119 * into process space after the mmlist loop above passes over them.
 2120 * It's not worth continuously retrying to unuse the swap in this case.
2121 * So we try SWAP_UNUSE_MAX_TRIES times.
2122 */
2123 if (++retries >= SWAP_UNUSE_MAX_TRIES)
2124 retval = -EBUSY;
2125 else if (si->inuse_pages)
2126 goto retry;
2127
2128out:
2129 return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
2226} 2130}
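Note: the new try_to_unuse() above is essentially three passes (shmem, every mm on the mm list, then leftover swap-cache pages) wrapped in a bounded retry, because memory pressure can repopulate entries between passes. A compressed, hypothetical sketch of that control flow; all names and stub bodies below are invented, not the kernel functions:

    #include <errno.h>

    struct area { unsigned long inuse_pages; };

    /* Stubs standing in for the three real passes. */
    static int drain_shmem(struct area *a)        { (void)a; return 0; }
    static int drain_process_mms(struct area *a)  { a->inuse_pages /= 2; return 0; }
    static void drain_swap_cache(struct area *a)  { if (a->inuse_pages) a->inuse_pages--; }

    #define UNUSE_MAX_TRIES 3

    static int drain_area(struct area *a)
    {
            int retries = 0, err;

            while (a->inuse_pages) {
                    if ((err = drain_shmem(a)) || (err = drain_process_mms(a)))
                            return err;
                    drain_swap_cache(a);
                    /* Entries that keep coming back are not worth chasing forever. */
                    if (a->inuse_pages && ++retries >= UNUSE_MAX_TRIES)
                            return -EBUSY;
            }
            return 0;
    }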
2227 2131
2228/* 2132/*
@@ -2258,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
2258 struct swap_extent *se; 2162 struct swap_extent *se;
2259 pgoff_t offset; 2163 pgoff_t offset;
2260 2164
2261 sis = swap_info[swp_type(entry)]; 2165 sis = swp_swap_info(entry);
2262 *bdev = sis->bdev; 2166 *bdev = sis->bdev;
2263 2167
2264 offset = swp_offset(entry); 2168 offset = swp_offset(entry);
@@ -2700,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
2700 if (!l) 2604 if (!l)
2701 return SEQ_START_TOKEN; 2605 return SEQ_START_TOKEN;
2702 2606
2703 for (type = 0; type < nr_swapfiles; type++) { 2607 for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
2704 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
2705 si = swap_info[type];
2706 if (!(si->flags & SWP_USED) || !si->swap_map) 2608 if (!(si->flags & SWP_USED) || !si->swap_map)
2707 continue; 2609 continue;
2708 if (!--l) 2610 if (!--l)
@@ -2722,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2722 else 2624 else
2723 type = si->type + 1; 2625 type = si->type + 1;
2724 2626
2725 for (; type < nr_swapfiles; type++) { 2627 for (; (si = swap_type_to_swap_info(type)); type++) {
2726 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
2727 si = swap_info[type];
2728 if (!(si->flags & SWP_USED) || !si->swap_map) 2628 if (!(si->flags & SWP_USED) || !si->swap_map)
2729 continue; 2629 continue;
2730 ++*pos; 2630 ++*pos;
@@ -2813,9 +2713,8 @@ static struct swap_info_struct *alloc_swap_info(void)
2813 struct swap_info_struct *p; 2713 struct swap_info_struct *p;
2814 unsigned int type; 2714 unsigned int type;
2815 int i; 2715 int i;
2816 int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
2817 2716
2818 p = kvzalloc(size, GFP_KERNEL); 2717 p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
2819 if (!p) 2718 if (!p)
2820 return ERR_PTR(-ENOMEM); 2719 return ERR_PTR(-ENOMEM);
2821 2720
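Note: alloc_swap_info() now sizes its allocation with struct_size(), the <linux/overflow.h> helper for a struct ending in a flexible array member; unlike the open-coded sizeof arithmetic it also saturates on multiplication overflow. A plain-C illustration of the non-overflow-checked equivalent (struct info and alloc_info() are made-up names):

    #include <stdlib.h>
    #include <string.h>

    struct node_entry { int nid; };

    struct info {
            int type;
            struct node_entry avail[];   /* flexible array member */
    };

    static struct info *alloc_info(size_t nr_nodes)
    {
            /* Roughly what struct_size(p, avail, nr_nodes) evaluates to. */
            size_t sz = sizeof(struct info) + nr_nodes * sizeof(struct node_entry);
            struct info *p = malloc(sz);

            if (p)
                    memset(p, 0, sz);    /* kvzalloc() also zeroes */
            return p;
    }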
@@ -2831,14 +2730,14 @@ static struct swap_info_struct *alloc_swap_info(void)
2831 } 2730 }
2832 if (type >= nr_swapfiles) { 2731 if (type >= nr_swapfiles) {
2833 p->type = type; 2732 p->type = type;
2834 swap_info[type] = p; 2733 WRITE_ONCE(swap_info[type], p);
2835 /* 2734 /*
2836 * Write swap_info[type] before nr_swapfiles, in case a 2735 * Write swap_info[type] before nr_swapfiles, in case a
2837 * racing procfs swap_start() or swap_next() is reading them. 2736 * racing procfs swap_start() or swap_next() is reading them.
2838 * (We never shrink nr_swapfiles, we never free this entry.) 2737 * (We never shrink nr_swapfiles, we never free this entry.)
2839 */ 2738 */
2840 smp_wmb(); 2739 smp_wmb();
2841 nr_swapfiles++; 2740 WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
2842 } else { 2741 } else {
2843 kvfree(p); 2742 kvfree(p);
2844 p = swap_info[type]; 2743 p = swap_info[type];
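Note: the write side above pairs a WRITE_ONCE() of the swap_info[] slot, an smp_wmb(), and a WRITE_ONCE() bump of nr_swapfiles, so a lockless reader that observes the new count also observes the pointer. A rough C11 analogue of that publish/lookup pairing, as a standalone sketch rather than the kernel primitives (it also assumes types are only ever appended, as nr_swapfiles never shrinks):

    #include <stdatomic.h>
    #include <stddef.h>

    #define MAX_TYPES 32

    struct info { int type; };

    static _Atomic(struct info *) table[MAX_TYPES];
    static atomic_uint nr_entries;

    static void publish(unsigned int type, struct info *p)
    {
            atomic_store_explicit(&table[type], p, memory_order_relaxed);
            /* Release ordering plays the role of smp_wmb(): slot before count. */
            atomic_store_explicit(&nr_entries, type + 1, memory_order_release);
    }

    static struct info *lookup(unsigned int type)
    {
            /* Never dereference a slot past the published count. */
            if (type >= atomic_load_explicit(&nr_entries, memory_order_acquire))
                    return NULL;
            return atomic_load_explicit(&table[type], memory_order_relaxed);
    }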
@@ -3358,7 +3257,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3358{ 3257{
3359 struct swap_info_struct *p; 3258 struct swap_info_struct *p;
3360 struct swap_cluster_info *ci; 3259 struct swap_cluster_info *ci;
3361 unsigned long offset, type; 3260 unsigned long offset;
3362 unsigned char count; 3261 unsigned char count;
3363 unsigned char has_cache; 3262 unsigned char has_cache;
3364 int err = -EINVAL; 3263 int err = -EINVAL;
@@ -3366,10 +3265,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3366 if (non_swap_entry(entry)) 3265 if (non_swap_entry(entry))
3367 goto out; 3266 goto out;
3368 3267
3369 type = swp_type(entry); 3268 p = swp_swap_info(entry);
3370 if (type >= nr_swapfiles) 3269 if (!p)
3371 goto bad_file; 3270 goto bad_file;
3372 p = swap_info[type]; 3271
3373 offset = swp_offset(entry); 3272 offset = swp_offset(entry);
3374 if (unlikely(offset >= p->max)) 3273 if (unlikely(offset >= p->max))
3375 goto out; 3274 goto out;
@@ -3466,7 +3365,7 @@ int swapcache_prepare(swp_entry_t entry)
3466 3365
3467struct swap_info_struct *swp_swap_info(swp_entry_t entry) 3366struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3468{ 3367{
3469 return swap_info[swp_type(entry)]; 3368 return swap_type_to_swap_info(swp_type(entry));
3470} 3369}
3471 3370
3472struct swap_info_struct *page_swap_info(struct page *page) 3371struct swap_info_struct *page_swap_info(struct page *page)
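Note: swp_swap_info() and the __swap_duplicate() change above rely on a swp_entry_t packing the swap type and page offset into one word, so a NULL return from swap_type_to_swap_info() is enough to reject a bogus type. The real bit layout is arch- and config-specific; a purely hypothetical 64-bit packing, just to show the idea:

    #include <stdint.h>

    #define TYPE_SHIFT   58ULL
    #define OFFSET_MASK  ((1ULL << TYPE_SHIFT) - 1)

    static inline uint64_t make_entry(unsigned int type, uint64_t offset)
    {
            return ((uint64_t)type << TYPE_SHIFT) | (offset & OFFSET_MASK);
    }

    static inline unsigned int entry_type(uint64_t e)  { return (unsigned int)(e >> TYPE_SHIFT); }

    static inline uint64_t entry_offset(uint64_t e)    { return e & OFFSET_MASK; }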
diff --git a/mm/truncate.c b/mm/truncate.c
index 798e7ccfb030..b7d3c99f00c9 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -539,6 +539,8 @@ EXPORT_SYMBOL(truncate_inode_pages_final);
539 * invalidate_mapping_pages() will not block on IO activity. It will not 539 * invalidate_mapping_pages() will not block on IO activity. It will not
540 * invalidate pages which are dirty, locked, under writeback or mapped into 540 * invalidate pages which are dirty, locked, under writeback or mapped into
541 * pagetables. 541 * pagetables.
542 *
 543 * Return: the number of pages that were invalidated
542 */ 544 */
543unsigned long invalidate_mapping_pages(struct address_space *mapping, 545unsigned long invalidate_mapping_pages(struct address_space *mapping,
544 pgoff_t start, pgoff_t end) 546 pgoff_t start, pgoff_t end)
@@ -664,7 +666,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
664 * Any pages which are found to be mapped into pagetables are unmapped prior to 666 * Any pages which are found to be mapped into pagetables are unmapped prior to
665 * invalidation. 667 * invalidation.
666 * 668 *
667 * Returns -EBUSY if any pages could not be invalidated. 669 * Return: -EBUSY if any pages could not be invalidated.
668 */ 670 */
669int invalidate_inode_pages2_range(struct address_space *mapping, 671int invalidate_inode_pages2_range(struct address_space *mapping,
670 pgoff_t start, pgoff_t end) 672 pgoff_t start, pgoff_t end)
@@ -761,7 +763,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
761 * Any pages which are found to be mapped into pagetables are unmapped prior to 763 * Any pages which are found to be mapped into pagetables are unmapped prior to
762 * invalidation. 764 * invalidation.
763 * 765 *
764 * Returns -EBUSY if any pages could not be invalidated. 766 * Return: -EBUSY if any pages could not be invalidated.
765 */ 767 */
766int invalidate_inode_pages2(struct address_space *mapping) 768int invalidate_inode_pages2(struct address_space *mapping)
767{ 769{
diff --git a/mm/util.c b/mm/util.c
index 379319b1bcfd..d559bde497a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -36,6 +36,8 @@ EXPORT_SYMBOL(kfree_const);
36 * kstrdup - allocate space for and copy an existing string 36 * kstrdup - allocate space for and copy an existing string
37 * @s: the string to duplicate 37 * @s: the string to duplicate
38 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 38 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
39 *
40 * Return: newly allocated copy of @s or %NULL in case of error
39 */ 41 */
40char *kstrdup(const char *s, gfp_t gfp) 42char *kstrdup(const char *s, gfp_t gfp)
41{ 43{
@@ -58,9 +60,10 @@ EXPORT_SYMBOL(kstrdup);
58 * @s: the string to duplicate 60 * @s: the string to duplicate
59 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 61 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
60 * 62 *
61 * Function returns source string if it is in .rodata section otherwise it 63 * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
62 * fallbacks to kstrdup. 64 *
63 * Strings allocated by kstrdup_const should be freed by kfree_const. 65 * Return: source string if it is in .rodata section otherwise
66 * fallback to kstrdup.
64 */ 67 */
65const char *kstrdup_const(const char *s, gfp_t gfp) 68const char *kstrdup_const(const char *s, gfp_t gfp)
66{ 69{
@@ -78,6 +81,8 @@ EXPORT_SYMBOL(kstrdup_const);
78 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 81 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
79 * 82 *
80 * Note: Use kmemdup_nul() instead if the size is known exactly. 83 * Note: Use kmemdup_nul() instead if the size is known exactly.
84 *
85 * Return: newly allocated copy of @s or %NULL in case of error
81 */ 86 */
82char *kstrndup(const char *s, size_t max, gfp_t gfp) 87char *kstrndup(const char *s, size_t max, gfp_t gfp)
83{ 88{
@@ -103,6 +108,8 @@ EXPORT_SYMBOL(kstrndup);
103 * @src: memory region to duplicate 108 * @src: memory region to duplicate
104 * @len: memory region length 109 * @len: memory region length
105 * @gfp: GFP mask to use 110 * @gfp: GFP mask to use
111 *
112 * Return: newly allocated copy of @src or %NULL in case of error
106 */ 113 */
107void *kmemdup(const void *src, size_t len, gfp_t gfp) 114void *kmemdup(const void *src, size_t len, gfp_t gfp)
108{ 115{
@@ -120,6 +127,9 @@ EXPORT_SYMBOL(kmemdup);
120 * @s: The data to stringify 127 * @s: The data to stringify
121 * @len: The size of the data 128 * @len: The size of the data
122 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 129 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
130 *
131 * Return: newly allocated copy of @s with NUL-termination or %NULL in
132 * case of error
123 */ 133 */
124char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) 134char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
125{ 135{
@@ -143,7 +153,7 @@ EXPORT_SYMBOL(kmemdup_nul);
143 * @src: source address in user space 153 * @src: source address in user space
144 * @len: number of bytes to copy 154 * @len: number of bytes to copy
145 * 155 *
146 * Returns an ERR_PTR() on failure. Result is physically 156 * Return: an ERR_PTR() on failure. Result is physically
147 * contiguous, to be freed by kfree(). 157 * contiguous, to be freed by kfree().
148 */ 158 */
149void *memdup_user(const void __user *src, size_t len) 159void *memdup_user(const void __user *src, size_t len)
@@ -169,7 +179,7 @@ EXPORT_SYMBOL(memdup_user);
169 * @src: source address in user space 179 * @src: source address in user space
170 * @len: number of bytes to copy 180 * @len: number of bytes to copy
171 * 181 *
172 * Returns an ERR_PTR() on failure. Result may be not 182 * Return: an ERR_PTR() on failure. Result may be not
173 * physically contiguous. Use kvfree() to free. 183 * physically contiguous. Use kvfree() to free.
174 */ 184 */
175void *vmemdup_user(const void __user *src, size_t len) 185void *vmemdup_user(const void __user *src, size_t len)
@@ -193,6 +203,8 @@ EXPORT_SYMBOL(vmemdup_user);
193 * strndup_user - duplicate an existing string from user space 203 * strndup_user - duplicate an existing string from user space
194 * @s: The string to duplicate 204 * @s: The string to duplicate
195 * @n: Maximum number of bytes to copy, including the trailing NUL. 205 * @n: Maximum number of bytes to copy, including the trailing NUL.
206 *
207 * Return: newly allocated copy of @s or %NULL in case of error
196 */ 208 */
197char *strndup_user(const char __user *s, long n) 209char *strndup_user(const char __user *s, long n)
198{ 210{
@@ -224,7 +236,7 @@ EXPORT_SYMBOL(strndup_user);
224 * @src: source address in user space 236 * @src: source address in user space
225 * @len: number of bytes to copy 237 * @len: number of bytes to copy
226 * 238 *
227 * Returns an ERR_PTR() on failure. 239 * Return: an ERR_PTR() on failure.
228 */ 240 */
229void *memdup_user_nul(const void __user *src, size_t len) 241void *memdup_user_nul(const void __user *src, size_t len)
230{ 242{
@@ -310,10 +322,6 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
310 * @pages: array that receives pointers to the pages pinned. 322 * @pages: array that receives pointers to the pages pinned.
311 * Should be at least nr_pages long. 323 * Should be at least nr_pages long.
312 * 324 *
313 * Returns number of pages pinned. This may be fewer than the number
314 * requested. If nr_pages is 0 or negative, returns 0. If no pages
315 * were pinned, returns -errno.
316 *
317 * get_user_pages_fast provides equivalent functionality to get_user_pages, 325 * get_user_pages_fast provides equivalent functionality to get_user_pages,
318 * operating on current and current->mm, with force=0 and vma=NULL. However 326 * operating on current and current->mm, with force=0 and vma=NULL. However
319 * unlike get_user_pages, it must be called without mmap_sem held. 327 * unlike get_user_pages, it must be called without mmap_sem held.
@@ -325,6 +333,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
325 * pages have to be faulted in, it may turn out to be slightly slower so 333 * pages have to be faulted in, it may turn out to be slightly slower so
326 * callers need to carefully consider what to use. On many architectures, 334 * callers need to carefully consider what to use. On many architectures,
327 * get_user_pages_fast simply falls back to get_user_pages. 335 * get_user_pages_fast simply falls back to get_user_pages.
336 *
337 * Return: number of pages pinned. This may be fewer than the number
338 * requested. If nr_pages is 0 or negative, returns 0. If no pages
339 * were pinned, returns -errno.
328 */ 340 */
329int __weak get_user_pages_fast(unsigned long start, 341int __weak get_user_pages_fast(unsigned long start,
330 int nr_pages, int write, struct page **pages) 342 int nr_pages, int write, struct page **pages)
@@ -386,6 +398,8 @@ EXPORT_SYMBOL(vm_mmap);
386 * 398 *
387 * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not 399 * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
388 * fall back to vmalloc. 400 * fall back to vmalloc.
401 *
 402 * Return: pointer to the allocated memory or %NULL in case of failure
389 */ 403 */
390void *kvmalloc_node(size_t size, gfp_t flags, int node) 404void *kvmalloc_node(size_t size, gfp_t flags, int node)
391{ 405{
@@ -729,7 +743,8 @@ error:
729 * @buffer: the buffer to copy to. 743 * @buffer: the buffer to copy to.
730 * @buflen: the length of the buffer. Larger cmdline values are truncated 744 * @buflen: the length of the buffer. Larger cmdline values are truncated
731 * to this length. 745 * to this length.
732 * Returns the size of the cmdline field copied. Note that the copy does 746 *
747 * Return: the size of the cmdline field copied. Note that the copy does
733 * not guarantee an ending NULL byte. 748 * not guarantee an ending NULL byte.
734 */ 749 */
735int get_cmdline(struct task_struct *task, char *buffer, int buflen) 750int get_cmdline(struct task_struct *task, char *buffer, int buflen)
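Note: the mm/util.c hunks above all apply the same kernel-doc convention: the return value moves out of the free-form description into a dedicated "Return:" section that the documentation tooling picks up. A made-up example of the resulting shape (dup_buffer is not a kernel API):

    #include <stdlib.h>
    #include <string.h>

    /**
     * dup_buffer - duplicate a memory region
     * @src: memory region to duplicate
     * @len: region length in bytes
     *
     * Return: newly allocated copy of @src or %NULL in case of error
     */
    static void *dup_buffer(const void *src, size_t len)
    {
            void *p = malloc(len);

            if (p)
                    memcpy(p, src, len);
            return p;
    }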
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 871e41c55e23..e86ba6e74b50 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -498,7 +498,11 @@ nocache:
498 } 498 }
499 499
500found: 500found:
501 if (addr + size > vend) 501 /*
 502 * Also check the calculated address against vstart,
 503 * because it can be 0 due to a big align request.
504 */
505 if (addr + size > vend || addr < vstart)
502 goto overflow; 506 goto overflow;
503 507
504 va->va_start = addr; 508 va->va_start = addr;
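Note: the extra "addr < vstart" test above matters because a large alignment request can wrap the candidate address around zero, leaving it below the window start even though addr + size still fits under vend. Both ends of the window have to be checked, as in this standalone sketch (names are illustrative only):

    #include <stdbool.h>

    static bool fits_window(unsigned long addr, unsigned long size,
                            unsigned long vstart, unsigned long vend)
    {
            /* Reject both overshooting the end and undershooting the start. */
            return addr >= vstart && addr + size <= vend;
    }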
@@ -840,7 +844,7 @@ static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
840 * @order: how many 2^order pages should be occupied in newly allocated block 844 * @order: how many 2^order pages should be occupied in newly allocated block
841 * @gfp_mask: flags for the page level allocator 845 * @gfp_mask: flags for the page level allocator
842 * 846 *
843 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) 847 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
844 */ 848 */
845static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) 849static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
846{ 850{
@@ -1187,6 +1191,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
1187EXPORT_SYMBOL(vm_map_ram); 1191EXPORT_SYMBOL(vm_map_ram);
1188 1192
1189static struct vm_struct *vmlist __initdata; 1193static struct vm_struct *vmlist __initdata;
1194
1190/** 1195/**
1191 * vm_area_add_early - add vmap area early during boot 1196 * vm_area_add_early - add vmap area early during boot
1192 * @vm: vm_struct to add 1197 * @vm: vm_struct to add
@@ -1421,13 +1426,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1421} 1426}
1422 1427
1423/** 1428/**
1424 * get_vm_area - reserve a contiguous kernel virtual area 1429 * get_vm_area - reserve a contiguous kernel virtual area
1425 * @size: size of the area 1430 * @size: size of the area
1426 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 1431 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
1427 * 1432 *
1428 * Search an area of @size in the kernel virtual mapping area, 1433 * Search an area of @size in the kernel virtual mapping area,
1429 * and reserve it for our purposes. Returns the area descriptor 1434 * and reserve it for our purposes. Returns the area descriptor
1430 * on success or %NULL on failure. 1435 * on success or %NULL on failure.
1436 *
1437 * Return: the area descriptor on success or %NULL on failure.
1431 */ 1438 */
1432struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1439struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1433{ 1440{
@@ -1444,12 +1451,14 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1444} 1451}
1445 1452
1446/** 1453/**
1447 * find_vm_area - find a continuous kernel virtual area 1454 * find_vm_area - find a continuous kernel virtual area
1448 * @addr: base address 1455 * @addr: base address
1456 *
1457 * Search for the kernel VM area starting at @addr, and return it.
1458 * It is up to the caller to do all required locking to keep the returned
1459 * pointer valid.
1449 * 1460 *
1450 * Search for the kernel VM area starting at @addr, and return it. 1461 * Return: pointer to the found area or %NULL on failure
1451 * It is up to the caller to do all required locking to keep the returned
1452 * pointer valid.
1453 */ 1462 */
1454struct vm_struct *find_vm_area(const void *addr) 1463struct vm_struct *find_vm_area(const void *addr)
1455{ 1464{
@@ -1463,12 +1472,14 @@ struct vm_struct *find_vm_area(const void *addr)
1463} 1472}
1464 1473
1465/** 1474/**
1466 * remove_vm_area - find and remove a continuous kernel virtual area 1475 * remove_vm_area - find and remove a continuous kernel virtual area
1467 * @addr: base address 1476 * @addr: base address
1468 * 1477 *
1469 * Search for the kernel VM area starting at @addr, and remove it. 1478 * Search for the kernel VM area starting at @addr, and remove it.
1470 * This function returns the found VM area, but using it is NOT safe 1479 * This function returns the found VM area, but using it is NOT safe
1471 * on SMP machines, except for its size or flags. 1480 * on SMP machines, except for its size or flags.
1481 *
 1482 * Return: pointer to the found area or %NULL on failure
1472 */ 1483 */
1473struct vm_struct *remove_vm_area(const void *addr) 1484struct vm_struct *remove_vm_area(const void *addr)
1474{ 1485{
@@ -1505,7 +1516,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
1505 addr)) 1516 addr))
1506 return; 1517 return;
1507 1518
1508 area = find_vmap_area((unsigned long)addr)->vm; 1519 area = find_vm_area(addr);
1509 if (unlikely(!area)) { 1520 if (unlikely(!area)) {
1510 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 1521 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1511 addr); 1522 addr);
@@ -1548,11 +1559,11 @@ static inline void __vfree_deferred(const void *addr)
1548} 1559}
1549 1560
1550/** 1561/**
1551 * vfree_atomic - release memory allocated by vmalloc() 1562 * vfree_atomic - release memory allocated by vmalloc()
1552 * @addr: memory base address 1563 * @addr: memory base address
1553 * 1564 *
1554 * This one is just like vfree() but can be called in any atomic context 1565 * This one is just like vfree() but can be called in any atomic context
1555 * except NMIs. 1566 * except NMIs.
1556 */ 1567 */
1557void vfree_atomic(const void *addr) 1568void vfree_atomic(const void *addr)
1558{ 1569{
@@ -1565,21 +1576,29 @@ void vfree_atomic(const void *addr)
1565 __vfree_deferred(addr); 1576 __vfree_deferred(addr);
1566} 1577}
1567 1578
1579static void __vfree(const void *addr)
1580{
1581 if (unlikely(in_interrupt()))
1582 __vfree_deferred(addr);
1583 else
1584 __vunmap(addr, 1);
1585}
1586
1568/** 1587/**
1569 * vfree - release memory allocated by vmalloc() 1588 * vfree - release memory allocated by vmalloc()
1570 * @addr: memory base address 1589 * @addr: memory base address
1571 * 1590 *
1572 * Free the virtually continuous memory area starting at @addr, as 1591 * Free the virtually continuous memory area starting at @addr, as
1573 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 1592 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
1574 * NULL, no operation is performed. 1593 * NULL, no operation is performed.
1575 * 1594 *
1576 * Must not be called in NMI context (strictly speaking, only if we don't 1595 * Must not be called in NMI context (strictly speaking, only if we don't
1577 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 1596 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
1578 * conventions for vfree() arch-dependent would be a really bad idea) 1597 * conventions for vfree() arch-dependent would be a really bad idea)
1579 * 1598 *
1580 * May sleep if called *not* from interrupt context. 1599 * May sleep if called *not* from interrupt context.
1581 * 1600 *
1582 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) 1601 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
1583 */ 1602 */
1584void vfree(const void *addr) 1603void vfree(const void *addr)
1585{ 1604{
@@ -1591,21 +1610,19 @@ void vfree(const void *addr)
1591 1610
1592 if (!addr) 1611 if (!addr)
1593 return; 1612 return;
1594 if (unlikely(in_interrupt())) 1613
1595 __vfree_deferred(addr); 1614 __vfree(addr);
1596 else
1597 __vunmap(addr, 1);
1598} 1615}
1599EXPORT_SYMBOL(vfree); 1616EXPORT_SYMBOL(vfree);
1600 1617
1601/** 1618/**
1602 * vunmap - release virtual mapping obtained by vmap() 1619 * vunmap - release virtual mapping obtained by vmap()
1603 * @addr: memory base address 1620 * @addr: memory base address
1604 * 1621 *
1605 * Free the virtually contiguous memory area starting at @addr, 1622 * Free the virtually contiguous memory area starting at @addr,
1606 * which was created from the page array passed to vmap(). 1623 * which was created from the page array passed to vmap().
1607 * 1624 *
1608 * Must not be called in interrupt context. 1625 * Must not be called in interrupt context.
1609 */ 1626 */
1610void vunmap(const void *addr) 1627void vunmap(const void *addr)
1611{ 1628{
@@ -1617,17 +1634,19 @@ void vunmap(const void *addr)
1617EXPORT_SYMBOL(vunmap); 1634EXPORT_SYMBOL(vunmap);
1618 1635
1619/** 1636/**
1620 * vmap - map an array of pages into virtually contiguous space 1637 * vmap - map an array of pages into virtually contiguous space
1621 * @pages: array of page pointers 1638 * @pages: array of page pointers
1622 * @count: number of pages to map 1639 * @count: number of pages to map
1623 * @flags: vm_area->flags 1640 * @flags: vm_area->flags
1624 * @prot: page protection for the mapping 1641 * @prot: page protection for the mapping
1625 * 1642 *
1626 * Maps @count pages from @pages into contiguous kernel virtual 1643 * Maps @count pages from @pages into contiguous kernel virtual
1627 * space. 1644 * space.
1645 *
1646 * Return: the address of the area or %NULL on failure
1628 */ 1647 */
1629void *vmap(struct page **pages, unsigned int count, 1648void *vmap(struct page **pages, unsigned int count,
1630 unsigned long flags, pgprot_t prot) 1649 unsigned long flags, pgprot_t prot)
1631{ 1650{
1632 struct vm_struct *area; 1651 struct vm_struct *area;
1633 unsigned long size; /* In bytes */ 1652 unsigned long size; /* In bytes */
@@ -1709,25 +1728,27 @@ fail:
1709 warn_alloc(gfp_mask, NULL, 1728 warn_alloc(gfp_mask, NULL,
1710 "vmalloc: allocation failure, allocated %ld of %ld bytes", 1729 "vmalloc: allocation failure, allocated %ld of %ld bytes",
1711 (area->nr_pages*PAGE_SIZE), area->size); 1730 (area->nr_pages*PAGE_SIZE), area->size);
1712 vfree(area->addr); 1731 __vfree(area->addr);
1713 return NULL; 1732 return NULL;
1714} 1733}
1715 1734
1716/** 1735/**
1717 * __vmalloc_node_range - allocate virtually contiguous memory 1736 * __vmalloc_node_range - allocate virtually contiguous memory
1718 * @size: allocation size 1737 * @size: allocation size
1719 * @align: desired alignment 1738 * @align: desired alignment
1720 * @start: vm area range start 1739 * @start: vm area range start
1721 * @end: vm area range end 1740 * @end: vm area range end
1722 * @gfp_mask: flags for the page level allocator 1741 * @gfp_mask: flags for the page level allocator
1723 * @prot: protection mask for the allocated pages 1742 * @prot: protection mask for the allocated pages
1724 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 1743 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
1725 * @node: node to use for allocation or NUMA_NO_NODE 1744 * @node: node to use for allocation or NUMA_NO_NODE
1726 * @caller: caller's return address 1745 * @caller: caller's return address
1727 * 1746 *
1728 * Allocate enough pages to cover @size from the page level 1747 * Allocate enough pages to cover @size from the page level
1729 * allocator with @gfp_mask flags. Map them into contiguous 1748 * allocator with @gfp_mask flags. Map them into contiguous
1730 * kernel virtual space, using a pagetable protection of @prot. 1749 * kernel virtual space, using a pagetable protection of @prot.
1750 *
1751 * Return: the address of the area or %NULL on failure
1731 */ 1752 */
1732void *__vmalloc_node_range(unsigned long size, unsigned long align, 1753void *__vmalloc_node_range(unsigned long size, unsigned long align,
1733 unsigned long start, unsigned long end, gfp_t gfp_mask, 1754 unsigned long start, unsigned long end, gfp_t gfp_mask,
@@ -1768,25 +1789,35 @@ fail:
1768 return NULL; 1789 return NULL;
1769} 1790}
1770 1791
1792/*
 1793 * This is only for performance analysis of vmalloc and stress purposes.
 1794 * It is required by the vmalloc test module, therefore do not use it
 1795 * for anything else.
1796 */
1797#ifdef CONFIG_TEST_VMALLOC_MODULE
1798EXPORT_SYMBOL_GPL(__vmalloc_node_range);
1799#endif
1800
1771/** 1801/**
1772 * __vmalloc_node - allocate virtually contiguous memory 1802 * __vmalloc_node - allocate virtually contiguous memory
1773 * @size: allocation size 1803 * @size: allocation size
1774 * @align: desired alignment 1804 * @align: desired alignment
1775 * @gfp_mask: flags for the page level allocator 1805 * @gfp_mask: flags for the page level allocator
1776 * @prot: protection mask for the allocated pages 1806 * @prot: protection mask for the allocated pages
1777 * @node: node to use for allocation or NUMA_NO_NODE 1807 * @node: node to use for allocation or NUMA_NO_NODE
1778 * @caller: caller's return address 1808 * @caller: caller's return address
1779 * 1809 *
1780 * Allocate enough pages to cover @size from the page level 1810 * Allocate enough pages to cover @size from the page level
1781 * allocator with @gfp_mask flags. Map them into contiguous 1811 * allocator with @gfp_mask flags. Map them into contiguous
1782 * kernel virtual space, using a pagetable protection of @prot. 1812 * kernel virtual space, using a pagetable protection of @prot.
1783 * 1813 *
1784 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 1814 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
1785 * and __GFP_NOFAIL are not supported 1815 * and __GFP_NOFAIL are not supported
1786 * 1816 *
1787 * Any use of gfp flags outside of GFP_KERNEL should be consulted 1817 * Any use of gfp flags outside of GFP_KERNEL should be consulted
1788 * with mm people. 1818 * with mm people.
1789 * 1819 *
1820 * Return: pointer to the allocated memory or %NULL on error
1790 */ 1821 */
1791static void *__vmalloc_node(unsigned long size, unsigned long align, 1822static void *__vmalloc_node(unsigned long size, unsigned long align,
1792 gfp_t gfp_mask, pgprot_t prot, 1823 gfp_t gfp_mask, pgprot_t prot,
@@ -1818,13 +1849,16 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
1818} 1849}
1819 1850
1820/** 1851/**
1821 * vmalloc - allocate virtually contiguous memory 1852 * vmalloc - allocate virtually contiguous memory
1822 * @size: allocation size 1853 * @size: allocation size
1823 * Allocate enough pages to cover @size from the page level 1854 *
1824 * allocator and map them into contiguous kernel virtual space. 1855 * Allocate enough pages to cover @size from the page level
1856 * allocator and map them into contiguous kernel virtual space.
1857 *
1858 * For tight control over page level allocator and protection flags
1859 * use __vmalloc() instead.
1825 * 1860 *
1826 * For tight control over page level allocator and protection flags 1861 * Return: pointer to the allocated memory or %NULL on error
1827 * use __vmalloc() instead.
1828 */ 1862 */
1829void *vmalloc(unsigned long size) 1863void *vmalloc(unsigned long size)
1830{ 1864{
@@ -1834,14 +1868,17 @@ void *vmalloc(unsigned long size)
1834EXPORT_SYMBOL(vmalloc); 1868EXPORT_SYMBOL(vmalloc);
1835 1869
1836/** 1870/**
1837 * vzalloc - allocate virtually contiguous memory with zero fill 1871 * vzalloc - allocate virtually contiguous memory with zero fill
1838 * @size: allocation size 1872 * @size: allocation size
1839 * Allocate enough pages to cover @size from the page level 1873 *
1840 * allocator and map them into contiguous kernel virtual space. 1874 * Allocate enough pages to cover @size from the page level
1841 * The memory allocated is set to zero. 1875 * allocator and map them into contiguous kernel virtual space.
1842 * 1876 * The memory allocated is set to zero.
1843 * For tight control over page level allocator and protection flags 1877 *
1844 * use __vmalloc() instead. 1878 * For tight control over page level allocator and protection flags
1879 * use __vmalloc() instead.
1880 *
1881 * Return: pointer to the allocated memory or %NULL on error
1845 */ 1882 */
1846void *vzalloc(unsigned long size) 1883void *vzalloc(unsigned long size)
1847{ 1884{
@@ -1856,34 +1893,30 @@ EXPORT_SYMBOL(vzalloc);
1856 * 1893 *
1857 * The resulting memory area is zeroed so it can be mapped to userspace 1894 * The resulting memory area is zeroed so it can be mapped to userspace
1858 * without leaking data. 1895 * without leaking data.
1896 *
1897 * Return: pointer to the allocated memory or %NULL on error
1859 */ 1898 */
1860void *vmalloc_user(unsigned long size) 1899void *vmalloc_user(unsigned long size)
1861{ 1900{
1862 struct vm_struct *area; 1901 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
1863 void *ret; 1902 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
1864 1903 VM_USERMAP, NUMA_NO_NODE,
1865 ret = __vmalloc_node(size, SHMLBA, 1904 __builtin_return_address(0));
1866 GFP_KERNEL | __GFP_ZERO,
1867 PAGE_KERNEL, NUMA_NO_NODE,
1868 __builtin_return_address(0));
1869 if (ret) {
1870 area = find_vm_area(ret);
1871 area->flags |= VM_USERMAP;
1872 }
1873 return ret;
1874} 1905}
1875EXPORT_SYMBOL(vmalloc_user); 1906EXPORT_SYMBOL(vmalloc_user);
1876 1907
1877/** 1908/**
1878 * vmalloc_node - allocate memory on a specific node 1909 * vmalloc_node - allocate memory on a specific node
1879 * @size: allocation size 1910 * @size: allocation size
1880 * @node: numa node 1911 * @node: numa node
1912 *
1913 * Allocate enough pages to cover @size from the page level
1914 * allocator and map them into contiguous kernel virtual space.
1881 * 1915 *
1882 * Allocate enough pages to cover @size from the page level 1916 * For tight control over page level allocator and protection flags
1883 * allocator and map them into contiguous kernel virtual space. 1917 * use __vmalloc() instead.
1884 * 1918 *
1885 * For tight control over page level allocator and protection flags 1919 * Return: pointer to the allocated memory or %NULL on error
1886 * use __vmalloc() instead.
1887 */ 1920 */
1888void *vmalloc_node(unsigned long size, int node) 1921void *vmalloc_node(unsigned long size, int node)
1889{ 1922{
@@ -1903,6 +1936,8 @@ EXPORT_SYMBOL(vmalloc_node);
1903 * 1936 *
1904 * For tight control over page level allocator and protection flags 1937 * For tight control over page level allocator and protection flags
1905 * use __vmalloc_node() instead. 1938 * use __vmalloc_node() instead.
1939 *
1940 * Return: pointer to the allocated memory or %NULL on error
1906 */ 1941 */
1907void *vzalloc_node(unsigned long size, int node) 1942void *vzalloc_node(unsigned long size, int node)
1908{ 1943{
@@ -1912,17 +1947,18 @@ void *vzalloc_node(unsigned long size, int node)
1912EXPORT_SYMBOL(vzalloc_node); 1947EXPORT_SYMBOL(vzalloc_node);
1913 1948
1914/** 1949/**
1915 * vmalloc_exec - allocate virtually contiguous, executable memory 1950 * vmalloc_exec - allocate virtually contiguous, executable memory
1916 * @size: allocation size 1951 * @size: allocation size
1917 * 1952 *
1918 * Kernel-internal function to allocate enough pages to cover @size 1953 * Kernel-internal function to allocate enough pages to cover @size
1919 * the page level allocator and map them into contiguous and 1954 * the page level allocator and map them into contiguous and
1920 * executable kernel virtual space. 1955 * executable kernel virtual space.
1956 *
1957 * For tight control over page level allocator and protection flags
1958 * use __vmalloc() instead.
1921 * 1959 *
1922 * For tight control over page level allocator and protection flags 1960 * Return: pointer to the allocated memory or %NULL on error
1923 * use __vmalloc() instead.
1924 */ 1961 */
1925
1926void *vmalloc_exec(unsigned long size) 1962void *vmalloc_exec(unsigned long size)
1927{ 1963{
1928 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, 1964 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
@@ -1942,11 +1978,13 @@ void *vmalloc_exec(unsigned long size)
1942#endif 1978#endif
1943 1979
1944/** 1980/**
1945 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 1981 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
1946 * @size: allocation size 1982 * @size: allocation size
1947 * 1983 *
1948 * Allocate enough 32bit PA addressable pages to cover @size from the 1984 * Allocate enough 32bit PA addressable pages to cover @size from the
1949 * page level allocator and map them into contiguous kernel virtual space. 1985 * page level allocator and map them into contiguous kernel virtual space.
1986 *
1987 * Return: pointer to the allocated memory or %NULL on error
1950 */ 1988 */
1951void *vmalloc_32(unsigned long size) 1989void *vmalloc_32(unsigned long size)
1952{ 1990{
@@ -1957,23 +1995,19 @@ EXPORT_SYMBOL(vmalloc_32);
1957 1995
1958/** 1996/**
1959 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 1997 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1960 * @size: allocation size 1998 * @size: allocation size
1961 * 1999 *
1962 * The resulting memory area is 32bit addressable and zeroed so it can be 2000 * The resulting memory area is 32bit addressable and zeroed so it can be
1963 * mapped to userspace without leaking data. 2001 * mapped to userspace without leaking data.
2002 *
2003 * Return: pointer to the allocated memory or %NULL on error
1964 */ 2004 */
1965void *vmalloc_32_user(unsigned long size) 2005void *vmalloc_32_user(unsigned long size)
1966{ 2006{
1967 struct vm_struct *area; 2007 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
1968 void *ret; 2008 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1969 2009 VM_USERMAP, NUMA_NO_NODE,
1970 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 2010 __builtin_return_address(0));
1971 NUMA_NO_NODE, __builtin_return_address(0));
1972 if (ret) {
1973 area = find_vm_area(ret);
1974 area->flags |= VM_USERMAP;
1975 }
1976 return ret;
1977} 2011}
1978EXPORT_SYMBOL(vmalloc_32_user); 2012EXPORT_SYMBOL(vmalloc_32_user);
1979 2013
@@ -2059,31 +2093,29 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2059} 2093}
2060 2094
2061/** 2095/**
2062 * vread() - read vmalloc area in a safe way. 2096 * vread() - read vmalloc area in a safe way.
2063 * @buf: buffer for reading data 2097 * @buf: buffer for reading data
2064 * @addr: vm address. 2098 * @addr: vm address.
2065 * @count: number of bytes to be read. 2099 * @count: number of bytes to be read.
2066 * 2100 *
2067 * Returns # of bytes which addr and buf should be increased. 2101 * This function checks that addr is a valid vmalloc'ed area, and
2068 * (same number to @count). Returns 0 if [addr...addr+count) doesn't 2102 * copy data from that area to a given buffer. If the given memory range
2069 * includes any intersect with alive vmalloc area. 2103 * of [addr...addr+count) includes some valid address, data is copied to
2070 * 2104 * proper area of @buf. If there are memory holes, they'll be zero-filled.
2071 * This function checks that addr is a valid vmalloc'ed area, and 2105 * IOREMAP area is treated as memory hole and no copy is done.
2072 * copy data from that area to a given buffer. If the given memory range 2106 *
2073 * of [addr...addr+count) includes some valid address, data is copied to 2107 * If [addr...addr+count) doesn't include any intersection with a live
2074 * proper area of @buf. If there are memory holes, they'll be zero-filled. 2108 * vm_struct area, returns 0. @buf should be kernel's buffer.
2075 * IOREMAP area is treated as memory hole and no copy is done. 2109 *
2076 * 2110 * Note: In usual ops, vread() is never necessary because the caller
2077 * If [addr...addr+count) doesn't includes any intersects with alive 2111 * should know vmalloc() area is valid and can use memcpy().
2078 * vm_struct area, returns 0. @buf should be kernel's buffer. 2112 * This is for routines which have to access vmalloc area without
2079 * 2113 * any information, as /dev/kmem.
2080 * Note: In usual ops, vread() is never necessary because the caller 2114 *
2081 * should know vmalloc() area is valid and can use memcpy(). 2115 * Return: number of bytes for which addr and buf should be increased
2082 * This is for routines which have to access vmalloc area without 2116 * (same number as @count) or %0 if [addr...addr+count) doesn't
2083 * any information, as /dev/kmem. 2117 * include any intersection with valid vmalloc area
2084 *
2085 */ 2118 */
2086
2087long vread(char *buf, char *addr, unsigned long count) 2119long vread(char *buf, char *addr, unsigned long count)
2088{ 2120{
2089 struct vmap_area *va; 2121 struct vmap_area *va;
@@ -2140,31 +2172,29 @@ finished:
2140} 2172}
2141 2173
2142/** 2174/**
2143 * vwrite() - write vmalloc area in a safe way. 2175 * vwrite() - write vmalloc area in a safe way.
2144 * @buf: buffer for source data 2176 * @buf: buffer for source data
2145 * @addr: vm address. 2177 * @addr: vm address.
2146 * @count: number of bytes to be read. 2178 * @count: number of bytes to be read.
2147 * 2179 *
2148 * Returns # of bytes which addr and buf should be incresed. 2180 * This function checks that addr is a valid vmalloc'ed area, and
2149 * (same number to @count). 2181 * copy data from a buffer to the given addr. If specified range of
2150 * If [addr...addr+count) doesn't includes any intersect with valid 2182 * [addr...addr+count) includes some valid address, data is copied from
2151 * vmalloc area, returns 0. 2183 * proper area of @buf. If there are memory holes, no copy to hole.
2152 * 2184 * IOREMAP area is treated as memory hole and no copy is done.
2153 * This function checks that addr is a valid vmalloc'ed area, and 2185 *
2154 * copy data from a buffer to the given addr. If specified range of 2186 * If [addr...addr+count) doesn't include any intersection with a live
2155 * [addr...addr+count) includes some valid address, data is copied from 2187 * vm_struct area, returns 0. @buf should be kernel's buffer.
2156 * proper area of @buf. If there are memory holes, no copy to hole. 2188 *
2157 * IOREMAP area is treated as memory hole and no copy is done. 2189 * Note: In usual ops, vwrite() is never necessary because the caller
2158 * 2190 * should know vmalloc() area is valid and can use memcpy().
2159 * If [addr...addr+count) doesn't includes any intersects with alive 2191 * This is for routines which have to access vmalloc area without
2160 * vm_struct area, returns 0. @buf should be kernel's buffer. 2192 * any information, as /dev/kmem.
2161 * 2193 *
2162 * Note: In usual ops, vwrite() is never necessary because the caller 2194 * Return: number of bytes for which addr and buf should be
2163 * should know vmalloc() area is valid and can use memcpy(). 2195 * increased (same number as @count) or %0 if [addr...addr+count)
2164 * This is for routines which have to access vmalloc area without 2196 * doesn't include any intersection with valid vmalloc area
2165 * any information, as /dev/kmem.
2166 */ 2197 */
2167
2168long vwrite(char *buf, char *addr, unsigned long count) 2198long vwrite(char *buf, char *addr, unsigned long count)
2169{ 2199{
2170 struct vmap_area *va; 2200 struct vmap_area *va;
@@ -2216,20 +2246,20 @@ finished:
2216} 2246}
2217 2247
2218/** 2248/**
2219 * remap_vmalloc_range_partial - map vmalloc pages to userspace 2249 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2220 * @vma: vma to cover 2250 * @vma: vma to cover
2221 * @uaddr: target user address to start at 2251 * @uaddr: target user address to start at
2222 * @kaddr: virtual address of vmalloc kernel memory 2252 * @kaddr: virtual address of vmalloc kernel memory
2223 * @size: size of map area 2253 * @size: size of map area
2224 * 2254 *
2225 * Returns: 0 for success, -Exxx on failure 2255 * Returns: 0 for success, -Exxx on failure
2226 * 2256 *
2227 * This function checks that @kaddr is a valid vmalloc'ed area, 2257 * This function checks that @kaddr is a valid vmalloc'ed area,
2228 * and that it is big enough to cover the range starting at 2258 * and that it is big enough to cover the range starting at
2229 * @uaddr in @vma. Will return failure if that criteria isn't 2259 * @uaddr in @vma. Will return failure if that criteria isn't
2230 * met. 2260 * met.
2231 * 2261 *
2232 * Similar to remap_pfn_range() (see mm/memory.c) 2262 * Similar to remap_pfn_range() (see mm/memory.c)
2233 */ 2263 */
2234int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 2264int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2235 void *kaddr, unsigned long size) 2265 void *kaddr, unsigned long size)
@@ -2248,7 +2278,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2248 if (!(area->flags & VM_USERMAP)) 2278 if (!(area->flags & VM_USERMAP))
2249 return -EINVAL; 2279 return -EINVAL;
2250 2280
2251 if (kaddr + size > area->addr + area->size) 2281 if (kaddr + size > area->addr + get_vm_area_size(area))
2252 return -EINVAL; 2282 return -EINVAL;
2253 2283
2254 do { 2284 do {
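Note: the bounds check above switches from area->size to get_vm_area_size() because a vmalloc area's size field includes the trailing guard page, which must never be handed out to userspace. A toy model of that accounting, assuming a single guard page (struct area and the helper names are invented):

    #define GUARD_SIZE 4096UL

    struct area { unsigned long size; /* includes one trailing guard page */ };

    static unsigned long usable_size(const struct area *a)
    {
            return a->size - GUARD_SIZE;
    }

    static int range_ok(const struct area *a, unsigned long off, unsigned long len)
    {
            /* A mapping may cover the usable part only, never the guard page. */
            return off <= usable_size(a) && len <= usable_size(a) - off;
    }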
@@ -2271,18 +2301,18 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2271EXPORT_SYMBOL(remap_vmalloc_range_partial); 2301EXPORT_SYMBOL(remap_vmalloc_range_partial);
2272 2302
2273/** 2303/**
2274 * remap_vmalloc_range - map vmalloc pages to userspace 2304 * remap_vmalloc_range - map vmalloc pages to userspace
2275 * @vma: vma to cover (map full range of vma) 2305 * @vma: vma to cover (map full range of vma)
2276 * @addr: vmalloc memory 2306 * @addr: vmalloc memory
2277 * @pgoff: number of pages into addr before first page to map 2307 * @pgoff: number of pages into addr before first page to map
2278 * 2308 *
2279 * Returns: 0 for success, -Exxx on failure 2309 * Returns: 0 for success, -Exxx on failure
2280 * 2310 *
2281 * This function checks that addr is a valid vmalloc'ed area, and 2311 * This function checks that addr is a valid vmalloc'ed area, and
2282 * that it is big enough to cover the vma. Will return failure if 2312 * that it is big enough to cover the vma. Will return failure if
2283 * that criteria isn't met. 2313 * that criteria isn't met.
2284 * 2314 *
2285 * Similar to remap_pfn_range() (see mm/memory.c) 2315 * Similar to remap_pfn_range() (see mm/memory.c)
2286 */ 2316 */
2287int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2317int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2288 unsigned long pgoff) 2318 unsigned long pgoff)
@@ -2314,18 +2344,18 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
2314} 2344}
2315 2345
2316/** 2346/**
2317 * alloc_vm_area - allocate a range of kernel address space 2347 * alloc_vm_area - allocate a range of kernel address space
2318 * @size: size of the area 2348 * @size: size of the area
2319 * @ptes: returns the PTEs for the address space 2349 * @ptes: returns the PTEs for the address space
2320 * 2350 *
2321 * Returns: NULL on failure, vm_struct on success 2351 * Returns: NULL on failure, vm_struct on success
2322 * 2352 *
2323 * This function reserves a range of kernel address space, and 2353 * This function reserves a range of kernel address space, and
2324 * allocates pagetables to map that range. No actual mappings 2354 * allocates pagetables to map that range. No actual mappings
2325 * are created. 2355 * are created.
2326 * 2356 *
2327 * If @ptes is non-NULL, pointers to the PTEs (in init_mm) 2357 * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
2328 * allocated for the VM area are returned. 2358 * allocated for the VM area are returned.
2329 */ 2359 */
2330struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 2360struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
2331{ 2361{
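
A minimal, hedged usage sketch of the API documented above (not taken from this patch; error handling trimmed to the essentials):

    pte_t *ptes[1];                 /* one PTE pointer per page of the area */
    struct vm_struct *area;

    area = alloc_vm_area(PAGE_SIZE, ptes); /* reserve VA and pagetables only */
    if (area) {
            /* No mapping exists yet: the caller installs one through *ptes[0]
             * (for example a hypervisor grant mapping) before touching
             * area->addr, and tears everything down again with: */
            free_vm_area(area);
    }
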
@@ -2751,4 +2781,3 @@ static int __init proc_vmalloc_init(void)
2751module_init(proc_vmalloc_init); 2781module_init(proc_vmalloc_init);
2752 2782
2753#endif 2783#endif
2754
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e979705bbf32..a5ad0b35ab8e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -374,7 +374,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
374 */ 374 */
375int prealloc_shrinker(struct shrinker *shrinker) 375int prealloc_shrinker(struct shrinker *shrinker)
376{ 376{
377 size_t size = sizeof(*shrinker->nr_deferred); 377 unsigned int size = sizeof(*shrinker->nr_deferred);
378 378
379 if (shrinker->flags & SHRINKER_NUMA_AWARE) 379 if (shrinker->flags & SHRINKER_NUMA_AWARE)
380 size *= nr_node_ids; 380 size *= nr_node_ids;
@@ -952,7 +952,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
952 */ 952 */
953 if (reclaimed && page_is_file_cache(page) && 953 if (reclaimed && page_is_file_cache(page) &&
954 !mapping_exiting(mapping) && !dax_mapping(mapping)) 954 !mapping_exiting(mapping) && !dax_mapping(mapping))
955 shadow = workingset_eviction(mapping, page); 955 shadow = workingset_eviction(page);
956 __delete_from_page_cache(page, shadow); 956 __delete_from_page_cache(page, shadow);
957 xa_unlock_irqrestore(&mapping->i_pages, flags); 957 xa_unlock_irqrestore(&mapping->i_pages, flags);
958 958
@@ -1106,16 +1106,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1106{ 1106{
1107 LIST_HEAD(ret_pages); 1107 LIST_HEAD(ret_pages);
1108 LIST_HEAD(free_pages); 1108 LIST_HEAD(free_pages);
1109 int pgactivate = 0;
1110 unsigned nr_unqueued_dirty = 0;
1111 unsigned nr_dirty = 0;
1112 unsigned nr_congested = 0;
1113 unsigned nr_reclaimed = 0; 1109 unsigned nr_reclaimed = 0;
1114 unsigned nr_writeback = 0;
1115 unsigned nr_immediate = 0;
1116 unsigned nr_ref_keep = 0;
1117 unsigned nr_unmap_fail = 0;
1118 1110
1111 memset(stat, 0, sizeof(*stat));
1119 cond_resched(); 1112 cond_resched();
1120 1113
1121 while (!list_empty(page_list)) { 1114 while (!list_empty(page_list)) {
@@ -1159,10 +1152,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1159 */ 1152 */
1160 page_check_dirty_writeback(page, &dirty, &writeback); 1153 page_check_dirty_writeback(page, &dirty, &writeback);
1161 if (dirty || writeback) 1154 if (dirty || writeback)
1162 nr_dirty++; 1155 stat->nr_dirty++;
1163 1156
1164 if (dirty && !writeback) 1157 if (dirty && !writeback)
1165 nr_unqueued_dirty++; 1158 stat->nr_unqueued_dirty++;
1166 1159
1167 /* 1160 /*
1168 * Treat this page as congested if the underlying BDI is or if 1161 * Treat this page as congested if the underlying BDI is or if
@@ -1174,7 +1167,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1174 if (((dirty || writeback) && mapping && 1167 if (((dirty || writeback) && mapping &&
1175 inode_write_congested(mapping->host)) || 1168 inode_write_congested(mapping->host)) ||
1176 (writeback && PageReclaim(page))) 1169 (writeback && PageReclaim(page)))
1177 nr_congested++; 1170 stat->nr_congested++;
1178 1171
1179 /* 1172 /*
1180 * If a page at the tail of the LRU is under writeback, there 1173 * If a page at the tail of the LRU is under writeback, there
@@ -1223,7 +1216,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1223 if (current_is_kswapd() && 1216 if (current_is_kswapd() &&
1224 PageReclaim(page) && 1217 PageReclaim(page) &&
1225 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { 1218 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1226 nr_immediate++; 1219 stat->nr_immediate++;
1227 goto activate_locked; 1220 goto activate_locked;
1228 1221
1229 /* Case 2 above */ 1222 /* Case 2 above */
@@ -1241,7 +1234,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1241 * and it's also appropriate in global reclaim. 1234 * and it's also appropriate in global reclaim.
1242 */ 1235 */
1243 SetPageReclaim(page); 1236 SetPageReclaim(page);
1244 nr_writeback++; 1237 stat->nr_writeback++;
1245 goto activate_locked; 1238 goto activate_locked;
1246 1239
1247 /* Case 3 above */ 1240 /* Case 3 above */
@@ -1261,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1261 case PAGEREF_ACTIVATE: 1254 case PAGEREF_ACTIVATE:
1262 goto activate_locked; 1255 goto activate_locked;
1263 case PAGEREF_KEEP: 1256 case PAGEREF_KEEP:
1264 nr_ref_keep++; 1257 stat->nr_ref_keep++;
1265 goto keep_locked; 1258 goto keep_locked;
1266 case PAGEREF_RECLAIM: 1259 case PAGEREF_RECLAIM:
1267 case PAGEREF_RECLAIM_CLEAN: 1260 case PAGEREF_RECLAIM_CLEAN:
@@ -1326,7 +1319,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1326 if (unlikely(PageTransHuge(page))) 1319 if (unlikely(PageTransHuge(page)))
1327 flags |= TTU_SPLIT_HUGE_PMD; 1320 flags |= TTU_SPLIT_HUGE_PMD;
1328 if (!try_to_unmap(page, flags)) { 1321 if (!try_to_unmap(page, flags)) {
1329 nr_unmap_fail++; 1322 stat->nr_unmap_fail++;
1330 goto activate_locked; 1323 goto activate_locked;
1331 } 1324 }
1332 } 1325 }
@@ -1474,7 +1467,7 @@ activate_locked:
1474 VM_BUG_ON_PAGE(PageActive(page), page); 1467 VM_BUG_ON_PAGE(PageActive(page), page);
1475 if (!PageMlocked(page)) { 1468 if (!PageMlocked(page)) {
1476 SetPageActive(page); 1469 SetPageActive(page);
1477 pgactivate++; 1470 stat->nr_activate++;
1478 count_memcg_page_event(page, PGACTIVATE); 1471 count_memcg_page_event(page, PGACTIVATE);
1479 } 1472 }
1480keep_locked: 1473keep_locked:
@@ -1489,18 +1482,8 @@ keep:
1489 free_unref_page_list(&free_pages); 1482 free_unref_page_list(&free_pages);
1490 1483
1491 list_splice(&ret_pages, page_list); 1484 list_splice(&ret_pages, page_list);
1492 count_vm_events(PGACTIVATE, pgactivate); 1485 count_vm_events(PGACTIVATE, stat->nr_activate);
1493 1486
1494 if (stat) {
1495 stat->nr_dirty = nr_dirty;
1496 stat->nr_congested = nr_congested;
1497 stat->nr_unqueued_dirty = nr_unqueued_dirty;
1498 stat->nr_writeback = nr_writeback;
1499 stat->nr_immediate = nr_immediate;
1500 stat->nr_activate = pgactivate;
1501 stat->nr_ref_keep = nr_ref_keep;
1502 stat->nr_unmap_fail = nr_unmap_fail;
1503 }
1504 return nr_reclaimed; 1487 return nr_reclaimed;
1505} 1488}
1506 1489
@@ -1512,6 +1495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1512 .priority = DEF_PRIORITY, 1495 .priority = DEF_PRIORITY,
1513 .may_unmap = 1, 1496 .may_unmap = 1,
1514 }; 1497 };
1498 struct reclaim_stat dummy_stat;
1515 unsigned long ret; 1499 unsigned long ret;
1516 struct page *page, *next; 1500 struct page *page, *next;
1517 LIST_HEAD(clean_pages); 1501 LIST_HEAD(clean_pages);
@@ -1525,7 +1509,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1525 } 1509 }
1526 1510
1527 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, 1511 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1528 TTU_IGNORE_ACCESS, NULL, true); 1512 TTU_IGNORE_ACCESS, &dummy_stat, true);
1529 list_splice(&clean_pages, page_list); 1513 list_splice(&clean_pages, page_list);
1530 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); 1514 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1531 return ret; 1515 return ret;
@@ -1630,8 +1614,8 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1630 1614
1631} 1615}
1632 1616
1633/* 1617/**
1634 * zone_lru_lock is heavily contended. Some of the functions that 1618 * pgdat->lru_lock is heavily contended. Some of the functions that
1635 * shrink the lists perform better by taking out a batch of pages 1619 * shrink the lists perform better by taking out a batch of pages
1636 * and working on them outside the LRU lock. 1620 * and working on them outside the LRU lock.
1637 * 1621 *
@@ -1653,7 +1637,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1653static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1637static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1654 struct lruvec *lruvec, struct list_head *dst, 1638 struct lruvec *lruvec, struct list_head *dst,
1655 unsigned long *nr_scanned, struct scan_control *sc, 1639 unsigned long *nr_scanned, struct scan_control *sc,
1656 isolate_mode_t mode, enum lru_list lru) 1640 enum lru_list lru)
1657{ 1641{
1658 struct list_head *src = &lruvec->lists[lru]; 1642 struct list_head *src = &lruvec->lists[lru];
1659 unsigned long nr_taken = 0; 1643 unsigned long nr_taken = 0;
@@ -1662,6 +1646,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1662 unsigned long skipped = 0; 1646 unsigned long skipped = 0;
1663 unsigned long scan, total_scan, nr_pages; 1647 unsigned long scan, total_scan, nr_pages;
1664 LIST_HEAD(pages_skipped); 1648 LIST_HEAD(pages_skipped);
1649 isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1665 1650
1666 scan = 0; 1651 scan = 0;
1667 for (total_scan = 0; 1652 for (total_scan = 0;
@@ -1765,11 +1750,11 @@ int isolate_lru_page(struct page *page)
1765 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); 1750 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1766 1751
1767 if (PageLRU(page)) { 1752 if (PageLRU(page)) {
1768 struct zone *zone = page_zone(page); 1753 pg_data_t *pgdat = page_pgdat(page);
1769 struct lruvec *lruvec; 1754 struct lruvec *lruvec;
1770 1755
1771 spin_lock_irq(zone_lru_lock(zone)); 1756 spin_lock_irq(&pgdat->lru_lock);
1772 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 1757 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1773 if (PageLRU(page)) { 1758 if (PageLRU(page)) {
1774 int lru = page_lru(page); 1759 int lru = page_lru(page);
1775 get_page(page); 1760 get_page(page);
@@ -1777,7 +1762,7 @@ int isolate_lru_page(struct page *page)
1777 del_page_from_lru_list(page, lruvec, lru); 1762 del_page_from_lru_list(page, lruvec, lru);
1778 ret = 0; 1763 ret = 0;
1779 } 1764 }
1780 spin_unlock_irq(zone_lru_lock(zone)); 1765 spin_unlock_irq(&pgdat->lru_lock);
1781 } 1766 }
1782 return ret; 1767 return ret;
1783} 1768}
@@ -1899,8 +1884,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1899 unsigned long nr_scanned; 1884 unsigned long nr_scanned;
1900 unsigned long nr_reclaimed = 0; 1885 unsigned long nr_reclaimed = 0;
1901 unsigned long nr_taken; 1886 unsigned long nr_taken;
1902 struct reclaim_stat stat = {}; 1887 struct reclaim_stat stat;
1903 isolate_mode_t isolate_mode = 0;
1904 int file = is_file_lru(lru); 1888 int file = is_file_lru(lru);
1905 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1889 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1906 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1890 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
@@ -1921,13 +1905,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1921 1905
1922 lru_add_drain(); 1906 lru_add_drain();
1923 1907
1924 if (!sc->may_unmap)
1925 isolate_mode |= ISOLATE_UNMAPPED;
1926
1927 spin_lock_irq(&pgdat->lru_lock); 1908 spin_lock_irq(&pgdat->lru_lock);
1928 1909
1929 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, 1910 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1930 &nr_scanned, sc, isolate_mode, lru); 1911 &nr_scanned, sc, lru);
1931 1912
1932 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 1913 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1933 reclaim_stat->recent_scanned[file] += nr_taken; 1914 reclaim_stat->recent_scanned[file] += nr_taken;
@@ -2009,9 +1990,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
2009 * processes, from rmap. 1990 * processes, from rmap.
2010 * 1991 *
2011 * If the pages are mostly unmapped, the processing is fast and it is 1992 * If the pages are mostly unmapped, the processing is fast and it is
2012 * appropriate to hold zone_lru_lock across the whole operation. But if 1993 * appropriate to hold pgdat->lru_lock across the whole operation. But if
2013 * the pages are mapped, the processing is slow (page_referenced()) so we 1994 * the pages are mapped, the processing is slow (page_referenced()) so we
2014 * should drop zone_lru_lock around each page. It's impossible to balance 1995 * should drop pgdat->lru_lock around each page. It's impossible to balance
2015 * this, so instead we remove the pages from the LRU while processing them. 1996 * this, so instead we remove the pages from the LRU while processing them.
2016 * It is safe to rely on PG_active against the non-LRU pages in here because 1997 * It is safe to rely on PG_active against the non-LRU pages in here because
2017 * nobody will play with that bit on a non-LRU page. 1998 * nobody will play with that bit on a non-LRU page.
@@ -2084,19 +2065,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
2084 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 2065 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2085 unsigned nr_deactivate, nr_activate; 2066 unsigned nr_deactivate, nr_activate;
2086 unsigned nr_rotated = 0; 2067 unsigned nr_rotated = 0;
2087 isolate_mode_t isolate_mode = 0;
2088 int file = is_file_lru(lru); 2068 int file = is_file_lru(lru);
2089 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2069 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2090 2070
2091 lru_add_drain(); 2071 lru_add_drain();
2092 2072
2093 if (!sc->may_unmap)
2094 isolate_mode |= ISOLATE_UNMAPPED;
2095
2096 spin_lock_irq(&pgdat->lru_lock); 2073 spin_lock_irq(&pgdat->lru_lock);
2097 2074
2098 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 2075 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2099 &nr_scanned, sc, isolate_mode, lru); 2076 &nr_scanned, sc, lru);
2100 2077
2101 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2078 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2102 reclaim_stat->recent_scanned[file] += nr_taken; 2079 reclaim_stat->recent_scanned[file] += nr_taken;
@@ -2754,16 +2731,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2754 sc->nr_reclaimed - reclaimed); 2731 sc->nr_reclaimed - reclaimed);
2755 2732
2756 /* 2733 /*
2757 * Direct reclaim and kswapd have to scan all memory 2734 * Kswapd has to scan all memory cgroups to fulfill
2758 * cgroups to fulfill the overall scan target for the 2735 * the overall scan target for the node.
2759 * node.
2760 * 2736 *
2761 * Limit reclaim, on the other hand, only cares about 2737 * Limit reclaim, on the other hand, only cares about
2762 * nr_to_reclaim pages to be reclaimed and it will 2738 * nr_to_reclaim pages to be reclaimed and it will
2763 * retry with decreasing priority if one round over the 2739 * retry with decreasing priority if one round over the
2764 * whole hierarchy is not sufficient. 2740 * whole hierarchy is not sufficient.
2765 */ 2741 */
2766 if (!global_reclaim(sc) && 2742 if (!current_is_kswapd() &&
2767 sc->nr_reclaimed >= sc->nr_to_reclaim) { 2743 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2768 mem_cgroup_iter_break(root, memcg); 2744 mem_cgroup_iter_break(root, memcg);
2769 break; 2745 break;
@@ -3527,7 +3503,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
3527 * 3503 *
3528 * kswapd scans the zones in the highmem->normal->dma direction. It skips 3504 * kswapd scans the zones in the highmem->normal->dma direction. It skips
3529 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 3505 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
3530 * found to have free_pages <= high_wmark_pages(zone), any page is that zone 3506 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
3531 * or lower is eligible for reclaim until at least one usable zone is 3507 * or lower is eligible for reclaim until at least one usable zone is
3532 * balanced. 3508 * balanced.
3533 */ 3509 */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 83b30edc2f7f..36b56f858f0f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2121,21 +2121,14 @@ static int __init extfrag_debug_init(void)
2121 struct dentry *extfrag_debug_root; 2121 struct dentry *extfrag_debug_root;
2122 2122
2123 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 2123 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2124 if (!extfrag_debug_root)
2125 return -ENOMEM;
2126 2124
2127 if (!debugfs_create_file("unusable_index", 0444, 2125 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
2128 extfrag_debug_root, NULL, &unusable_file_ops)) 2126 &unusable_file_ops);
2129 goto fail;
2130 2127
2131 if (!debugfs_create_file("extfrag_index", 0444, 2128 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
2132 extfrag_debug_root, NULL, &extfrag_file_ops)) 2129 &extfrag_file_ops);
2133 goto fail;
2134 2130
2135 return 0; 2131 return 0;
2136fail:
2137 debugfs_remove_recursive(extfrag_debug_root);
2138 return -ENOMEM;
2139} 2132}
2140 2133
2141module_init(extfrag_debug_init); 2134module_init(extfrag_debug_init);
diff --git a/mm/workingset.c b/mm/workingset.c
index dcb994f2acc2..0bedf67502d5 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -215,13 +215,12 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
215 215
216/** 216/**
217 * workingset_eviction - note the eviction of a page from memory 217 * workingset_eviction - note the eviction of a page from memory
218 * @mapping: address space the page was backing
219 * @page: the page being evicted 218 * @page: the page being evicted
220 * 219 *
221 * Returns a shadow entry to be stored in @mapping->i_pages in place 220 * Returns a shadow entry to be stored in @page->mapping->i_pages in place
222 * of the evicted @page so that a later refault can be detected. 221 * of the evicted @page so that a later refault can be detected.
223 */ 222 */
224void *workingset_eviction(struct address_space *mapping, struct page *page) 223void *workingset_eviction(struct page *page)
225{ 224{
226 struct pglist_data *pgdat = page_pgdat(page); 225 struct pglist_data *pgdat = page_pgdat(page);
227 struct mem_cgroup *memcg = page_memcg(page); 226 struct mem_cgroup *memcg = page_memcg(page);
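
The signature change matches the caller update in __remove_mapping() earlier in this series. A hedged, simplified sketch of how the shadow entry is meant to round-trip (not literal kernel code):

    void *shadow;

    shadow = workingset_eviction(page);         /* reclaim: encode node, memcg, age */
    __delete_from_page_cache(page, shadow);     /* shadow entry parked in i_pages   */

    /* ... later, when the same file offset is read again ... */
    workingset_refault(new_page, shadow);       /* measure the refault distance */
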
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 6ac919847ce6..f3f5a78cd062 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -158,6 +158,7 @@
158#include <linux/etherdevice.h> 158#include <linux/etherdevice.h>
159#include <linux/kthread.h> 159#include <linux/kthread.h>
160#include <linux/prefetch.h> 160#include <linux/prefetch.h>
161#include <linux/mmzone.h>
161#include <net/net_namespace.h> 162#include <net/net_namespace.h>
162#include <net/checksum.h> 163#include <net/checksum.h>
163#include <net/ipv6.h> 164#include <net/ipv6.h>
@@ -3625,7 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
3625 pkt_dev->svlan_cfi = 0; 3626 pkt_dev->svlan_cfi = 0;
3626 pkt_dev->svlan_id = 0xffff; 3627 pkt_dev->svlan_id = 0xffff;
3627 pkt_dev->burst = 1; 3628 pkt_dev->burst = 1;
3628 pkt_dev->node = -1; 3629 pkt_dev->node = NUMA_NO_NODE;
3629 3630
3630 err = pktgen_setup_dev(t->net, pkt_dev, ifname); 3631 err = pktgen_setup_dev(t->net, pkt_dev, ifname);
3631 if (err) 3632 if (err)
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 86e1e37eb4e8..b37e6e0a1026 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -15,6 +15,7 @@
15#include <linux/netlink.h> 15#include <linux/netlink.h>
16#include <linux/qrtr.h> 16#include <linux/qrtr.h>
17#include <linux/termios.h> /* For TIOCINQ/OUTQ */ 17#include <linux/termios.h> /* For TIOCINQ/OUTQ */
18#include <linux/numa.h>
18 19
19#include <net/sock.h> 20#include <net/sock.h>
20 21
@@ -101,7 +102,7 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
101 return container_of(sk, struct qrtr_sock, sk); 102 return container_of(sk, struct qrtr_sock, sk);
102} 103}
103 104
104static unsigned int qrtr_local_nid = -1; 105static unsigned int qrtr_local_nid = NUMA_NO_NODE;
105 106
106/* for node ids */ 107/* for node ids */
107static RADIX_TREE(qrtr_nodes, GFP_KERNEL); 108static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 6deabedc67fc..6410bd22fe38 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -27,14 +27,9 @@ else
27 $(call cc-param,asan-globals=1) \ 27 $(call cc-param,asan-globals=1) \
28 $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ 28 $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \
29 $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \ 29 $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \
30 $(call cc-param,asan-use-after-scope=1) \
31 $(call cc-param,asan-instrument-allocas=1) 30 $(call cc-param,asan-instrument-allocas=1)
32endif 31endif
33 32
34ifdef CONFIG_KASAN_EXTRA
35CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope)
36endif
37
38endif # CONFIG_KASAN_GENERIC 33endif # CONFIG_KASAN_GENERIC
39 34
40ifdef CONFIG_KASAN_SW_TAGS 35ifdef CONFIG_KASAN_SW_TAGS
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 98a7d63a723e..bcdd45df3f51 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -37,6 +37,13 @@ parse_symbol() {
37 symbol=${symbol#\(} 37 symbol=${symbol#\(}
38 symbol=${symbol%\)} 38 symbol=${symbol%\)}
39 39
40 # Strip segment
41 local segment
42 if [[ $symbol == *:* ]] ; then
43 segment=${symbol%%:*}:
44 symbol=${symbol#*:}
45 fi
46
40 # Strip the symbol name so that we could look it up 47 # Strip the symbol name so that we could look it up
41 local name=${symbol%+*} 48 local name=${symbol%+*}
42 49
@@ -84,7 +91,7 @@ parse_symbol() {
84 code=${code//$'\n'/' '} 91 code=${code//$'\n'/' '}
85 92
86 # Replace old address with pretty line numbers 93 # Replace old address with pretty line numbers
87 symbol="$name ($code)" 94 symbol="$segment$name ($code)"
88} 95}
89 96
90decode_code() { 97decode_code() {
diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig
index d45f7f36b859..d9fd9988ef27 100644
--- a/scripts/gcc-plugins/Kconfig
+++ b/scripts/gcc-plugins/Kconfig
@@ -68,10 +68,6 @@ config GCC_PLUGIN_LATENT_ENTROPY
68 68
69config GCC_PLUGIN_STRUCTLEAK 69config GCC_PLUGIN_STRUCTLEAK
70 bool "Force initialization of variables containing userspace addresses" 70 bool "Force initialization of variables containing userspace addresses"
71 # Currently STRUCTLEAK inserts initialization out of live scope of
72 # variables from KASAN point of view. This leads to KASAN false
73 # positive reports. Prohibit this combination for now.
74 depends on !KASAN_EXTRA
75 help 71 help
76 This plugin zero-initializes any structures containing a 72 This plugin zero-initializes any structures containing a
77 __user attribute. This can prevent some classes of information 73 __user attribute. This can prevent some classes of information
diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h
new file mode 100644
index 000000000000..110b0e5d0fb0
--- /dev/null
+++ b/tools/include/linux/numa.h
@@ -0,0 +1,16 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_NUMA_H
3#define _LINUX_NUMA_H
4
5
6#ifdef CONFIG_NODES_SHIFT
7#define NODES_SHIFT CONFIG_NODES_SHIFT
8#else
9#define NODES_SHIFT 0
10#endif
11
12#define MAX_NUMNODES (1 << NODES_SHIFT)
13
14#define NUMA_NO_NODE (-1)
15
16#endif /* _LINUX_NUMA_H */
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 44195514b19e..98ad783efc69 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -34,6 +34,7 @@
34#include <sys/types.h> 34#include <sys/types.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/time64.h> 36#include <linux/time64.h>
37#include <linux/numa.h>
37 38
38#include <numa.h> 39#include <numa.h>
39#include <numaif.h> 40#include <numaif.h>
@@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node)
298 299
299 CPU_ZERO(&mask); 300 CPU_ZERO(&mask);
300 301
301 if (target_node == -1) { 302 if (target_node == NUMA_NO_NODE) {
302 for (cpu = 0; cpu < g->p.nr_cpus; cpu++) 303 for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
303 CPU_SET(cpu, &mask); 304 CPU_SET(cpu, &mask);
304 } else { 305 } else {
@@ -339,7 +340,7 @@ static void bind_to_memnode(int node)
339 unsigned long nodemask; 340 unsigned long nodemask;
340 int ret; 341 int ret;
341 342
342 if (node == -1) 343 if (node == NUMA_NO_NODE)
343 return; 344 return;
344 345
345 BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); 346 BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8);
@@ -1363,7 +1364,7 @@ static void init_thread_data(void)
1363 int cpu; 1364 int cpu;
1364 1365
1365 /* Allow all nodes by default: */ 1366 /* Allow all nodes by default: */
1366 td->bind_node = -1; 1367 td->bind_node = NUMA_NO_NODE;
1367 1368
1368 /* Allow all CPUs by default: */ 1369 /* Allow all CPUs by default: */
1369 CPU_ZERO(&td->bind_cpumask); 1370 CPU_ZERO(&td->bind_cpumask);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 400ee81a3043..6a94f07c4164 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -48,6 +48,7 @@ TARGETS += sysctl
48ifneq (1, $(quicktest)) 48ifneq (1, $(quicktest))
49TARGETS += timers 49TARGETS += timers
50endif 50endif
51TARGETS += tmpfs
51TARGETS += user 52TARGETS += user
52TARGETS += vm 53TARGETS += vm
53TARGETS += x86 54TARGETS += x86
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 10baa1652fc2..c67d32eeb668 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -54,6 +54,22 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
54 return fd; 54 return fd;
55} 55}
56 56
57static int mfd_assert_reopen_fd(int fd_in)
58{
59 int r, fd;
60 char path[100];
61
62 sprintf(path, "/proc/self/fd/%d", fd_in);
63
64 fd = open(path, O_RDWR);
65 if (fd < 0) {
66 printf("re-open of existing fd %d failed\n", fd_in);
67 abort();
68 }
69
70 return fd;
71}
72
57static void mfd_fail_new(const char *name, unsigned int flags) 73static void mfd_fail_new(const char *name, unsigned int flags)
58{ 74{
59 int r; 75 int r;
@@ -255,6 +271,25 @@ static void mfd_assert_read(int fd)
255 munmap(p, mfd_def_size); 271 munmap(p, mfd_def_size);
256} 272}
257 273
274/* Test that PROT_READ + MAP_SHARED mappings work. */
275static void mfd_assert_read_shared(int fd)
276{
277 void *p;
278
279 /* verify PROT_READ and MAP_SHARED *is* allowed */
280 p = mmap(NULL,
281 mfd_def_size,
282 PROT_READ,
283 MAP_SHARED,
284 fd,
285 0);
286 if (p == MAP_FAILED) {
287 printf("mmap() failed: %m\n");
288 abort();
289 }
290 munmap(p, mfd_def_size);
291}
292
258static void mfd_assert_write(int fd) 293static void mfd_assert_write(int fd)
259{ 294{
260 ssize_t l; 295 ssize_t l;
@@ -693,6 +728,44 @@ static void test_seal_write(void)
693} 728}
694 729
695/* 730/*
731 * Test SEAL_FUTURE_WRITE
732 * Test whether SEAL_FUTURE_WRITE actually prevents modifications.
733 */
734static void test_seal_future_write(void)
735{
736 int fd, fd2;
737 void *p;
738
739 printf("%s SEAL-FUTURE-WRITE\n", memfd_str);
740
741 fd = mfd_assert_new("kern_memfd_seal_future_write",
742 mfd_def_size,
743 MFD_CLOEXEC | MFD_ALLOW_SEALING);
744
745 p = mfd_assert_mmap_shared(fd);
746
747 mfd_assert_has_seals(fd, 0);
748
749 mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE);
750 mfd_assert_has_seals(fd, F_SEAL_FUTURE_WRITE);
751
752 /* read should pass, writes should fail */
753 mfd_assert_read(fd);
754 mfd_assert_read_shared(fd);
755 mfd_fail_write(fd);
756
757 fd2 = mfd_assert_reopen_fd(fd);
758 /* read should pass, writes should still fail */
759 mfd_assert_read(fd2);
760 mfd_assert_read_shared(fd2);
761 mfd_fail_write(fd2);
762
763 munmap(p, mfd_def_size);
764 close(fd2);
765 close(fd);
766}
767
768/*
696 * Test SEAL_SHRINK 769 * Test SEAL_SHRINK
697 * Test whether SEAL_SHRINK actually prevents shrinking 770 * Test whether SEAL_SHRINK actually prevents shrinking
698 */ 771 */
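
A hedged userspace sketch of the sealing flow the new test exercises. memfd_create() and fcntl(F_ADD_SEALS) are the standard memfd calls, F_SEAL_FUTURE_WRITE requires headers from this series, and the snippet is illustrative rather than taken from the patch:

    #define _GNU_SOURCE
    #include <fcntl.h>          /* F_ADD_SEALS, F_SEAL_* (F_SEAL_FUTURE_WRITE needs new headers) */
    #include <sys/mman.h>       /* memfd_create(), MFD_* (glibc >= 2.27) */
    #include <unistd.h>

    static int make_sealed_memfd(void)
    {
            int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

            write(fd, "hello", 5);                           /* fine: nothing sealed yet */
            fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);     /* no new writes after this */

            /* Reads and already-established shared mappings keep working, but new
             * write() calls and new PROT_WRITE+MAP_SHARED mmaps now fail. */
            return fd;
    }
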
@@ -945,6 +1018,7 @@ int main(int argc, char **argv)
945 test_basic(); 1018 test_basic();
946 1019
947 test_seal_write(); 1020 test_seal_write();
1021 test_seal_future_write();
948 test_seal_shrink(); 1022 test_seal_shrink();
949 test_seal_grow(); 1023 test_seal_grow();
950 test_seal_resize(); 1024 test_seal_resize();
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 29bac5ef9a93..444ad39d3700 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -2,6 +2,7 @@
2/fd-002-posix-eq 2/fd-002-posix-eq
3/fd-003-kthread 3/fd-003-kthread
4/proc-loadavg-001 4/proc-loadavg-001
5/proc-pid-vm
5/proc-self-map-files-001 6/proc-self-map-files-001
6/proc-self-map-files-002 7/proc-self-map-files-002
7/proc-self-syscall 8/proc-self-syscall
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index 434d033ee067..5163dc887aa3 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -6,6 +6,7 @@ TEST_GEN_PROGS += fd-001-lookup
6TEST_GEN_PROGS += fd-002-posix-eq 6TEST_GEN_PROGS += fd-002-posix-eq
7TEST_GEN_PROGS += fd-003-kthread 7TEST_GEN_PROGS += fd-003-kthread
8TEST_GEN_PROGS += proc-loadavg-001 8TEST_GEN_PROGS += proc-loadavg-001
9TEST_GEN_PROGS += proc-pid-vm
9TEST_GEN_PROGS += proc-self-map-files-001 10TEST_GEN_PROGS += proc-self-map-files-001
10TEST_GEN_PROGS += proc-self-map-files-002 11TEST_GEN_PROGS += proc-self-map-files-002
11TEST_GEN_PROGS += proc-self-syscall 12TEST_GEN_PROGS += proc-self-syscall
diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c
index fcff7047000d..471e2aa28077 100644
--- a/tools/testing/selftests/proc/proc-loadavg-001.c
+++ b/tools/testing/selftests/proc/proc-loadavg-001.c
@@ -30,7 +30,7 @@ int main(void)
30 30
31 if (unshare(CLONE_NEWPID) == -1) { 31 if (unshare(CLONE_NEWPID) == -1) {
32 if (errno == ENOSYS || errno == EPERM) 32 if (errno == ENOSYS || errno == EPERM)
33 return 2; 33 return 4;
34 return 1; 34 return 1;
35 } 35 }
36 36
diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c
new file mode 100644
index 000000000000..bbe8150d18aa
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-pid-vm.c
@@ -0,0 +1,406 @@
1/*
2 * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16/*
17 * Fork and exec tiny 1 page executable which precisely controls its VM.
18 * Test /proc/$PID/maps
19 * Test /proc/$PID/smaps
20 * Test /proc/$PID/smaps_rollup
21 * Test /proc/$PID/statm
22 *
23 * FIXME require CONFIG_TMPFS which can be disabled
24 * FIXME test other values from "smaps"
25 * FIXME support other archs
26 */
27#undef NDEBUG
28#include <assert.h>
29#include <errno.h>
30#include <sched.h>
31#include <signal.h>
32#include <stdint.h>
33#include <stdio.h>
34#include <string.h>
35#include <stdlib.h>
36#include <sys/mount.h>
37#include <sys/types.h>
38#include <sys/stat.h>
39#include <fcntl.h>
40#include <unistd.h>
41#include <sys/syscall.h>
42#include <sys/uio.h>
43#include <linux/kdev_t.h>
44
45static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags)
46{
47 return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags);
48}
49
50static void make_private_tmp(void)
51{
52 if (unshare(CLONE_NEWNS) == -1) {
53 if (errno == ENOSYS || errno == EPERM) {
54 exit(4);
55 }
56 exit(1);
57 }
58 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
59 exit(1);
60 }
61 if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) {
62 exit(1);
63 }
64}
65
66static pid_t pid = -1;
67static void ate(void)
68{
69 if (pid > 0) {
70 kill(pid, SIGTERM);
71 }
72}
73
74struct elf64_hdr {
75 uint8_t e_ident[16];
76 uint16_t e_type;
77 uint16_t e_machine;
78 uint32_t e_version;
79 uint64_t e_entry;
80 uint64_t e_phoff;
81 uint64_t e_shoff;
82 uint32_t e_flags;
83 uint16_t e_ehsize;
84 uint16_t e_phentsize;
85 uint16_t e_phnum;
86 uint16_t e_shentsize;
87 uint16_t e_shnum;
88 uint16_t e_shstrndx;
89};
90
91struct elf64_phdr {
92 uint32_t p_type;
93 uint32_t p_flags;
94 uint64_t p_offset;
95 uint64_t p_vaddr;
96 uint64_t p_paddr;
97 uint64_t p_filesz;
98 uint64_t p_memsz;
99 uint64_t p_align;
100};
101
102#ifdef __x86_64__
103#define PAGE_SIZE 4096
104#define VADDR (1UL << 32)
105#define MAPS_OFFSET 73
106
107#define syscall 0x0f, 0x05
108#define mov_rdi(x) \
109 0x48, 0xbf, \
110 (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
111 ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
112
113#define mov_rsi(x) \
114 0x48, 0xbe, \
115 (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
116 ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
117
118#define mov_eax(x) \
119 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff
120
121static const uint8_t payload[] = {
122 /* Casually unmap stack, vDSO and everything else. */
123 /* munmap */
124 mov_rdi(VADDR + 4096),
125 mov_rsi((1ULL << 47) - 4096 - VADDR - 4096),
126 mov_eax(11),
127 syscall,
128
129 /* Ping parent. */
130 /* write(0, &c, 1); */
131 0x31, 0xff, /* xor edi, edi */
132 0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */
133 0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */
134 mov_eax(1),
135 syscall,
136
137 /* 1: pause(); */
138 mov_eax(34),
139 syscall,
140
141 0xeb, 0xf7, /* jmp 1b */
142};
143
144static int make_exe(const uint8_t *payload, size_t len)
145{
146 struct elf64_hdr h;
147 struct elf64_phdr ph;
148
149 struct iovec iov[3] = {
150 {&h, sizeof(struct elf64_hdr)},
151 {&ph, sizeof(struct elf64_phdr)},
152 {(void *)payload, len},
153 };
154 int fd, fd1;
155 char buf[64];
156
157 memset(&h, 0, sizeof(h));
158 h.e_ident[0] = 0x7f;
159 h.e_ident[1] = 'E';
160 h.e_ident[2] = 'L';
161 h.e_ident[3] = 'F';
162 h.e_ident[4] = 2;
163 h.e_ident[5] = 1;
164 h.e_ident[6] = 1;
165 h.e_ident[7] = 0;
166 h.e_type = 2;
167 h.e_machine = 0x3e;
168 h.e_version = 1;
169 h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr);
170 h.e_phoff = sizeof(struct elf64_hdr);
171 h.e_shoff = 0;
172 h.e_flags = 0;
173 h.e_ehsize = sizeof(struct elf64_hdr);
174 h.e_phentsize = sizeof(struct elf64_phdr);
175 h.e_phnum = 1;
176 h.e_shentsize = 0;
177 h.e_shnum = 0;
178 h.e_shstrndx = 0;
179
180 memset(&ph, 0, sizeof(ph));
181 ph.p_type = 1;
182 ph.p_flags = (1<<2)|1;
183 ph.p_offset = 0;
184 ph.p_vaddr = VADDR;
185 ph.p_paddr = 0;
186 ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload);
187 ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload);
188 ph.p_align = 4096;
189
190 fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
191 if (fd == -1) {
192 exit(1);
193 }
194
195 if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) {
196 exit(1);
197 }
198
199 /* Avoid ETXTBSY on exec. */
200 snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd);
201 fd1 = open(buf, O_RDONLY|O_CLOEXEC);
202 close(fd);
203
204 return fd1;
205}
206#endif
207
208#ifdef __x86_64__
209int main(void)
210{
211 int pipefd[2];
212 int exec_fd;
213
214 atexit(ate);
215
216 make_private_tmp();
217
218 /* Reserve fd 0 for 1-byte pipe ping from child. */
219 close(0);
220 if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) {
221 return 1;
222 }
223
224 exec_fd = make_exe(payload, sizeof(payload));
225
226 if (pipe(pipefd) == -1) {
227 return 1;
228 }
229 if (dup2(pipefd[1], 0) != 0) {
230 return 1;
231 }
232
233 pid = fork();
234 if (pid == -1) {
235 return 1;
236 }
237 if (pid == 0) {
238 sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH);
239 return 1;
240 }
241
242 char _;
243 if (read(pipefd[0], &_, 1) != 1) {
244 return 1;
245 }
246
247 struct stat st;
248 if (fstat(exec_fd, &st) == -1) {
249 return 1;
250 }
251
252 /* Generate "head -n1 /proc/$PID/maps" */
253 char buf0[256];
254 memset(buf0, ' ', sizeof(buf0));
255 int len = snprintf(buf0, sizeof(buf0),
256 "%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu",
257 VADDR, VADDR + PAGE_SIZE,
258 MAJOR(st.st_dev), MINOR(st.st_dev),
259 (unsigned long long)st.st_ino);
260 buf0[len] = ' ';
261 snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET,
262 "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino);
263
264
265 /* Test /proc/$PID/maps */
266 {
267 char buf[256];
268 ssize_t rv;
269 int fd;
270
271 snprintf(buf, sizeof(buf), "/proc/%u/maps", pid);
272 fd = open(buf, O_RDONLY);
273 if (fd == -1) {
274 return 1;
275 }
276 rv = read(fd, buf, sizeof(buf));
277 assert(rv == strlen(buf0));
278 assert(memcmp(buf, buf0, strlen(buf0)) == 0);
279 }
280
281 /* Test /proc/$PID/smaps */
282 {
283 char buf[1024];
284 ssize_t rv;
285 int fd;
286
287 snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid);
288 fd = open(buf, O_RDONLY);
289 if (fd == -1) {
290 return 1;
291 }
292 rv = read(fd, buf, sizeof(buf));
293 assert(0 <= rv && rv <= sizeof(buf));
294
295 assert(rv >= strlen(buf0));
296 assert(memcmp(buf, buf0, strlen(buf0)) == 0);
297
298#define RSS1 "Rss: 4 kB\n"
299#define RSS2 "Rss: 0 kB\n"
300#define PSS1 "Pss: 4 kB\n"
301#define PSS2 "Pss: 0 kB\n"
302 assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
303 memmem(buf, rv, RSS2, strlen(RSS2)));
304 assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
305 memmem(buf, rv, PSS2, strlen(PSS2)));
306
307 static const char *S[] = {
308 "Size: 4 kB\n",
309 "KernelPageSize: 4 kB\n",
310 "MMUPageSize: 4 kB\n",
311 "Anonymous: 0 kB\n",
312 "AnonHugePages: 0 kB\n",
313 "Shared_Hugetlb: 0 kB\n",
314 "Private_Hugetlb: 0 kB\n",
315 "Locked: 0 kB\n",
316 };
317 int i;
318
319 for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) {
320 assert(memmem(buf, rv, S[i], strlen(S[i])));
321 }
322 }
323
324 /* Test /proc/$PID/smaps_rollup */
325 {
326 char bufr[256];
327 memset(bufr, ' ', sizeof(bufr));
328 len = snprintf(bufr, sizeof(bufr),
329 "%08lx-%08lx ---p 00000000 00:00 0",
330 VADDR, VADDR + PAGE_SIZE);
331 bufr[len] = ' ';
332 snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET,
333 "[rollup]\n");
334
335 char buf[1024];
336 ssize_t rv;
337 int fd;
338
339 snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid);
340 fd = open(buf, O_RDONLY);
341 if (fd == -1) {
342 return 1;
343 }
344 rv = read(fd, buf, sizeof(buf));
345 assert(0 <= rv && rv <= sizeof(buf));
346
347 assert(rv >= strlen(bufr));
348 assert(memcmp(buf, bufr, strlen(bufr)) == 0);
349
350 assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
351 memmem(buf, rv, RSS2, strlen(RSS2)));
352 assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
353 memmem(buf, rv, PSS2, strlen(PSS2)));
354
355 static const char *S[] = {
356 "Anonymous: 0 kB\n",
357 "AnonHugePages: 0 kB\n",
358 "Shared_Hugetlb: 0 kB\n",
359 "Private_Hugetlb: 0 kB\n",
360 "Locked: 0 kB\n",
361 };
362 int i;
363
364 for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) {
365 assert(memmem(buf, rv, S[i], strlen(S[i])));
366 }
367 }
368
369 /* Test /proc/$PID/statm */
370 {
371 char buf[64];
372 ssize_t rv;
373 int fd;
374
375 snprintf(buf, sizeof(buf), "/proc/%u/statm", pid);
376 fd = open(buf, O_RDONLY);
377 if (fd == -1) {
378 return 1;
379 }
380 rv = read(fd, buf, sizeof(buf));
381 assert(rv == 7 * 2);
382
383 assert(buf[0] == '1'); /* ->total_vm */
384 assert(buf[1] == ' ');
385 assert(buf[2] == '0' || buf[2] == '1'); /* rss */
386 assert(buf[3] == ' ');
387		assert(buf[4] == '0' || buf[4] == '1');	/* file rss */
388 assert(buf[5] == ' ');
389 assert(buf[6] == '1'); /* ELF executable segments */
390 assert(buf[7] == ' ');
391 assert(buf[8] == '0');
392 assert(buf[9] == ' ');
393 assert(buf[10] == '0'); /* ->data_vm + ->stack_vm */
394 assert(buf[11] == ' ');
395 assert(buf[12] == '0');
396 assert(buf[13] == '\n');
397 }
398
399 return 0;
400}
401#else
402int main(void)
403{
404 return 4;
405}
406#endif
diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c
index 85744425b08d..762cb01f2ca7 100644
--- a/tools/testing/selftests/proc/proc-self-map-files-002.c
+++ b/tools/testing/selftests/proc/proc-self-map-files-002.c
@@ -63,7 +63,7 @@ int main(void)
63 p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); 63 p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0);
64 if (p == MAP_FAILED) { 64 if (p == MAP_FAILED) {
65 if (errno == EPERM) 65 if (errno == EPERM)
66 return 2; 66 return 4;
67 return 1; 67 return 1;
68 } 68 }
69 69
diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c
index 5ab5f4810e43..9f6d000c0245 100644
--- a/tools/testing/selftests/proc/proc-self-syscall.c
+++ b/tools/testing/selftests/proc/proc-self-syscall.c
@@ -20,7 +20,6 @@
20#include <sys/stat.h> 20#include <sys/stat.h>
21#include <fcntl.h> 21#include <fcntl.h>
22#include <errno.h> 22#include <errno.h>
23#include <unistd.h>
24#include <string.h> 23#include <string.h>
25#include <stdio.h> 24#include <stdio.h>
26 25
@@ -39,7 +38,7 @@ int main(void)
39 fd = open("/proc/self/syscall", O_RDONLY); 38 fd = open("/proc/self/syscall", O_RDONLY);
40 if (fd == -1) { 39 if (fd == -1) {
41 if (errno == ENOENT) 40 if (errno == ENOENT)
42 return 2; 41 return 4;
43 return 1; 42 return 1;
44 } 43 }
45 44
diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c
index a38b2fbaa7ad..b467b98a457d 100644
--- a/tools/testing/selftests/proc/proc-self-wchan.c
+++ b/tools/testing/selftests/proc/proc-self-wchan.c
@@ -27,7 +27,7 @@ int main(void)
27 fd = open("/proc/self/wchan", O_RDONLY); 27 fd = open("/proc/self/wchan", O_RDONLY);
28 if (fd == -1) { 28 if (fd == -1) {
29 if (errno == ENOENT) 29 if (errno == ENOENT)
30 return 2; 30 return 4;
31 return 1; 31 return 1;
32 } 32 }
33 33
diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c
index 563e752e6eba..b3ef9e14d6cc 100644
--- a/tools/testing/selftests/proc/read.c
+++ b/tools/testing/selftests/proc/read.c
@@ -26,8 +26,10 @@
26#include <dirent.h> 26#include <dirent.h>
27#include <stdbool.h> 27#include <stdbool.h>
28#include <stdlib.h> 28#include <stdlib.h>
29#include <stdio.h>
29#include <string.h> 30#include <string.h>
30#include <sys/stat.h> 31#include <sys/stat.h>
32#include <sys/vfs.h>
31#include <fcntl.h> 33#include <fcntl.h>
32#include <unistd.h> 34#include <unistd.h>
33 35
@@ -123,10 +125,22 @@ static void f(DIR *d, unsigned int level)
123int main(void) 125int main(void)
124{ 126{
125 DIR *d; 127 DIR *d;
128 struct statfs sfs;
126 129
127 d = opendir("/proc"); 130 d = opendir("/proc");
128 if (!d) 131 if (!d)
132 return 4;
133
134 /* Ensure /proc is proc. */
135 if (fstatfs(dirfd(d), &sfs) == -1) {
136 return 1;
137 }
138 if (sfs.f_type != 0x9fa0) {
139 fprintf(stderr, "error: unexpected f_type %lx\n", (long)sfs.f_type);
129 return 2; 140 return 2;
141 }
142
130 f(d, 0); 143 f(d, 0);
144
131 return 0; 145 return 0;
132} 146}
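
One note on the magic number used above: 0x9fa0 is PROC_SUPER_MAGIC, so an equivalent (hedged) spelling of the check would be:

    #include <linux/magic.h>    /* PROC_SUPER_MAGIC == 0x9fa0 */

    if (sfs.f_type != PROC_SUPER_MAGIC) {
            fprintf(stderr, "error: /proc is not procfs\n");
            return 2;
    }
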
diff --git a/tools/testing/selftests/tmpfs/.gitignore b/tools/testing/selftests/tmpfs/.gitignore
new file mode 100644
index 000000000000..a96838fad74d
--- /dev/null
+++ b/tools/testing/selftests/tmpfs/.gitignore
@@ -0,0 +1 @@
/bug-link-o-tmpfile
diff --git a/tools/testing/selftests/tmpfs/Makefile b/tools/testing/selftests/tmpfs/Makefile
new file mode 100644
index 000000000000..953c81299181
--- /dev/null
+++ b/tools/testing/selftests/tmpfs/Makefile
@@ -0,0 +1,7 @@
1CFLAGS += -Wall -O2
2CFLAGS += -D_GNU_SOURCE
3
4TEST_GEN_PROGS :=
5TEST_GEN_PROGS += bug-link-o-tmpfile
6
7include ../lib.mk
diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c
new file mode 100644
index 000000000000..b5c3ddb90942
--- /dev/null
+++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16/* Test that open(O_TMPFILE), linkat() doesn't screw accounting. */
17#include <errno.h>
18#include <sched.h>
19#include <stdio.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <fcntl.h>
23#include <sys/mount.h>
24#include <unistd.h>
25
26int main(void)
27{
28 int fd;
29
30 if (unshare(CLONE_NEWNS) == -1) {
31 if (errno == ENOSYS || errno == EPERM) {
32 fprintf(stderr, "error: unshare, errno %d\n", errno);
33 return 4;
34 }
35 fprintf(stderr, "error: unshare, errno %d\n", errno);
36 return 1;
37 }
38 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
39 fprintf(stderr, "error: mount '/', errno %d\n", errno);
40 return 1;
41 }
42
43 /* Our heroes: 1 root inode, 1 O_TMPFILE inode, 1 permanent inode. */
44 if (mount(NULL, "/tmp", "tmpfs", 0, "nr_inodes=3") == -1) {
45 fprintf(stderr, "error: mount tmpfs, errno %d\n", errno);
46 return 1;
47 }
48
49 fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600);
50 if (fd == -1) {
51 fprintf(stderr, "error: open 1, errno %d\n", errno);
52 return 1;
53 }
54 if (linkat(fd, "", AT_FDCWD, "/tmp/1", AT_EMPTY_PATH) == -1) {
55 fprintf(stderr, "error: linkat, errno %d\n", errno);
56 return 1;
57 }
58 close(fd);
59
60 fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600);
61 if (fd == -1) {
62 fprintf(stderr, "error: open 2, errno %d\n", errno);
63 return 1;
64 }
65
66 return 0;
67}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 584a91ae4a8f..951c507a27f7 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -211,4 +211,20 @@ else
211 echo "[PASS]" 211 echo "[PASS]"
212fi 212fi
213 213
214echo "------------------------------------"
215echo "running vmalloc stability smoke test"
216echo "------------------------------------"
217./test_vmalloc.sh smoke
218ret_val=$?
219
220if [ $ret_val -eq 0 ]; then
221 echo "[PASS]"
222elif [ $ret_val -eq $ksft_skip ]; then
223 echo "[SKIP]"
224 exitcode=$ksft_skip
225else
226 echo "[FAIL]"
227 exitcode=1
228fi
229
214exit $exitcode 230exit $exitcode
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh
new file mode 100644
index 000000000000..06d2bb109f06
--- /dev/null
+++ b/tools/testing/selftests/vm/test_vmalloc.sh
@@ -0,0 +1,176 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
5#
 6# This is a test script for the kernel test driver to analyse the vmalloc
 7# allocator. Therefore it is just a kernel module loader. You can specify
 8# and pass different parameters in order to:
 9# a) analyse the performance of vmalloc allocations;
10# b) stress and stability-check the vmalloc subsystem.
11
12TEST_NAME="vmalloc"
13DRIVER="test_${TEST_NAME}"
14
15# 1 if fails
16exitcode=1
17
18# Kselftest framework requirement - SKIP code is 4.
19ksft_skip=4
20
21#
22# Static templates for performance, stressing and smoke tests.
23# Also it is possible to pass any supported parameters manually.
24#
25PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3"
26SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10"
27STRESS_PARAM="test_repeat_count=20"
28
29check_test_requirements()
30{
31 uid=$(id -u)
32 if [ $uid -ne 0 ]; then
33 echo "$0: Must be run as root"
34 exit $ksft_skip
35 fi
36
37 if ! which modprobe > /dev/null 2>&1; then
38 echo "$0: You need modprobe installed"
39 exit $ksft_skip
40 fi
41
42 if ! modinfo $DRIVER > /dev/null 2>&1; then
43 echo "$0: You must have the following enabled in your kernel:"
44 echo "CONFIG_TEST_VMALLOC=m"
45 exit $ksft_skip
46 fi
47}
48
49run_performance_check()
50{
51 echo "Run performance tests to evaluate how fast vmalloc allocation is."
52	echo "It runs all test cases on a single CPU in sequential order."
53
54 modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
55 echo "Done."
56	echo "Check the kernel message buffer to see the summary."
57}
58
59run_stability_check()
60{
61 echo "Run stability tests. In order to stress vmalloc subsystem we run"
62 echo "all available test cases on all available CPUs simultaneously."
63 echo "It will take time, so be patient."
64
65 modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
66 echo "Done."
67 echo "Check the kernel ring buffer to see the summary."
68}
69
70run_smoke_check()
71{
72	echo "Run smoke test. Note that this test provides basic coverage."
73	echo "Please check $0 output to see how it can be used"
74 echo "for deep performance analysis as well as stress testing."
75
76 modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
77 echo "Done."
78 echo "Check the kernel ring buffer to see the summary."
79}
80
81usage()
82{
83	echo -n "Usage: $0 [ performance ] | [ stress ] | [ smoke ] | "
84 echo "manual parameters"
85 echo
86 echo "Valid tests and parameters:"
87 echo
88 modinfo $DRIVER
89 echo
90 echo "Example usage:"
91 echo
92 echo "# Shows help message"
93 echo "./${DRIVER}.sh"
94 echo
95 echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs"
96 echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5"
97 echo
98 echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
99 echo "sequential order"
100 echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 "
101 echo "run_test_mask=23"
102 echo
103 echo -n "# Runs all tests on all online CPUs, shuffled order, repeats "
104 echo "20 times"
105 echo "./${DRIVER}.sh test_repeat_count=20"
106 echo
107 echo "# Performance analysis"
108 echo "./${DRIVER}.sh performance"
109 echo
110 echo "# Stress testing"
111 echo "./${DRIVER}.sh stress"
112 echo
113 exit 0
114}
115
116function validate_passed_args()
117{
118 VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
119
120 #
121 # Something has been passed, check it.
122 #
123 for passed_arg in $@; do
124 key=${passed_arg//=*/}
125 val="${passed_arg:$((${#key}+1))}"
126 valid=0
127
128 for valid_arg in $VALID_ARGS; do
129 if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
130 valid=1
131 break
132 fi
133 done
134
135 if [[ $valid -ne 1 ]]; then
136 echo "Error: key or value is not correct: ${key} $val"
137 exit $exitcode
138 fi
139 done
140}
141
142function run_manual_check()
143{
144 #
145	# Validate passed parameters. If there is a wrong one,
146	# the script exits and does not execute further.
147 #
148 validate_passed_args $@
149
150 echo "Run the test with following parameters: $@"
151 modprobe $DRIVER $@ > /dev/null 2>&1
152 echo "Done."
153 echo "Check the kernel ring buffer to see the summary."
154}
155
156function run_test()
157{
158 if [ $# -eq 0 ]; then
159 usage
160 else
161 if [[ "$1" = "performance" ]]; then
162			run_performance_check
163 elif [[ "$1" = "stress" ]]; then
164 run_stability_check
165 elif [[ "$1" = "smoke" ]]; then
166 run_smoke_check
167 else
168 run_manual_check $@
169 fi
170 fi
171}
172
173check_test_requirements
174run_test $@
175
176exit 0
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 1ff3a6c0367b..6f64b2b93234 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -133,7 +133,7 @@ static const char * const page_flag_names[] = {
133 [KPF_NOPAGE] = "n:nopage", 133 [KPF_NOPAGE] = "n:nopage",
134 [KPF_KSM] = "x:ksm", 134 [KPF_KSM] = "x:ksm",
135 [KPF_THP] = "t:thp", 135 [KPF_THP] = "t:thp",
136 [KPF_BALLOON] = "o:balloon", 136 [KPF_OFFLINE] = "o:offline",
137 [KPF_PGTABLE] = "g:pgtable", 137 [KPF_PGTABLE] = "g:pgtable",
138 [KPF_ZERO_PAGE] = "z:zero_page", 138 [KPF_ZERO_PAGE] = "z:zero_page",
139 [KPF_IDLE] = "i:idle_page", 139 [KPF_IDLE] = "i:idle_page",
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 334b16db0ebb..73818f1b2ef8 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -110,39 +110,42 @@ static void fatal(const char *x, ...)
110static void usage(void) 110static void usage(void)
111{ 111{
112 printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" 112 printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n"
113 "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" 113 "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n"
114 "-a|--aliases Show aliases\n" 114 "-a|--aliases Show aliases\n"
115 "-A|--activity Most active slabs first\n" 115 "-A|--activity Most active slabs first\n"
116 "-d<options>|--debug=<options> Set/Clear Debug options\n" 116 "-B|--Bytes Show size in bytes\n"
117 "-D|--display-active Switch line format to activity\n" 117 "-D|--display-active Switch line format to activity\n"
118 "-e|--empty Show empty slabs\n" 118 "-e|--empty Show empty slabs\n"
119 "-f|--first-alias Show first alias\n" 119 "-f|--first-alias Show first alias\n"
120 "-h|--help Show usage information\n" 120 "-h|--help Show usage information\n"
121 "-i|--inverted Inverted list\n" 121 "-i|--inverted Inverted list\n"
122 "-l|--slabs Show slabs\n" 122 "-l|--slabs Show slabs\n"
123 "-L|--Loss Sort by loss\n"
123 "-n|--numa Show NUMA information\n" 124 "-n|--numa Show NUMA information\n"
124 "-o|--ops Show kmem_cache_ops\n" 125 "-N|--lines=K Show the first K slabs\n"
126 "-o|--ops Show kmem_cache_ops\n"
127 "-r|--report Detailed report on single slabs\n"
125 "-s|--shrink Shrink slabs\n" 128 "-s|--shrink Shrink slabs\n"
126 "-r|--report Detailed report on single slabs\n"
127 "-S|--Size Sort by size\n" 129 "-S|--Size Sort by size\n"
128 "-t|--tracking Show alloc/free information\n" 130 "-t|--tracking Show alloc/free information\n"
129 "-T|--Totals Show summary information\n" 131 "-T|--Totals Show summary information\n"
132 "-U|--Unreclaim Show unreclaimable slabs only\n"
130 "-v|--validate Validate slabs\n" 133 "-v|--validate Validate slabs\n"
131 "-z|--zero Include empty slabs\n" 134 "-z|--zero Include empty slabs\n"
132 "-1|--1ref Single reference\n" 135 "-1|--1ref Single reference\n"
133 "-N|--lines=K Show the first K slabs\n"
134 "-L|--Loss Sort by loss\n"
135 "-X|--Xtotals Show extended summary information\n" 136 "-X|--Xtotals Show extended summary information\n"
136 "-B|--Bytes Show size in bytes\n" 137
137 "-U|--Unreclaim Show unreclaimable slabs only\n" 138 "\n"
138 "\nValid debug options (FZPUT may be combined)\n" 139 "-d | --debug Switch off all debug options\n"
139 "a / A Switch on all debug options (=FZUP)\n" 140 "-da | --debug=a Switch on all debug options (--debug=FZPU)\n"
140 "- Switch off all debug options\n" 141
141 "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" 142 "\n"
142 "z / Z Redzoning\n" 143 "-d[afzput] | --debug=[afzput]\n"
143 "p / P Poisoning\n" 144 " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
144 "u / U Tracking\n" 145 " z | Z Redzoning\n"
145 "t / T Tracing\n" 146 " p | P Poisoning\n"
147 " u | U Tracking\n"
148 " t | T Tracing\n"
146 ); 149 );
147} 150}
148 151