author     Linus Torvalds <torvalds@linux-foundation.org>  2018-10-26 22:33:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-10-26 22:33:41 -0400
commit     345671ea0f9258f410eb057b9ced9cefbbe5dc78 (patch)
tree       fe97ba3d27679789e6aa34e39b002ee64ce25412
parent     4904008165c8a1c48602b8316139691b8c735e6e (diff)
parent     22146c3ce98962436e401f7b7016a6f664c9ffb5 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - a few misc things

 - ocfs2 updates

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (132 commits)
  hugetlbfs: dirty pages as they are added to pagecache
  mm: export add_swap_extent()
  mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
  tools/testing/selftests/vm/map_fixed_noreplace.c: add test for MAP_FIXED_NOREPLACE
  mm: thp: relocate flush_cache_range() in migrate_misplaced_transhuge_page()
  mm: thp: fix mmu_notifier in migrate_misplaced_transhuge_page()
  mm: thp: fix MADV_DONTNEED vs migrate_misplaced_transhuge_page race condition
  mm/kasan/quarantine.c: make quarantine_lock a raw_spinlock_t
  mm/gup: cache dev_pagemap while pinning pages
  Revert "x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"
  mm: return zero_resv_unavail optimization
  mm: zero remaining unavailable struct pages
  tools/testing/selftests/vm/gup_benchmark.c: add MAP_HUGETLB option
  tools/testing/selftests/vm/gup_benchmark.c: add MAP_SHARED option
  tools/testing/selftests/vm/gup_benchmark.c: allow user specified file
  tools/testing/selftests/vm/gup_benchmark.c: fix 'write' flag usage
  mm/gup_benchmark.c: add additional pinning methods
  mm/gup_benchmark.c: time put_page()
  mm: don't raise MEMCG_OOM event due to failed high-order allocation
  mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock
  ...
-rw-r--r--  Documentation/accounting/psi.txt | 73
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 22
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 12
-rw-r--r--  Documentation/filesystems/proc.txt | 4
-rw-r--r--  Documentation/vm/slub.rst | 12
-rw-r--r--  Documentation/x86/pat.txt | 4
-rw-r--r--  arch/alpha/Kconfig | 2
-rw-r--r--  arch/alpha/kernel/core_irongate.c | 4
-rw-r--r--  arch/alpha/kernel/setup.c | 98
-rw-r--r--  arch/alpha/mm/numa.c | 113
-rw-r--r--  arch/arm/include/asm/hugetlb-3level.h | 32
-rw-r--r--  arch/arm/include/asm/hugetlb.h | 33
-rw-r--r--  arch/arm64/include/asm/hugetlb.h | 39
-rw-r--r--  arch/arm64/include/asm/string.h | 14
-rw-r--r--  arch/arm64/kernel/arm64ksyms.c | 7
-rw-r--r--  arch/arm64/lib/memchr.S | 2
-rw-r--r--  arch/arm64/lib/memcmp.S | 2
-rw-r--r--  arch/arm64/lib/strchr.S | 2
-rw-r--r--  arch/arm64/lib/strcmp.S | 2
-rw-r--r--  arch/arm64/lib/strlen.S | 2
-rw-r--r--  arch/arm64/lib/strncmp.S | 2
-rw-r--r--  arch/arm64/lib/strnlen.S | 2
-rw-r--r--  arch/arm64/lib/strrchr.S | 2
-rw-r--r--  arch/hexagon/Kconfig | 3
-rw-r--r--  arch/hexagon/mm/init.c | 20
-rw-r--r--  arch/ia64/include/asm/hugetlb.h | 47
-rw-r--r--  arch/ia64/include/asm/pgtable.h | 1
-rw-r--r--  arch/mips/include/asm/hugetlb.h | 40
-rw-r--r--  arch/nios2/Kconfig | 3
-rw-r--r--  arch/nios2/kernel/prom.c | 17
-rw-r--r--  arch/nios2/kernel/setup.c | 39
-rw-r--r--  arch/parisc/include/asm/hugetlb.h | 33
-rw-r--r--  arch/powerpc/include/asm/book3s/32/pgtable.h | 6
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h | 1
-rw-r--r--  arch/powerpc/include/asm/hugetlb.h | 43
-rw-r--r--  arch/powerpc/include/asm/nohash/32/pgtable.h | 6
-rw-r--r--  arch/powerpc/include/asm/nohash/64/pgtable.h | 1
-rw-r--r--  arch/powerpc/platforms/cell/cpufreq_spudemand.c | 2
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c | 9
-rw-r--r--  arch/s390/appldata/appldata_os.c | 4
-rw-r--r--  arch/sh/include/asm/hugetlb.h | 54
-rw-r--r--  arch/sparc/include/asm/hugetlb.h | 40
-rw-r--r--  arch/um/Kconfig | 2
-rw-r--r--  arch/um/kernel/physmem.c | 22
-rw-r--r--  arch/unicore32/Kconfig | 1
-rw-r--r--  arch/unicore32/mm/init.c | 54
-rw-r--r--  arch/x86/entry/vdso/vma.c | 24
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 69
-rw-r--r--  arch/x86/kernel/e820.c | 15
-rw-r--r--  arch/xtensa/include/asm/Kbuild | 1
-rw-r--r--  arch/xtensa/include/asm/vga.h | 19
-rw-r--r--  block/blk-iolatency.c | 8
-rw-r--r--  drivers/base/node.c | 19
-rw-r--r--  drivers/cpuidle/governors/menu.c | 4
-rw-r--r--  drivers/infiniband/hw/hfi1/mmu_rb.c | 1
-rw-r--r--  drivers/iommu/amd_iommu_v2.c | 1
-rw-r--r--  drivers/iommu/intel-svm.c | 1
-rw-r--r--  drivers/misc/sgi-gru/grutlbpurge.c | 1
-rw-r--r--  drivers/of/fdt.c | 11
-rw-r--r--  drivers/staging/android/ion/ion_page_pool.c | 8
-rw-r--r--  fs/cramfs/inode.c | 5
-rw-r--r--  fs/dcache.c | 38
-rw-r--r--  fs/iomap.c | 2
-rw-r--r--  fs/kernfs/mount.c | 3
-rw-r--r--  fs/ocfs2/alloc.c | 4
-rw-r--r--  fs/ocfs2/aops.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/proc/inode.c | 3
-rw-r--r--  fs/proc/loadavg.c | 3
-rw-r--r--  fs/proc/meminfo.c | 16
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  fs/userfaultfd.c | 8
-rw-r--r--  include/asm-generic/hugetlb.h | 88
-rw-r--r--  include/asm-generic/pgtable.h | 4
-rw-r--r--  include/linux/cgroup-defs.h | 4
-rw-r--r--  include/linux/cgroup.h | 15
-rw-r--r--  include/linux/delayacct.h | 23
-rw-r--r--  include/linux/hmm.h | 2
-rw-r--r--  include/linux/huge_mm.h | 8
-rw-r--r--  include/linux/iomap.h | 4
-rw-r--r--  include/linux/linkage.h | 1
-rw-r--r--  include/linux/math64.h | 3
-rw-r--r--  include/linux/memblock.h | 15
-rw-r--r--  include/linux/memcontrol.h | 15
-rw-r--r--  include/linux/mm.h | 48
-rw-r--r--  include/linux/mmu_notifier.h | 27
-rw-r--r--  include/linux/mmzone.h | 4
-rw-r--r--  include/linux/page-flags.h | 14
-rw-r--r--  include/linux/pfn_t.h | 4
-rw-r--r--  include/linux/psi.h | 53
-rw-r--r--  include/linux/psi_types.h | 92
-rw-r--r--  include/linux/sched.h | 13
-rw-r--r--  include/linux/sched/loadavg.h | 24
-rw-r--r--  include/linux/slab.h | 56
-rw-r--r--  include/linux/swap.h | 15
-rw-r--r--  include/trace/events/mmflags.h | 1
-rw-r--r--  include/uapi/linux/taskstats.h | 6
-rw-r--r--  init/Kconfig | 19
-rw-r--r--  kernel/cgroup/cgroup.c | 45
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 7
-rw-r--r--  kernel/delayacct.c | 15
-rw-r--r--  kernel/fork.c | 59
-rw-r--r--  kernel/memremap.c | 25
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/core.c | 16
-rw-r--r--  kernel/sched/loadavg.c | 139
-rw-r--r--  kernel/sched/psi.c | 759
-rw-r--r--  kernel/sched/sched.h | 178
-rw-r--r--  kernel/sched/stats.h | 86
-rw-r--r--  lib/test_kasan.c | 70
-rw-r--r--  mm/compaction.c | 5
-rw-r--r--  mm/debug.c | 46
-rw-r--r--  mm/filemap.c | 37
-rw-r--r--  mm/gup.c | 115
-rw-r--r--  mm/gup_benchmark.c | 37
-rw-r--r--  mm/hmm.c | 12
-rw-r--r--  mm/huge_memory.c | 31
-rw-r--r--  mm/hugetlb.c | 6
-rw-r--r--  mm/kasan/quarantine.c | 18
-rw-r--r--  mm/kmemleak.c | 42
-rw-r--r--  mm/memblock.c | 5
-rw-r--r--  mm/memcontrol.c | 54
-rw-r--r--  mm/memory.c | 156
-rw-r--r--  mm/memory_hotplug.c | 146
-rw-r--r--  mm/mempolicy.c | 35
-rw-r--r--  mm/migrate.c | 44
-rw-r--r--  mm/mmap.c | 96
-rw-r--r--  mm/mmu_notifier.c | 31
-rw-r--r--  mm/mremap.c | 20
-rw-r--r--  mm/nommu.c | 6
-rw-r--r--  mm/page-writeback.c | 33
-rw-r--r--  mm/page_alloc.c | 362
-rw-r--r--  mm/page_io.c | 6
-rw-r--r--  mm/slab.c | 8
-rw-r--r--  mm/slab_common.c | 115
-rw-r--r--  mm/slub.c | 83
-rw-r--r--  mm/sparse.c | 4
-rw-r--r--  mm/swap.c | 1
-rw-r--r--  mm/swap_state.c | 1
-rw-r--r--  mm/swapfile.c | 83
-rw-r--r--  mm/util.c | 5
-rw-r--r--  mm/vmalloc.c | 4
-rw-r--r--  mm/vmscan.c | 31
-rw-r--r--  mm/vmstat.c | 10
-rw-r--r--  mm/workingset.c | 135
-rw-r--r--  mm/zsmalloc.c | 2
-rwxr-xr-x  scripts/tags.sh | 2
-rw-r--r--  tools/accounting/getdelays.c | 8
-rw-r--r--  tools/testing/selftests/vm/.gitignore | 1
-rw-r--r--  tools/testing/selftests/vm/Makefile | 1
-rw-r--r--  tools/testing/selftests/vm/gup_benchmark.c | 42
-rw-r--r--  tools/testing/selftests/vm/map_fixed_noreplace.c | 206
-rw-r--r--  tools/testing/selftests/vm/userfaultfd.c | 134
-rw-r--r--  virt/kvm/kvm_main.c | 1
156 files changed, 3400 insertions, 1988 deletions
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
new file mode 100644
index 000000000000..b8ca28b60215
--- /dev/null
+++ b/Documentation/accounting/psi.txt
@@ -0,0 +1,73 @@
+================================
+PSI - Pressure Stall Information
+================================
+
+:Date: April, 2018
+:Author: Johannes Weiner <hannes@cmpxchg.org>
+
+When CPU, memory or IO devices are contended, workloads experience
+latency spikes, throughput losses, and run the risk of OOM kills.
+
+Without an accurate measure of such contention, users are forced to
+either play it safe and under-utilize their hardware resources, or
+roll the dice and frequently suffer the disruptions resulting from
+excessive overcommit.
+
+The psi feature identifies and quantifies the disruptions caused by
+such resource crunches and the time impact it has on complex workloads
+or even entire systems.
+
+Having an accurate measure of productivity losses caused by resource
+scarcity aids users in sizing workloads to hardware--or provisioning
+hardware according to workload demand.
+
+As psi aggregates this information in realtime, systems can be managed
+dynamically using techniques such as load shedding, migrating jobs to
+other systems or data centers, or strategically pausing or killing low
+priority or restartable batch jobs.
+
+This allows maximizing hardware utilization without sacrificing
+workload health or risking major disruptions such as OOM kills.
+
+Pressure interface
+==================
+
+Pressure information for each resource is exported through the
+respective file in /proc/pressure/ -- cpu, memory, and io.
+
+The format for CPU is as such:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+and for memory and IO:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+full avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+The "some" line indicates the share of time in which at least some
+tasks are stalled on a given resource.
+
+The "full" line indicates the share of time in which all non-idle
+tasks are stalled on a given resource simultaneously. In this state
+actual CPU cycles are going to waste, and a workload that spends
+extended time in this state is considered to be thrashing. This has
+severe impact on performance, and it's useful to distinguish this
+situation from a state where some tasks are stalled but the CPU is
+still doing productive work. As such, time spent in this subset of the
+stall state is tracked separately and exported in the "full" averages.
+
+The ratios are tracked as recent trends over ten, sixty, and three
+hundred second windows, which gives insight into short term events as
+well as medium and long term trends. The total absolute stall time is
+tracked and exported as well, to allow detection of latency spikes
+which wouldn't necessarily make a dent in the time averages, or to
+average trends over custom time frames.
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each subdirectory in the cgroupfs mountpoint contains
+cpu.pressure, memory.pressure, and io.pressure files; the format is
+the same as the /proc/pressure/ files.
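
The interface described above is easy to consume from user space. Below is a
minimal sketch (not part of this patch set) that reads the "some" line of
/proc/pressure/memory using the field names from the new document; the same
parsing applies to a cgroup's memory.pressure file mentioned in the Cgroup2
section. Error handling is reduced to the bare minimum.

/*
 * Minimal user-space sketch (not part of this patch set) that reads the
 * "some" line of /proc/pressure/memory in the format documented above.
 */
#include <stdio.h>

int main(void)
{
	float avg10, avg60, avg300;
	unsigned long long total;
	FILE *f = fopen("/proc/pressure/memory", "r");

	if (!f) {
		perror("fopen");	/* no CONFIG_PSI, or an older kernel */
		return 1;
	}
	if (fscanf(f, "some avg10=%f avg60=%f avg300=%f total=%llu",
		   &avg10, &avg60, &avg300, &total) == 4)
		printf("some memory stall: %.2f%% over 10s, %llu us total\n",
		       avg10, total);
	fclose(f);
	return 0;
}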
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index caf36105a1c7..8384c681a4b2 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -966,6 +966,12 @@ All time durations are in microseconds.
$PERIOD duration. "max" for $MAX indicates no limit. If only
one number is written, $MAX is updated.

+cpu.pressure
+A read-only nested-key file which exists on non-root cgroups.
+
+Shows pressure stall information for CPU. See
+Documentation/accounting/psi.txt for details.
+

Memory
------
@@ -1127,6 +1133,10 @@ PAGE_SIZE multiple when read back.
disk readahead. For now OOM in memory cgroup kills
tasks iff shortage has happened inside page fault.

+This event is not raised if the OOM killer is not
+considered as an option, e.g. for failed high-order
+allocations.
+
oom_kill
The number of processes belonging to this cgroup
killed by any kind of OOM killer.
@@ -1271,6 +1281,12 @@ PAGE_SIZE multiple when read back.
higher than the limit for an extended period of time. This
reduces the impact on the workload and memory management.

+memory.pressure
+A read-only nested-key file which exists on non-root cgroups.
+
+Shows pressure stall information for memory. See
+Documentation/accounting/psi.txt for details.
+

Usage Guidelines
~~~~~~~~~~~~~~~~
@@ -1408,6 +1424,12 @@ IO Interface Files

8:16 rbps=2097152 wbps=max riops=max wiops=max

+io.pressure
+A read-only nested-key file which exists on non-root cgroups.
+
+Shows pressure stall information for IO. See
+Documentation/accounting/psi.txt for details.
+

Writeback
~~~~~~~~~
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 47ca5cda0eef..b90fe3b6bc6c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4851,6 +4851,18 @@
This is actually a boot loader parameter; the value is
passed to the kernel using a special protocol.

+vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y.
+May slow down system boot speed, especially when
+enabled on systems with a large amount of memory.
+All options are enabled by default, and this
+interface is meant to allow for selectively
+enabling or disabling specific virtual memory
+debugging features.
+
+Available options are:
+P Enable page structure init time poisoning
+- Disable all of the above options
+
vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact
size of <nn>. This can be used to increase the
minimum size (128MB on x86). It can also be used to
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22b4b00dee31..12a5e6e693b6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -858,6 +858,7 @@ Writeback: 0 kB
AnonPages: 861800 kB
Mapped: 280372 kB
Shmem: 644 kB
+KReclaimable: 168048 kB
Slab: 284364 kB
SReclaimable: 159856 kB
SUnreclaim: 124508 kB
@@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables
ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
with huge pages
ShmemPmdMapped: Shared memory mapped into userspace with huge pages
+KReclaimable: Kernel allocations that the kernel will attempt to reclaim
+under memory pressure. Includes SReclaimable (below), and other
+direct allocations with a shrinker.
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
index 3a775fd64e2d..195928808bac 100644
--- a/Documentation/vm/slub.rst
+++ b/Documentation/vm/slub.rst
@@ -36,9 +36,10 @@ debugging is enabled. Format:

slub_debug=<Debug-Options>
Enable options for all slabs
-slub_debug=<Debug-Options>,<slab name>
-Enable options only for select slabs

+slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
+Enable options only for select slabs (no spaces
+after a comma)

Possible debug options are::

@@ -62,7 +63,12 @@ Trying to find an issue in the dentry cache? Try::

slub_debug=,dentry

-to only enable debugging on the dentry cache.
+to only enable debugging on the dentry cache. You may use an asterisk at the
+end of the slab name, in order to cover all slabs with the same prefix. For
+example, here's how you can poison the dentry cache as well as all kmalloc
+slabs:
+
+ slub_debug=P,kmalloc-*,dentry

Red zoning and tracking may realign the slab. We can just apply sanity checks
to the dentry cache with::
diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt
index 2a4ee6302122..481d8d8536ac 100644
--- a/Documentation/x86/pat.txt
+++ b/Documentation/x86/pat.txt
@@ -90,12 +90,12 @@ pci proc | -- | -- | WC |
Advanced APIs for drivers
-------------------------
A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range,
-vm_insert_pfn
+vmf_insert_pfn

Drivers wanting to export some pages to userspace do it by using mmap
interface and a combination of
1) pgprot_noncached()
-2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn()
+2) io_remap_pfn_range() or remap_pfn_range() or vmf_insert_pfn()

With PAT support, a new API pgprot_writecombine is being added. So, drivers can
continue to use the above sequence, with either pgprot_noncached() or
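
For context, the sequence described in this document maps onto a driver's mmap
handler roughly as sketched below. This is an illustrative sketch only, not
code from this series; "my_dev_mmap" and "my_phys_base" are hypothetical
names, and a real driver would take the physical base from its device
resources.

/*
 * Illustrative sketch only: an mmap handler following the sequence
 * described above.  "my_dev_mmap" and "my_phys_base" are hypothetical.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_phys_base;	/* hypothetical, set at probe time */

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* 1) pick the memory type; pgprot_writecombine() works the same way */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	/* 2) install the mapping */
	return remap_pfn_range(vma, vma->vm_start,
			       my_phys_base >> PAGE_SHIFT, size,
			       vma->vm_page_prot);
}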
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 5b4f88363453..620b0a711ee4 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -31,6 +31,8 @@ config ALPHA
31 select ODD_RT_SIGACTION 31 select ODD_RT_SIGACTION
32 select OLD_SIGSUSPEND 32 select OLD_SIGSUSPEND
33 select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 33 select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
34 select HAVE_MEMBLOCK
35 select NO_BOOTMEM
34 help 36 help
35 The Alpha is a 64-bit general-purpose processor designed and 37 The Alpha is a 64-bit general-purpose processor designed and
36 marketed by the Digital Equipment Corporation of blessed memory, 38 marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c
index aec757250e07..f70986683fc6 100644
--- a/arch/alpha/kernel/core_irongate.c
+++ b/arch/alpha/kernel/core_irongate.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/initrd.h> 22#include <linux/initrd.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24 25
25#include <asm/ptrace.h> 26#include <asm/ptrace.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -241,8 +242,7 @@ albacore_init_arch(void)
241 size / 1024); 242 size / 1024);
242 } 243 }
243#endif 244#endif
244 reserve_bootmem_node(NODE_DATA(0), pci_mem, memtop - 245 memblock_reserve(pci_mem, memtop - pci_mem);
245 pci_mem, BOOTMEM_DEFAULT);
246 printk("irongate_init_arch: temporarily reserving " 246 printk("irongate_init_arch: temporarily reserving "
247 "region %08lx-%08lx for PCI\n", pci_mem, memtop - 1); 247 "region %08lx-%08lx for PCI\n", pci_mem, memtop - 1);
248 } 248 }
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 5576f7646fb6..4f0d94471bc9 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -30,6 +30,7 @@
30#include <linux/ioport.h> 30#include <linux/ioport.h>
31#include <linux/platform_device.h> 31#include <linux/platform_device.h>
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/memblock.h>
33#include <linux/pci.h> 34#include <linux/pci.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/root_dev.h> 36#include <linux/root_dev.h>
@@ -312,9 +313,7 @@ setup_memory(void *kernel_end)
312{ 313{
313 struct memclust_struct * cluster; 314 struct memclust_struct * cluster;
314 struct memdesc_struct * memdesc; 315 struct memdesc_struct * memdesc;
315 unsigned long start_kernel_pfn, end_kernel_pfn; 316 unsigned long kernel_size;
316 unsigned long bootmap_size, bootmap_pages, bootmap_start;
317 unsigned long start, end;
318 unsigned long i; 317 unsigned long i;
319 318
320 /* Find free clusters, and init and free the bootmem accordingly. */ 319 /* Find free clusters, and init and free the bootmem accordingly. */
@@ -322,6 +321,8 @@ setup_memory(void *kernel_end)
322 (hwrpb->mddt_offset + (unsigned long) hwrpb); 321 (hwrpb->mddt_offset + (unsigned long) hwrpb);
323 322
324 for_each_mem_cluster(memdesc, cluster, i) { 323 for_each_mem_cluster(memdesc, cluster, i) {
324 unsigned long end;
325
325 printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n", 326 printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n",
326 i, cluster->usage, cluster->start_pfn, 327 i, cluster->usage, cluster->start_pfn,
327 cluster->start_pfn + cluster->numpages); 328 cluster->start_pfn + cluster->numpages);
@@ -335,6 +336,9 @@ setup_memory(void *kernel_end)
335 end = cluster->start_pfn + cluster->numpages; 336 end = cluster->start_pfn + cluster->numpages;
336 if (end > max_low_pfn) 337 if (end > max_low_pfn)
337 max_low_pfn = end; 338 max_low_pfn = end;
339
340 memblock_add(PFN_PHYS(cluster->start_pfn),
341 cluster->numpages << PAGE_SHIFT);
338 } 342 }
339 343
340 /* 344 /*
@@ -363,87 +367,9 @@ setup_memory(void *kernel_end)
363 max_low_pfn = mem_size_limit; 367 max_low_pfn = mem_size_limit;
364 } 368 }
365 369
366 /* Find the bounds of kernel memory. */ 370 /* Reserve the kernel memory. */
367 start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); 371 kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS;
368 end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); 372 memblock_reserve(KERNEL_START_PHYS, kernel_size);
369 bootmap_start = -1;
370
371 try_again:
372 if (max_low_pfn <= end_kernel_pfn)
373 panic("not enough memory to boot");
374
375 /* We need to know how many physically contiguous pages
376 we'll need for the bootmap. */
377 bootmap_pages = bootmem_bootmap_pages(max_low_pfn);
378
379 /* Now find a good region where to allocate the bootmap. */
380 for_each_mem_cluster(memdesc, cluster, i) {
381 if (cluster->usage & 3)
382 continue;
383
384 start = cluster->start_pfn;
385 end = start + cluster->numpages;
386 if (start >= max_low_pfn)
387 continue;
388 if (end > max_low_pfn)
389 end = max_low_pfn;
390 if (start < start_kernel_pfn) {
391 if (end > end_kernel_pfn
392 && end - end_kernel_pfn >= bootmap_pages) {
393 bootmap_start = end_kernel_pfn;
394 break;
395 } else if (end > start_kernel_pfn)
396 end = start_kernel_pfn;
397 } else if (start < end_kernel_pfn)
398 start = end_kernel_pfn;
399 if (end - start >= bootmap_pages) {
400 bootmap_start = start;
401 break;
402 }
403 }
404
405 if (bootmap_start == ~0UL) {
406 max_low_pfn >>= 1;
407 goto try_again;
408 }
409
410 /* Allocate the bootmap and mark the whole MM as reserved. */
411 bootmap_size = init_bootmem(bootmap_start, max_low_pfn);
412
413 /* Mark the free regions. */
414 for_each_mem_cluster(memdesc, cluster, i) {
415 if (cluster->usage & 3)
416 continue;
417
418 start = cluster->start_pfn;
419 end = cluster->start_pfn + cluster->numpages;
420 if (start >= max_low_pfn)
421 continue;
422 if (end > max_low_pfn)
423 end = max_low_pfn;
424 if (start < start_kernel_pfn) {
425 if (end > end_kernel_pfn) {
426 free_bootmem(PFN_PHYS(start),
427 (PFN_PHYS(start_kernel_pfn)
428 - PFN_PHYS(start)));
429 printk("freeing pages %ld:%ld\n",
430 start, start_kernel_pfn);
431 start = end_kernel_pfn;
432 } else if (end > start_kernel_pfn)
433 end = start_kernel_pfn;
434 } else if (start < end_kernel_pfn)
435 start = end_kernel_pfn;
436 if (start >= end)
437 continue;
438
439 free_bootmem(PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
440 printk("freeing pages %ld:%ld\n", start, end);
441 }
442
443 /* Reserve the bootmap memory. */
444 reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size,
445 BOOTMEM_DEFAULT);
446 printk("reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
447 373
448#ifdef CONFIG_BLK_DEV_INITRD 374#ifdef CONFIG_BLK_DEV_INITRD
449 initrd_start = INITRD_START; 375 initrd_start = INITRD_START;
@@ -459,8 +385,8 @@ setup_memory(void *kernel_end)
459 initrd_end, 385 initrd_end,
460 phys_to_virt(PFN_PHYS(max_low_pfn))); 386 phys_to_virt(PFN_PHYS(max_low_pfn)));
461 } else { 387 } else {
462 reserve_bootmem(virt_to_phys((void *)initrd_start), 388 memblock_reserve(virt_to_phys((void *)initrd_start),
463 INITRD_SIZE, BOOTMEM_DEFAULT); 389 INITRD_SIZE);
464 } 390 }
465 } 391 }
466#endif /* CONFIG_BLK_DEV_INITRD */ 392#endif /* CONFIG_BLK_DEV_INITRD */
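
The alpha conversion above, like the hexagon, nios2, um and unicore32 changes
later in this merge, follows the same bootmem-to-memblock pattern: register the
usable RAM with memblock_add() and mark the kernel image (and initrd, where
present) with memblock_reserve(), instead of sizing, placing and freeing a
bootmem bitmap. An architecture-neutral sketch of that pattern is shown below;
the names are placeholders, not taken verbatim from any one architecture.

/*
 * Architecture-neutral sketch of the bootmem -> memblock pattern used by
 * the conversions in this merge.  ram_base, ram_size and kernel_end are
 * placeholders.
 */
#include <linux/init.h>
#include <linux/memblock.h>
#include <asm/sections.h>

static void __init example_setup_memory(phys_addr_t ram_base,
					 phys_addr_t ram_size,
					 void *kernel_end)
{
	/* tell memblock which physical ranges exist ... */
	memblock_add(ram_base, ram_size);

	/* ... and which of them must never be handed out */
	memblock_reserve(__pa_symbol(_text),
			 (unsigned long)kernel_end - (unsigned long)_text);

	/* no bootmem bitmap left to size, place, populate and free */
}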
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index a9e86475f169..26cd925d19b1 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/memblock.h>
14#include <linux/swap.h> 15#include <linux/swap.h>
15#include <linux/initrd.h> 16#include <linux/initrd.h>
16#include <linux/pfn.h> 17#include <linux/pfn.h>
@@ -59,12 +60,10 @@ setup_memory_node(int nid, void *kernel_end)
59 struct memclust_struct * cluster; 60 struct memclust_struct * cluster;
60 struct memdesc_struct * memdesc; 61 struct memdesc_struct * memdesc;
61 unsigned long start_kernel_pfn, end_kernel_pfn; 62 unsigned long start_kernel_pfn, end_kernel_pfn;
62 unsigned long bootmap_size, bootmap_pages, bootmap_start;
63 unsigned long start, end; 63 unsigned long start, end;
64 unsigned long node_pfn_start, node_pfn_end; 64 unsigned long node_pfn_start, node_pfn_end;
65 unsigned long node_min_pfn, node_max_pfn; 65 unsigned long node_min_pfn, node_max_pfn;
66 int i; 66 int i;
67 unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
68 int show_init = 0; 67 int show_init = 0;
69 68
70 /* Find the bounds of current node */ 69 /* Find the bounds of current node */
@@ -134,24 +133,14 @@ setup_memory_node(int nid, void *kernel_end)
134 /* Cute trick to make sure our local node data is on local memory */ 133 /* Cute trick to make sure our local node data is on local memory */
135 node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); 134 node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
136#endif 135#endif
137 /* Quasi-mark the pg_data_t as in-use */
138 node_min_pfn += node_datasz;
139 if (node_min_pfn >= node_max_pfn) {
140 printk(" not enough mem to reserve NODE_DATA");
141 return;
142 }
143 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
144
145 printk(" Detected node memory: start %8lu, end %8lu\n", 136 printk(" Detected node memory: start %8lu, end %8lu\n",
146 node_min_pfn, node_max_pfn); 137 node_min_pfn, node_max_pfn);
147 138
148 DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); 139 DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid));
149 DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);
150 140
151 /* Find the bounds of kernel memory. */ 141 /* Find the bounds of kernel memory. */
152 start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); 142 start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
153 end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); 143 end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
154 bootmap_start = -1;
155 144
156 if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) 145 if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
157 panic("kernel loaded out of ram"); 146 panic("kernel loaded out of ram");
@@ -161,89 +150,11 @@ setup_memory_node(int nid, void *kernel_end)
161 has much larger alignment than 8Mb, so it's safe. */ 150 has much larger alignment than 8Mb, so it's safe. */
162 node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); 151 node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);
163 152
164 /* We need to know how many physically contiguous pages 153 memblock_add(PFN_PHYS(node_min_pfn),
165 we'll need for the bootmap. */ 154 (node_max_pfn - node_min_pfn) << PAGE_SHIFT);
166 bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);
167
168 /* Now find a good region where to allocate the bootmap. */
169 for_each_mem_cluster(memdesc, cluster, i) {
170 if (cluster->usage & 3)
171 continue;
172
173 start = cluster->start_pfn;
174 end = start + cluster->numpages;
175
176 if (start >= node_max_pfn || end <= node_min_pfn)
177 continue;
178
179 if (end > node_max_pfn)
180 end = node_max_pfn;
181 if (start < node_min_pfn)
182 start = node_min_pfn;
183
184 if (start < start_kernel_pfn) {
185 if (end > end_kernel_pfn
186 && end - end_kernel_pfn >= bootmap_pages) {
187 bootmap_start = end_kernel_pfn;
188 break;
189 } else if (end > start_kernel_pfn)
190 end = start_kernel_pfn;
191 } else if (start < end_kernel_pfn)
192 start = end_kernel_pfn;
193 if (end - start >= bootmap_pages) {
194 bootmap_start = start;
195 break;
196 }
197 }
198
199 if (bootmap_start == -1)
200 panic("couldn't find a contiguous place for the bootmap");
201
202 /* Allocate the bootmap and mark the whole MM as reserved. */
203 bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
204 node_min_pfn, node_max_pfn);
205 DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
206 bootmap_start, bootmap_size, bootmap_pages);
207 155
208 /* Mark the free regions. */ 156 NODE_DATA(nid)->node_start_pfn = node_min_pfn;
209 for_each_mem_cluster(memdesc, cluster, i) { 157 NODE_DATA(nid)->node_present_pages = node_max_pfn - node_min_pfn;
210 if (cluster->usage & 3)
211 continue;
212
213 start = cluster->start_pfn;
214 end = cluster->start_pfn + cluster->numpages;
215
216 if (start >= node_max_pfn || end <= node_min_pfn)
217 continue;
218
219 if (end > node_max_pfn)
220 end = node_max_pfn;
221 if (start < node_min_pfn)
222 start = node_min_pfn;
223
224 if (start < start_kernel_pfn) {
225 if (end > end_kernel_pfn) {
226 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
227 (PFN_PHYS(start_kernel_pfn)
228 - PFN_PHYS(start)));
229 printk(" freeing pages %ld:%ld\n",
230 start, start_kernel_pfn);
231 start = end_kernel_pfn;
232 } else if (end > start_kernel_pfn)
233 end = start_kernel_pfn;
234 } else if (start < end_kernel_pfn)
235 start = end_kernel_pfn;
236 if (start >= end)
237 continue;
238
239 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
240 printk(" freeing pages %ld:%ld\n", start, end);
241 }
242
243 /* Reserve the bootmap memory. */
244 reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start),
245 bootmap_size, BOOTMEM_DEFAULT);
246 printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
247 158
248 node_set_online(nid); 159 node_set_online(nid);
249} 160}
@@ -251,6 +162,7 @@ setup_memory_node(int nid, void *kernel_end)
251void __init 162void __init
252setup_memory(void *kernel_end) 163setup_memory(void *kernel_end)
253{ 164{
165 unsigned long kernel_size;
254 int nid; 166 int nid;
255 167
256 show_mem_layout(); 168 show_mem_layout();
@@ -262,6 +174,9 @@ setup_memory(void *kernel_end)
262 for (nid = 0; nid < MAX_NUMNODES; nid++) 174 for (nid = 0; nid < MAX_NUMNODES; nid++)
263 setup_memory_node(nid, kernel_end); 175 setup_memory_node(nid, kernel_end);
264 176
177 kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS;
178 memblock_reserve(KERNEL_START_PHYS, kernel_size);
179
265#ifdef CONFIG_BLK_DEV_INITRD 180#ifdef CONFIG_BLK_DEV_INITRD
266 initrd_start = INITRD_START; 181 initrd_start = INITRD_START;
267 if (initrd_start) { 182 if (initrd_start) {
@@ -279,9 +194,8 @@ setup_memory(void *kernel_end)
279 phys_to_virt(PFN_PHYS(max_low_pfn))); 194 phys_to_virt(PFN_PHYS(max_low_pfn)));
280 } else { 195 } else {
281 nid = kvaddr_to_nid(initrd_start); 196 nid = kvaddr_to_nid(initrd_start);
282 reserve_bootmem_node(NODE_DATA(nid), 197 memblock_reserve(virt_to_phys((void *)initrd_start),
283 virt_to_phys((void *)initrd_start), 198 INITRD_SIZE);
284 INITRD_SIZE, BOOTMEM_DEFAULT);
285 } 199 }
286 } 200 }
287#endif /* CONFIG_BLK_DEV_INITRD */ 201#endif /* CONFIG_BLK_DEV_INITRD */
@@ -303,9 +217,8 @@ void __init paging_init(void)
303 dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 217 dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
304 218
305 for_each_online_node(nid) { 219 for_each_online_node(nid) {
306 bootmem_data_t *bdata = &bootmem_node_data[nid]; 220 unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
307 unsigned long start_pfn = bdata->node_min_pfn; 221 unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages;
308 unsigned long end_pfn = bdata->node_low_pfn;
309 222
310 if (dma_local_pfn >= end_pfn - start_pfn) 223 if (dma_local_pfn >= end_pfn - start_pfn)
311 zones_size[ZONE_DMA] = end_pfn - start_pfn; 224 zones_size[ZONE_DMA] = end_pfn - start_pfn;
diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h
index d4014fbe5ea3..0d9f3918fa7e 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -29,6 +29,7 @@
29 * ptes. 29 * ptes.
30 * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes). 30 * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
31 */ 31 */
32#define __HAVE_ARCH_HUGE_PTEP_GET
32static inline pte_t huge_ptep_get(pte_t *ptep) 33static inline pte_t huge_ptep_get(pte_t *ptep)
33{ 34{
34 pte_t retval = *ptep; 35 pte_t retval = *ptep;
@@ -37,35 +38,4 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
37 return retval; 38 return retval;
38} 39}
39 40
40static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
41 pte_t *ptep, pte_t pte)
42{
43 set_pte_at(mm, addr, ptep, pte);
44}
45
46static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
47 unsigned long addr, pte_t *ptep)
48{
49 ptep_clear_flush(vma, addr, ptep);
50}
51
52static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
53 unsigned long addr, pte_t *ptep)
54{
55 ptep_set_wrprotect(mm, addr, ptep);
56}
57
58static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
59 unsigned long addr, pte_t *ptep)
60{
61 return ptep_get_and_clear(mm, addr, ptep);
62}
63
64static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
65 unsigned long addr, pte_t *ptep,
66 pte_t pte, int dirty)
67{
68 return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
69}
70
71#endif /* _ASM_ARM_HUGETLB_3LEVEL_H */ 41#endif /* _ASM_ARM_HUGETLB_3LEVEL_H */
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 7d26f6c4f0f5..b67256c22b08 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -23,18 +23,8 @@
23#define _ASM_ARM_HUGETLB_H 23#define _ASM_ARM_HUGETLB_H
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm-generic/hugetlb.h>
27
28#include <asm/hugetlb-3level.h> 26#include <asm/hugetlb-3level.h>
29 27#include <asm-generic/hugetlb.h>
30static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
31 unsigned long addr, unsigned long end,
32 unsigned long floor,
33 unsigned long ceiling)
34{
35 free_pgd_range(tlb, addr, end, floor, ceiling);
36}
37
38 28
39static inline int is_hugepage_only_range(struct mm_struct *mm, 29static inline int is_hugepage_only_range(struct mm_struct *mm,
40 unsigned long addr, unsigned long len) 30 unsigned long addr, unsigned long len)
@@ -42,27 +32,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
42 return 0; 32 return 0;
43} 33}
44 34
45static inline int prepare_hugepage_range(struct file *file,
46 unsigned long addr, unsigned long len)
47{
48 struct hstate *h = hstate_file(file);
49 if (len & ~huge_page_mask(h))
50 return -EINVAL;
51 if (addr & ~huge_page_mask(h))
52 return -EINVAL;
53 return 0;
54}
55
56static inline int huge_pte_none(pte_t pte)
57{
58 return pte_none(pte);
59}
60
61static inline pte_t huge_pte_wrprotect(pte_t pte)
62{
63 return pte_wrprotect(pte);
64}
65
66static inline void arch_clear_hugepage_flags(struct page *page) 35static inline void arch_clear_hugepage_flags(struct page *page)
67{ 36{
68 clear_bit(PG_dcache_clean, &page->flags); 37 clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index e73f68569624..fb6609875455 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -20,48 +20,18 @@
20 20
21#include <asm/page.h> 21#include <asm/page.h>
22 22
23#define __HAVE_ARCH_HUGE_PTEP_GET
23static inline pte_t huge_ptep_get(pte_t *ptep) 24static inline pte_t huge_ptep_get(pte_t *ptep)
24{ 25{
25 return READ_ONCE(*ptep); 26 return READ_ONCE(*ptep);
26} 27}
27 28
28
29
30static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
31 unsigned long addr, unsigned long end,
32 unsigned long floor,
33 unsigned long ceiling)
34{
35 free_pgd_range(tlb, addr, end, floor, ceiling);
36}
37
38static inline int is_hugepage_only_range(struct mm_struct *mm, 29static inline int is_hugepage_only_range(struct mm_struct *mm,
39 unsigned long addr, unsigned long len) 30 unsigned long addr, unsigned long len)
40{ 31{
41 return 0; 32 return 0;
42} 33}
43 34
44static inline int prepare_hugepage_range(struct file *file,
45 unsigned long addr, unsigned long len)
46{
47 struct hstate *h = hstate_file(file);
48 if (len & ~huge_page_mask(h))
49 return -EINVAL;
50 if (addr & ~huge_page_mask(h))
51 return -EINVAL;
52 return 0;
53}
54
55static inline int huge_pte_none(pte_t pte)
56{
57 return pte_none(pte);
58}
59
60static inline pte_t huge_pte_wrprotect(pte_t pte)
61{
62 return pte_wrprotect(pte);
63}
64
65static inline void arch_clear_hugepage_flags(struct page *page) 35static inline void arch_clear_hugepage_flags(struct page *page)
66{ 36{
67 clear_bit(PG_dcache_clean, &page->flags); 37 clear_bit(PG_dcache_clean, &page->flags);
@@ -70,20 +40,25 @@ static inline void arch_clear_hugepage_flags(struct page *page)
70extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, 40extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
71 struct page *page, int writable); 41 struct page *page, int writable);
72#define arch_make_huge_pte arch_make_huge_pte 42#define arch_make_huge_pte arch_make_huge_pte
43#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
73extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 44extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
74 pte_t *ptep, pte_t pte); 45 pte_t *ptep, pte_t pte);
46#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
75extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, 47extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
76 unsigned long addr, pte_t *ptep, 48 unsigned long addr, pte_t *ptep,
77 pte_t pte, int dirty); 49 pte_t pte, int dirty);
50#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
78extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, 51extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
79 unsigned long addr, pte_t *ptep); 52 unsigned long addr, pte_t *ptep);
53#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
80extern void huge_ptep_set_wrprotect(struct mm_struct *mm, 54extern void huge_ptep_set_wrprotect(struct mm_struct *mm,
81 unsigned long addr, pte_t *ptep); 55 unsigned long addr, pte_t *ptep);
56#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
82extern void huge_ptep_clear_flush(struct vm_area_struct *vma, 57extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
83 unsigned long addr, pte_t *ptep); 58 unsigned long addr, pte_t *ptep);
59#define __HAVE_ARCH_HUGE_PTE_CLEAR
84extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, 60extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
85 pte_t *ptep, unsigned long sz); 61 pte_t *ptep, unsigned long sz);
86#define huge_pte_clear huge_pte_clear
87extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, 62extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
88 pte_t *ptep, pte_t pte, unsigned long sz); 63 pte_t *ptep, pte_t pte, unsigned long sz);
89#define set_huge_swap_pte_at set_huge_swap_pte_at 64#define set_huge_swap_pte_at set_huge_swap_pte_at
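
The arm and arm64 hugetlb changes above illustrate the theme of this part of
the series: include/asm-generic/hugetlb.h (88 new lines in the diffstat) now
carries default implementations, and an architecture keeps only the hooks it
genuinely overrides, announcing each one with a __HAVE_ARCH_HUGE_* define
before including the generic header. A rough sketch of that guarded fallback
is shown below; the exact body of the generic header is not part of this
excerpt, so treat its shape as an assumption.

/*
 * Rough sketch of the override pattern (assumed shape of the generic
 * header, which is not included in this excerpt).
 *
 * Arch header, e.g. arch/arm64/include/asm/hugetlb.h:
 */
#define __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
	return READ_ONCE(*ptep);		/* arch-specific variant */
}
#include <asm-generic/hugetlb.h>

/*
 * include/asm-generic/hugetlb.h then only fills the gaps, roughly:
 */
#ifndef __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
	return *ptep;				/* plain default */
}
#endif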
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33a5bd5..03a6c256b7ec 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -16,6 +16,7 @@
16#ifndef __ASM_STRING_H 16#ifndef __ASM_STRING_H
17#define __ASM_STRING_H 17#define __ASM_STRING_H
18 18
19#ifndef CONFIG_KASAN
19#define __HAVE_ARCH_STRRCHR 20#define __HAVE_ARCH_STRRCHR
20extern char *strrchr(const char *, int c); 21extern char *strrchr(const char *, int c);
21 22
@@ -34,6 +35,13 @@ extern __kernel_size_t strlen(const char *);
34#define __HAVE_ARCH_STRNLEN 35#define __HAVE_ARCH_STRNLEN
35extern __kernel_size_t strnlen(const char *, __kernel_size_t); 36extern __kernel_size_t strnlen(const char *, __kernel_size_t);
36 37
38#define __HAVE_ARCH_MEMCMP
39extern int memcmp(const void *, const void *, size_t);
40
41#define __HAVE_ARCH_MEMCHR
42extern void *memchr(const void *, int, __kernel_size_t);
43#endif
44
37#define __HAVE_ARCH_MEMCPY 45#define __HAVE_ARCH_MEMCPY
38extern void *memcpy(void *, const void *, __kernel_size_t); 46extern void *memcpy(void *, const void *, __kernel_size_t);
39extern void *__memcpy(void *, const void *, __kernel_size_t); 47extern void *__memcpy(void *, const void *, __kernel_size_t);
@@ -42,16 +50,10 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
42extern void *memmove(void *, const void *, __kernel_size_t); 50extern void *memmove(void *, const void *, __kernel_size_t);
43extern void *__memmove(void *, const void *, __kernel_size_t); 51extern void *__memmove(void *, const void *, __kernel_size_t);
44 52
45#define __HAVE_ARCH_MEMCHR
46extern void *memchr(const void *, int, __kernel_size_t);
47
48#define __HAVE_ARCH_MEMSET 53#define __HAVE_ARCH_MEMSET
49extern void *memset(void *, int, __kernel_size_t); 54extern void *memset(void *, int, __kernel_size_t);
50extern void *__memset(void *, int, __kernel_size_t); 55extern void *__memset(void *, int, __kernel_size_t);
51 56
52#define __HAVE_ARCH_MEMCMP
53extern int memcmp(const void *, const void *, size_t);
54
55#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE 57#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
56#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 58#define __HAVE_ARCH_MEMCPY_FLUSHCACHE
57void memcpy_flushcache(void *dst, const void *src, size_t cnt); 59void memcpy_flushcache(void *dst, const void *src, size_t cnt);
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20b70b2..72f63a59b008 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,20 +44,23 @@ EXPORT_SYMBOL(__arch_copy_in_user);
44EXPORT_SYMBOL(memstart_addr); 44EXPORT_SYMBOL(memstart_addr);
45 45
46 /* string / mem functions */ 46 /* string / mem functions */
47#ifndef CONFIG_KASAN
47EXPORT_SYMBOL(strchr); 48EXPORT_SYMBOL(strchr);
48EXPORT_SYMBOL(strrchr); 49EXPORT_SYMBOL(strrchr);
49EXPORT_SYMBOL(strcmp); 50EXPORT_SYMBOL(strcmp);
50EXPORT_SYMBOL(strncmp); 51EXPORT_SYMBOL(strncmp);
51EXPORT_SYMBOL(strlen); 52EXPORT_SYMBOL(strlen);
52EXPORT_SYMBOL(strnlen); 53EXPORT_SYMBOL(strnlen);
54EXPORT_SYMBOL(memcmp);
55EXPORT_SYMBOL(memchr);
56#endif
57
53EXPORT_SYMBOL(memset); 58EXPORT_SYMBOL(memset);
54EXPORT_SYMBOL(memcpy); 59EXPORT_SYMBOL(memcpy);
55EXPORT_SYMBOL(memmove); 60EXPORT_SYMBOL(memmove);
56EXPORT_SYMBOL(__memset); 61EXPORT_SYMBOL(__memset);
57EXPORT_SYMBOL(__memcpy); 62EXPORT_SYMBOL(__memcpy);
58EXPORT_SYMBOL(__memmove); 63EXPORT_SYMBOL(__memmove);
59EXPORT_SYMBOL(memchr);
60EXPORT_SYMBOL(memcmp);
61 64
62 /* atomic bitops */ 65 /* atomic bitops */
63EXPORT_SYMBOL(set_bit); 66EXPORT_SYMBOL(set_bit);
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index 4444c1d25f4b..0f164a4baf52 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -30,7 +30,7 @@
30 * Returns: 30 * Returns:
31 * x0 - address of first occurrence of 'c' or 0 31 * x0 - address of first occurrence of 'c' or 0
32 */ 32 */
33ENTRY(memchr) 33WEAK(memchr)
34 and w1, w1, #0xff 34 and w1, w1, #0xff
351: subs x2, x2, #1 351: subs x2, x2, #1
36 b.mi 2f 36 b.mi 2f
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index 2a4e239bd17a..fb295f52e9f8 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -58,7 +58,7 @@ pos .req x11
58limit_wd .req x12 58limit_wd .req x12
59mask .req x13 59mask .req x13
60 60
61ENTRY(memcmp) 61WEAK(memcmp)
62 cbz limit, .Lret0 62 cbz limit, .Lret0
63 eor tmp1, src1, src2 63 eor tmp1, src1, src2
64 tst tmp1, #7 64 tst tmp1, #7
diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S
index dae0cf5591f9..7c83091d1bcd 100644
--- a/arch/arm64/lib/strchr.S
+++ b/arch/arm64/lib/strchr.S
@@ -29,7 +29,7 @@
29 * Returns: 29 * Returns:
30 * x0 - address of first occurrence of 'c' or 0 30 * x0 - address of first occurrence of 'c' or 0
31 */ 31 */
32ENTRY(strchr) 32WEAK(strchr)
33 and w1, w1, #0xff 33 and w1, w1, #0xff
341: ldrb w2, [x0], #1 341: ldrb w2, [x0], #1
35 cmp w2, w1 35 cmp w2, w1
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61760ef..7d5d15398bfb 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,7 +60,7 @@ tmp3 .req x9
60zeroones .req x10 60zeroones .req x10
61pos .req x11 61pos .req x11
62 62
63ENTRY(strcmp) 63WEAK(strcmp)
64 eor tmp1, src1, src2 64 eor tmp1, src1, src2
65 mov zeroones, #REP8_01 65 mov zeroones, #REP8_01
66 tst tmp1, #7 66 tst tmp1, #7
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 55ccc8e24c08..8e0b14205dcb 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -56,7 +56,7 @@ pos .req x12
56#define REP8_7f 0x7f7f7f7f7f7f7f7f 56#define REP8_7f 0x7f7f7f7f7f7f7f7f
57#define REP8_80 0x8080808080808080 57#define REP8_80 0x8080808080808080
58 58
59ENTRY(strlen) 59WEAK(strlen)
60 mov zeroones, #REP8_01 60 mov zeroones, #REP8_01
61 bic src, srcin, #15 61 bic src, srcin, #15
62 ands tmp1, srcin, #15 62 ands tmp1, srcin, #15
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044761c6..66bd145935d9 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,7 +64,7 @@ limit_wd .req x13
64mask .req x14 64mask .req x14
65endloop .req x15 65endloop .req x15
66 66
67ENTRY(strncmp) 67WEAK(strncmp)
68 cbz limit, .Lret0 68 cbz limit, .Lret0
69 eor tmp1, src1, src2 69 eor tmp1, src1, src2
70 mov zeroones, #REP8_01 70 mov zeroones, #REP8_01
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
index eae38da6e0bb..355be04441fe 100644
--- a/arch/arm64/lib/strnlen.S
+++ b/arch/arm64/lib/strnlen.S
@@ -59,7 +59,7 @@ limit_wd .req x14
59#define REP8_7f 0x7f7f7f7f7f7f7f7f 59#define REP8_7f 0x7f7f7f7f7f7f7f7f
60#define REP8_80 0x8080808080808080 60#define REP8_80 0x8080808080808080
61 61
62ENTRY(strnlen) 62WEAK(strnlen)
63 cbz limit, .Lhit_limit 63 cbz limit, .Lhit_limit
64 mov zeroones, #REP8_01 64 mov zeroones, #REP8_01
65 bic src, srcin, #15 65 bic src, srcin, #15
diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S
index f8e2784d5752..ea84924d5990 100644
--- a/arch/arm64/lib/strrchr.S
+++ b/arch/arm64/lib/strrchr.S
@@ -29,7 +29,7 @@
29 * Returns: 29 * Returns:
30 * x0 - address of last occurrence of 'c' or 0 30 * x0 - address of last occurrence of 'c' or 0
31 */ 31 */
32ENTRY(strrchr) 32WEAK(strrchr)
33 mov x3, #0 33 mov x3, #0
34 and w1, w1, #0xff 34 and w1, w1, #0xff
351: ldrb w2, [x0], #1 351: ldrb w2, [x0], #1
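
The ENTRY()-to-WEAK() switch in the arm64 string routines above, together with
the #ifndef CONFIG_KASAN guards added to asm/string.h and arm64ksyms.c, lets
the generic, KASAN-instrumented C string functions take precedence when KASAN
is enabled while keeping the optimized assembly as the default. The C-level
equivalent of that weak/strong override is sketched below purely as an
illustration; "my_memcmp" is a stand-in name, and this is not the kernel's
actual WEAK() macro.

/*
 * Illustration only: the weak/strong override that WEAK() provides for
 * the assembly routines, expressed with GCC attributes in plain C.
 */
#include <stddef.h>

/* weak default -- plays the role of the optimized assembly version */
__attribute__((weak)) int my_memcmp(const void *a, const void *b, size_t n)
{
	const unsigned char *p = a, *q = b;

	while (n--) {
		if (*p != *q)
			return *p - *q;
		p++;
		q++;
	}
	return 0;
}

/*
 * A strong definition of my_memcmp linked in elsewhere (for example an
 * instrumented one) silently replaces the weak default at link time.
 */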
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 3ef46522e89f..7b25d7c8fa49 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -21,6 +21,9 @@ config HEXAGON
21 select GENERIC_IRQ_SHOW 21 select GENERIC_IRQ_SHOW
22 select HAVE_ARCH_KGDB 22 select HAVE_ARCH_KGDB
23 select HAVE_ARCH_TRACEHOOK 23 select HAVE_ARCH_TRACEHOOK
24 select HAVE_MEMBLOCK
25 select ARCH_DISCARD_MEMBLOCK
26 select NO_BOOTMEM
24 select NEED_SG_DMA_LENGTH 27 select NEED_SG_DMA_LENGTH
25 select NO_IOPORT_MAP 28 select NO_IOPORT_MAP
26 select GENERIC_IOMAP 29 select GENERIC_IOMAP
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 1495d45e472d..d789b9cc0189 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <asm/atomic.h> 25#include <asm/atomic.h>
25#include <linux/highmem.h> 26#include <linux/highmem.h>
26#include <asm/tlb.h> 27#include <asm/tlb.h>
@@ -176,7 +177,6 @@ size_t hexagon_coherent_pool_size = (size_t) (DMA_RESERVE << 22);
176 177
177void __init setup_arch_memory(void) 178void __init setup_arch_memory(void)
178{ 179{
179 int bootmap_size;
180 /* XXX Todo: this probably should be cleaned up */ 180 /* XXX Todo: this probably should be cleaned up */
181 u32 *segtable = (u32 *) &swapper_pg_dir[0]; 181 u32 *segtable = (u32 *) &swapper_pg_dir[0];
182 u32 *segtable_end; 182 u32 *segtable_end;
@@ -195,18 +195,22 @@ void __init setup_arch_memory(void)
195 bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) & 195 bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) &
196 ~((BIG_KERNEL_PAGE_SIZE) - 1)); 196 ~((BIG_KERNEL_PAGE_SIZE) - 1));
197 197
198 memblock_add(PHYS_OFFSET,
199 (bootmem_lastpg - ARCH_PFN_OFFSET) << PAGE_SHIFT);
200
201 /* Reserve kernel text/data/bss */
202 memblock_reserve(PHYS_OFFSET,
203 (bootmem_startpg - ARCH_PFN_OFFSET) << PAGE_SHIFT);
198 /* 204 /*
199 * Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached) 205 * Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached)
200 * memory allocation 206 * memory allocation
201 */ 207 */
202
203 max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES); 208 max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES);
204 min_low_pfn = ARCH_PFN_OFFSET; 209 min_low_pfn = ARCH_PFN_OFFSET;
205 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmem_startpg, min_low_pfn, max_low_pfn); 210 memblock_reserve(PFN_PHYS(max_low_pfn), DMA_RESERVED_BYTES);
206 211
207 printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg); 212 printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg);
208 printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg); 213 printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg);
209 printk(KERN_INFO "bootmap_size: %d\n", bootmap_size);
210 printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn); 214 printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn);
211 printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn); 215 printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn);
212 216
@@ -257,14 +261,6 @@ void __init setup_arch_memory(void)
257#endif 261#endif
258 262
259 /* 263 /*
260 * Free all the memory that wasn't taken up by the bootmap, the DMA
261 * reserve, or kernel itself.
262 */
263 free_bootmem(PFN_PHYS(bootmem_startpg) + bootmap_size,
264 PFN_PHYS(bootmem_lastpg - bootmem_startpg) - bootmap_size -
265 DMA_RESERVED_BYTES);
266
267 /*
268 * The bootmem allocator seemingly just lives to feed memory 264 * The bootmem allocator seemingly just lives to feed memory
269 * to the paging system 265 * to the paging system
270 */ 266 */
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index 74d2a5540aaf..36cc0396b214 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -3,13 +3,13 @@
3#define _ASM_IA64_HUGETLB_H 3#define _ASM_IA64_HUGETLB_H
4 4
5#include <asm/page.h> 5#include <asm/page.h>
6#include <asm-generic/hugetlb.h>
7
8 6
7#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
9void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 8void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
10 unsigned long end, unsigned long floor, 9 unsigned long end, unsigned long floor,
11 unsigned long ceiling); 10 unsigned long ceiling);
12 11
12#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
13int prepare_hugepage_range(struct file *file, 13int prepare_hugepage_range(struct file *file,
14 unsigned long addr, unsigned long len); 14 unsigned long addr, unsigned long len);
15 15
@@ -21,53 +21,16 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
21 REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); 21 REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
22} 22}
23 23
24static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 24#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
25 pte_t *ptep, pte_t pte)
26{
27 set_pte_at(mm, addr, ptep, pte);
28}
29
30static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
31 unsigned long addr, pte_t *ptep)
32{
33 return ptep_get_and_clear(mm, addr, ptep);
34}
35
36static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, 25static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
37 unsigned long addr, pte_t *ptep) 26 unsigned long addr, pte_t *ptep)
38{ 27{
39} 28}
40 29
41static inline int huge_pte_none(pte_t pte)
42{
43 return pte_none(pte);
44}
45
46static inline pte_t huge_pte_wrprotect(pte_t pte)
47{
48 return pte_wrprotect(pte);
49}
50
51static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
52 unsigned long addr, pte_t *ptep)
53{
54 ptep_set_wrprotect(mm, addr, ptep);
55}
56
57static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
58 unsigned long addr, pte_t *ptep,
59 pte_t pte, int dirty)
60{
61 return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
62}
63
64static inline pte_t huge_ptep_get(pte_t *ptep)
65{
66 return *ptep;
67}
68
69static inline void arch_clear_hugepage_flags(struct page *page) 30static inline void arch_clear_hugepage_flags(struct page *page)
70{ 31{
71} 32}
72 33
34#include <asm-generic/hugetlb.h>
35
73#endif /* _ASM_IA64_HUGETLB_H */ 36#endif /* _ASM_IA64_HUGETLB_H */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 165827774bea..b1e7468eb65a 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -544,7 +544,6 @@ extern struct page *zero_page_memmap_ptr;
544 544
545# ifdef CONFIG_VIRTUAL_MEM_MAP 545# ifdef CONFIG_VIRTUAL_MEM_MAP
546 /* arch mem_map init routine is needed due to holes in a virtual mem_map */ 546 /* arch mem_map init routine is needed due to holes in a virtual mem_map */
547# define __HAVE_ARCH_MEMMAP_INIT
548 extern void memmap_init (unsigned long size, int nid, unsigned long zone, 547 extern void memmap_init (unsigned long size, int nid, unsigned long zone,
549 unsigned long start_pfn); 548 unsigned long start_pfn);
550# endif /* CONFIG_VIRTUAL_MEM_MAP */ 549# endif /* CONFIG_VIRTUAL_MEM_MAP */
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index 982bc0685330..425bb6fc3bda 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -10,8 +10,6 @@
10#define __ASM_HUGETLB_H 10#define __ASM_HUGETLB_H
11 11
12#include <asm/page.h> 12#include <asm/page.h>
13#include <asm-generic/hugetlb.h>
14
15 13
16static inline int is_hugepage_only_range(struct mm_struct *mm, 14static inline int is_hugepage_only_range(struct mm_struct *mm,
17 unsigned long addr, 15 unsigned long addr,
@@ -20,6 +18,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
20 return 0; 18 return 0;
21} 19}
22 20
21#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
23static inline int prepare_hugepage_range(struct file *file, 22static inline int prepare_hugepage_range(struct file *file,
24 unsigned long addr, 23 unsigned long addr,
25 unsigned long len) 24 unsigned long len)
@@ -38,21 +37,7 @@ static inline int prepare_hugepage_range(struct file *file,
38 return 0; 37 return 0;
39} 38}
40 39
41static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 40#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
42 unsigned long addr,
43 unsigned long end,
44 unsigned long floor,
45 unsigned long ceiling)
46{
47 free_pgd_range(tlb, addr, end, floor, ceiling);
48}
49
50static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
51 pte_t *ptep, pte_t pte)
52{
53 set_pte_at(mm, addr, ptep, pte);
54}
55
56static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, 41static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
57 unsigned long addr, pte_t *ptep) 42 unsigned long addr, pte_t *ptep)
58{ 43{
@@ -64,29 +49,21 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
64 return pte; 49 return pte;
65} 50}
66 51
52#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
67static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, 53static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
68 unsigned long addr, pte_t *ptep) 54 unsigned long addr, pte_t *ptep)
69{ 55{
70 flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma))); 56 flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma)));
71} 57}
72 58
59#define __HAVE_ARCH_HUGE_PTE_NONE
73static inline int huge_pte_none(pte_t pte) 60static inline int huge_pte_none(pte_t pte)
74{ 61{
75 unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL; 62 unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL;
76 return !val || (val == (unsigned long)invalid_pte_table); 63 return !val || (val == (unsigned long)invalid_pte_table);
77} 64}
78 65
79static inline pte_t huge_pte_wrprotect(pte_t pte) 66#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
80{
81 return pte_wrprotect(pte);
82}
83
84static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
85 unsigned long addr, pte_t *ptep)
86{
87 ptep_set_wrprotect(mm, addr, ptep);
88}
89
90static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, 67static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
91 unsigned long addr, 68 unsigned long addr,
92 pte_t *ptep, pte_t pte, 69 pte_t *ptep, pte_t pte,
@@ -105,13 +82,10 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
105 return changed; 82 return changed;
106} 83}
107 84
108static inline pte_t huge_ptep_get(pte_t *ptep)
109{
110 return *ptep;
111}
112
113static inline void arch_clear_hugepage_flags(struct page *page) 85static inline void arch_clear_hugepage_flags(struct page *page)
114{ 86{
115} 87}
116 88
89#include <asm-generic/hugetlb.h>
90
117#endif /* __ASM_HUGETLB_H */ 91#endif /* __ASM_HUGETLB_H */
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 03965692fbfe..2df0c57f2833 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -23,6 +23,9 @@ config NIOS2
23 select SPARSE_IRQ 23 select SPARSE_IRQ
24 select USB_ARCH_HAS_HCD if USB_SUPPORT 24 select USB_ARCH_HAS_HCD if USB_SUPPORT
25 select CPU_NO_EFFICIENT_FFS 25 select CPU_NO_EFFICIENT_FFS
26 select HAVE_MEMBLOCK
27 select ARCH_DISCARD_MEMBLOCK
28 select NO_BOOTMEM
26 29
27config GENERIC_CSUM 30config GENERIC_CSUM
28 def_bool y 31 def_bool y
diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c
index 8d7446a4b475..a6d4f7530247 100644
--- a/arch/nios2/kernel/prom.c
+++ b/arch/nios2/kernel/prom.c
@@ -32,23 +32,6 @@
32 32
33#include <asm/sections.h> 33#include <asm/sections.h>
34 34
35void __init early_init_dt_add_memory_arch(u64 base, u64 size)
36{
37 u64 kernel_start = (u64)virt_to_phys(_text);
38
39 if (!memory_size &&
40 (kernel_start >= base) && (kernel_start < (base + size)))
41 memory_size = size;
42
43}
44
45int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size,
46 bool nomap)
47{
48 reserve_bootmem(base, size, BOOTMEM_DEFAULT);
49 return 0;
50}
51
52void __init early_init_devtree(void *params) 35void __init early_init_devtree(void *params)
53{ 36{
54 __be32 *dtb = (u32 *)__dtb_start; 37 __be32 *dtb = (u32 *)__dtb_start;
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c
index 926a02b17b31..2d0011ddd4d5 100644
--- a/arch/nios2/kernel/setup.c
+++ b/arch/nios2/kernel/setup.c
@@ -17,6 +17,7 @@
17#include <linux/sched/task.h> 17#include <linux/sched/task.h>
18#include <linux/console.h> 18#include <linux/console.h>
19#include <linux/bootmem.h> 19#include <linux/bootmem.h>
20#include <linux/memblock.h>
20#include <linux/initrd.h> 21#include <linux/initrd.h>
21#include <linux/of_fdt.h> 22#include <linux/of_fdt.h>
22#include <linux/screen_info.h> 23#include <linux/screen_info.h>
@@ -143,10 +144,12 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6,
143 144
144void __init setup_arch(char **cmdline_p) 145void __init setup_arch(char **cmdline_p)
145{ 146{
146 int bootmap_size; 147 int dram_start;
147 148
148 console_verbose(); 149 console_verbose();
149 150
151 dram_start = memblock_start_of_DRAM();
152 memory_size = memblock_phys_mem_size();
150 memory_start = PAGE_ALIGN((unsigned long)__pa(_end)); 153 memory_start = PAGE_ALIGN((unsigned long)__pa(_end));
151 memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size; 154 memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size;
152 155
@@ -163,39 +166,11 @@ void __init setup_arch(char **cmdline_p)
163 max_low_pfn = PFN_DOWN(memory_end); 166 max_low_pfn = PFN_DOWN(memory_end);
164 max_mapnr = max_low_pfn; 167 max_mapnr = max_low_pfn;
165 168
166 /* 169 memblock_reserve(dram_start, memory_start - dram_start);
167 * give all the memory to the bootmap allocator, tell it to put the
168 * boot mem_map at the start of memory
169 */
170 pr_debug("init_bootmem_node(?,%#lx, %#x, %#lx)\n",
171 min_low_pfn, PFN_DOWN(PHYS_OFFSET), max_low_pfn);
172 bootmap_size = init_bootmem_node(NODE_DATA(0),
173 min_low_pfn, PFN_DOWN(PHYS_OFFSET),
174 max_low_pfn);
175
176 /*
177 * free the usable memory, we have to make sure we do not free
178 * the bootmem bitmap so we then reserve it after freeing it :-)
179 */
180 pr_debug("free_bootmem(%#lx, %#lx)\n",
181 memory_start, memory_end - memory_start);
182 free_bootmem(memory_start, memory_end - memory_start);
183
184 /*
185 * Reserve the bootmem bitmap itself as well. We do this in two
186 * steps (first step was init_bootmem()) because this catches
187 * the (very unlikely) case of us accidentally initializing the
188 * bootmem allocator with an invalid RAM area.
189 *
190 * Arguments are start, size
191 */
192 pr_debug("reserve_bootmem(%#lx, %#x)\n", memory_start, bootmap_size);
193 reserve_bootmem(memory_start, bootmap_size, BOOTMEM_DEFAULT);
194
195#ifdef CONFIG_BLK_DEV_INITRD 170#ifdef CONFIG_BLK_DEV_INITRD
196 if (initrd_start) { 171 if (initrd_start) {
197 reserve_bootmem(virt_to_phys((void *)initrd_start), 172 memblock_reserve(virt_to_phys((void *)initrd_start),
198 initrd_end - initrd_start, BOOTMEM_DEFAULT); 173 initrd_end - initrd_start);
199 } 174 }
200#endif /* CONFIG_BLK_DEV_INITRD */ 175#endif /* CONFIG_BLK_DEV_INITRD */
201 176
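
Condensed, the memblock flow that replaces the bootmem bitmap above amounts to two region lists: anything added to "memory" and not covered by "reserved" is later handed to the page allocator, so the bitmap allocation and the free_bootmem()/reserve_bootmem() pair disappear. A sketch of the resulting setup step (simplified from the nios2 code; the RAM range itself is registered earlier via memblock_add() by the device-tree core):

	phys_addr_t dram_start = memblock_start_of_DRAM();

	/* keep everything from the start of DRAM up to the end of the
	 * kernel image out of the page allocator */
	memblock_reserve(dram_start, __pa(_end) - dram_start);

#ifdef CONFIG_BLK_DEV_INITRD
	if (initrd_start)
		memblock_reserve(virt_to_phys((void *)initrd_start),
				 initrd_end - initrd_start);
#endif
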
diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h
index 58e0f4620426..7cb595dcb7d7 100644
--- a/arch/parisc/include/asm/hugetlb.h
+++ b/arch/parisc/include/asm/hugetlb.h
@@ -3,12 +3,12 @@
3#define _ASM_PARISC64_HUGETLB_H 3#define _ASM_PARISC64_HUGETLB_H
4 4
5#include <asm/page.h> 5#include <asm/page.h>
6#include <asm-generic/hugetlb.h>
7
8 6
7#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
9void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 8void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
10 pte_t *ptep, pte_t pte); 9 pte_t *ptep, pte_t pte);
11 10
11#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
12pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, 12pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
13 pte_t *ptep); 13 pte_t *ptep);
14 14
@@ -22,6 +22,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
22 * If the arch doesn't supply something else, assume that hugepage 22 * If the arch doesn't supply something else, assume that hugepage
23 * size aligned regions are ok without further preparation. 23 * size aligned regions are ok without further preparation.
24 */ 24 */
25#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
25static inline int prepare_hugepage_range(struct file *file, 26static inline int prepare_hugepage_range(struct file *file,
26 unsigned long addr, unsigned long len) 27 unsigned long addr, unsigned long len)
27{ 28{
@@ -32,43 +33,25 @@ static inline int prepare_hugepage_range(struct file *file,
32 return 0; 33 return 0;
33} 34}
34 35
35static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 36#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
36 unsigned long addr, unsigned long end,
37 unsigned long floor,
38 unsigned long ceiling)
39{
40 free_pgd_range(tlb, addr, end, floor, ceiling);
41}
42
43static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, 37static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
44 unsigned long addr, pte_t *ptep) 38 unsigned long addr, pte_t *ptep)
45{ 39{
46} 40}
47 41
48static inline int huge_pte_none(pte_t pte) 42#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
49{
50 return pte_none(pte);
51}
52
53static inline pte_t huge_pte_wrprotect(pte_t pte)
54{
55 return pte_wrprotect(pte);
56}
57
58void huge_ptep_set_wrprotect(struct mm_struct *mm, 43void huge_ptep_set_wrprotect(struct mm_struct *mm,
59 unsigned long addr, pte_t *ptep); 44 unsigned long addr, pte_t *ptep);
60 45
46#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
61int huge_ptep_set_access_flags(struct vm_area_struct *vma, 47int huge_ptep_set_access_flags(struct vm_area_struct *vma,
62 unsigned long addr, pte_t *ptep, 48 unsigned long addr, pte_t *ptep,
63 pte_t pte, int dirty); 49 pte_t pte, int dirty);
64 50
65static inline pte_t huge_ptep_get(pte_t *ptep)
66{
67 return *ptep;
68}
69
70static inline void arch_clear_hugepage_flags(struct page *page) 51static inline void arch_clear_hugepage_flags(struct page *page)
71{ 52{
72} 53}
73 54
55#include <asm-generic/hugetlb.h>
56
74#endif /* _ASM_PARISC64_HUGETLB_H */ 57#endif /* _ASM_PARISC64_HUGETLB_H */
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index e61dd3ae5bc0..c21d33704633 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -311,12 +311,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
311{ 311{
312 pte_update(ptep, _PAGE_RW, 0); 312 pte_update(ptep, _PAGE_RW, 0);
313} 313}
314static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
315 unsigned long addr, pte_t *ptep)
316{
317 ptep_set_wrprotect(mm, addr, ptep);
318}
319
320 314
321static inline void __ptep_set_access_flags(struct vm_area_struct *vma, 315static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
322 pte_t *ptep, pte_t entry, 316 pte_t *ptep, pte_t entry,
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb5dd4078d42..c4a726c10af5 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -426,6 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
426 pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); 426 pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0);
427} 427}
428 428
429#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
429static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, 430static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
430 unsigned long addr, pte_t *ptep) 431 unsigned long addr, pte_t *ptep)
431{ 432{
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 2d00cc530083..383da1ab9e23 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -4,7 +4,6 @@
4 4
5#ifdef CONFIG_HUGETLB_PAGE 5#ifdef CONFIG_HUGETLB_PAGE
6#include <asm/page.h> 6#include <asm/page.h>
7#include <asm-generic/hugetlb.h>
8 7
9extern struct kmem_cache *hugepte_cache; 8extern struct kmem_cache *hugepte_cache;
10 9
@@ -110,31 +109,12 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
110void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); 109void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
111#endif 110#endif
112 111
112#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
113void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 113void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
114 unsigned long end, unsigned long floor, 114 unsigned long end, unsigned long floor,
115 unsigned long ceiling); 115 unsigned long ceiling);
116 116
117/* 117#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
118 * If the arch doesn't supply something else, assume that hugepage
119 * size aligned regions are ok without further preparation.
120 */
121static inline int prepare_hugepage_range(struct file *file,
122 unsigned long addr, unsigned long len)
123{
124 struct hstate *h = hstate_file(file);
125 if (len & ~huge_page_mask(h))
126 return -EINVAL;
127 if (addr & ~huge_page_mask(h))
128 return -EINVAL;
129 return 0;
130}
131
132static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
133 pte_t *ptep, pte_t pte)
134{
135 set_pte_at(mm, addr, ptep, pte);
136}
137
138static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, 118static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
139 unsigned long addr, pte_t *ptep) 119 unsigned long addr, pte_t *ptep)
140{ 120{
@@ -145,6 +125,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
145#endif 125#endif
146} 126}
147 127
128#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
148static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, 129static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
149 unsigned long addr, pte_t *ptep) 130 unsigned long addr, pte_t *ptep)
150{ 131{
@@ -153,29 +134,17 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
153 flush_hugetlb_page(vma, addr); 134 flush_hugetlb_page(vma, addr);
154} 135}
155 136
156static inline int huge_pte_none(pte_t pte) 137#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
157{
158 return pte_none(pte);
159}
160
161static inline pte_t huge_pte_wrprotect(pte_t pte)
162{
163 return pte_wrprotect(pte);
164}
165
166extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, 138extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
167 unsigned long addr, pte_t *ptep, 139 unsigned long addr, pte_t *ptep,
168 pte_t pte, int dirty); 140 pte_t pte, int dirty);
169 141
170static inline pte_t huge_ptep_get(pte_t *ptep)
171{
172 return *ptep;
173}
174
175static inline void arch_clear_hugepage_flags(struct page *page) 142static inline void arch_clear_hugepage_flags(struct page *page)
176{ 143{
177} 144}
178 145
146#include <asm-generic/hugetlb.h>
147
179#else /* ! CONFIG_HUGETLB_PAGE */ 148#else /* ! CONFIG_HUGETLB_PAGE */
180static inline void flush_hugetlb_page(struct vm_area_struct *vma, 149static inline void flush_hugetlb_page(struct vm_area_struct *vma,
181 unsigned long vmaddr) 150 unsigned long vmaddr)
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index f7b129a83054..3ffb0ff5a038 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -300,12 +300,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
300 300
301 pte_update(ptep, clr, set); 301 pte_update(ptep, clr, set);
302} 302}
303static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
304 unsigned long addr, pte_t *ptep)
305{
306 ptep_set_wrprotect(mm, addr, ptep);
307}
308
309 303
310static inline void __ptep_set_access_flags(struct vm_area_struct *vma, 304static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
311 pte_t *ptep, pte_t entry, 305 pte_t *ptep, pte_t entry,
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index dc6bb9da3f23..67421f74efcf 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -275,6 +275,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
275 pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); 275 pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
276} 276}
277 277
278#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
278static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, 279static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
279 unsigned long addr, pte_t *ptep) 280 unsigned long addr, pte_t *ptep)
280{ 281{
diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
index 882944c36ef5..5d8e8b6bb1cc 100644
--- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c
+++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
@@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info)
49 cpu = info->policy->cpu; 49 cpu = info->policy->cpu;
50 busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); 50 busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus);
51 51
52 CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); 52 info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1);
53 pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", 53 pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n",
54 cpu, busy_spus, info->busy_spus); 54 cpu, busy_spus, info->busy_spus);
55 55
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index c9ef3c532169..9fcccb4490b9 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -987,9 +987,9 @@ static void spu_calc_load(void)
987 unsigned long active_tasks; /* fixed-point */ 987 unsigned long active_tasks; /* fixed-point */
988 988
989 active_tasks = count_active_contexts() * FIXED_1; 989 active_tasks = count_active_contexts() * FIXED_1;
990 CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); 990 spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks);
991 CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); 991 spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks);
992 CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); 992 spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks);
993} 993}
994 994
995static void spusched_wake(struct timer_list *unused) 995static void spusched_wake(struct timer_list *unused)
@@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx,
1071 } 1071 }
1072} 1072}
1073 1073
1074#define LOAD_INT(x) ((x) >> FSHIFT)
1075#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
1076
1077static int show_spu_loadavg(struct seq_file *s, void *private) 1074static int show_spu_loadavg(struct seq_file *s, void *private)
1078{ 1075{
1079 int a, b, c; 1076 int a, b, c;
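
CALC_LOAD() updated its first argument in place, which is why its replacement calc_load() returns the new value and the callers above now assign the result explicitly. The math is an exponential moving average kept in FSHIFT-bit fixed point; the standalone model below reproduces the arithmetic (a sketch only, ignoring the kernel's extra rounding step), together with the LOAD_INT()/LOAD_FRAC() helpers that this series moves out of the individual users:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884		/* 1/exp(5s/1min), the 1-minute decay factor */

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	/* new = old*exp/FIXED_1 + active*(FIXED_1 - exp)/FIXED_1 */
	return (load * exp + active * (FIXED_1 - exp)) / FIXED_1;
}

int main(void)
{
	unsigned long avg = 0;
	int i;

	/* feed "two busy contexts" for one minute of 5-second ticks */
	for (i = 0; i < 12; i++)
		avg = calc_load(avg, EXP_1, 2 * FIXED_1);

	printf("average: %lu.%02lu\n", LOAD_INT(avg), LOAD_FRAC(avg));
	return 0;
}
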
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 433a994b1a89..54f375627532 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -25,10 +25,6 @@
25 25
26#include "appldata.h" 26#include "appldata.h"
27 27
28
29#define LOAD_INT(x) ((x) >> FSHIFT)
30#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
31
32/* 28/*
33 * OS data 29 * OS data
34 * 30 *
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 735939c0f513..6f025fe18146 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -4,8 +4,6 @@
4 4
5#include <asm/cacheflush.h> 5#include <asm/cacheflush.h>
6#include <asm/page.h> 6#include <asm/page.h>
7#include <asm-generic/hugetlb.h>
8
9 7
10static inline int is_hugepage_only_range(struct mm_struct *mm, 8static inline int is_hugepage_only_range(struct mm_struct *mm,
11 unsigned long addr, 9 unsigned long addr,
@@ -17,6 +15,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
17 * If the arch doesn't supply something else, assume that hugepage 15 * If the arch doesn't supply something else, assume that hugepage
18 * size aligned regions are ok without further preparation. 16 * size aligned regions are ok without further preparation.
19 */ 17 */
18#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
20static inline int prepare_hugepage_range(struct file *file, 19static inline int prepare_hugepage_range(struct file *file,
21 unsigned long addr, unsigned long len) 20 unsigned long addr, unsigned long len)
22{ 21{
@@ -27,62 +26,17 @@ static inline int prepare_hugepage_range(struct file *file,
27 return 0; 26 return 0;
28} 27}
29 28
30static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 29#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
31 unsigned long addr, unsigned long end,
32 unsigned long floor,
33 unsigned long ceiling)
34{
35 free_pgd_range(tlb, addr, end, floor, ceiling);
36}
37
38static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
39 pte_t *ptep, pte_t pte)
40{
41 set_pte_at(mm, addr, ptep, pte);
42}
43
44static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
45 unsigned long addr, pte_t *ptep)
46{
47 return ptep_get_and_clear(mm, addr, ptep);
48}
49
50static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, 30static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
51 unsigned long addr, pte_t *ptep) 31 unsigned long addr, pte_t *ptep)
52{ 32{
53} 33}
54 34
55static inline int huge_pte_none(pte_t pte)
56{
57 return pte_none(pte);
58}
59
60static inline pte_t huge_pte_wrprotect(pte_t pte)
61{
62 return pte_wrprotect(pte);
63}
64
65static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
66 unsigned long addr, pte_t *ptep)
67{
68 ptep_set_wrprotect(mm, addr, ptep);
69}
70
71static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
72 unsigned long addr, pte_t *ptep,
73 pte_t pte, int dirty)
74{
75 return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
76}
77
78static inline pte_t huge_ptep_get(pte_t *ptep)
79{
80 return *ptep;
81}
82
83static inline void arch_clear_hugepage_flags(struct page *page) 35static inline void arch_clear_hugepage_flags(struct page *page)
84{ 36{
85 clear_bit(PG_dcache_clean, &page->flags); 37 clear_bit(PG_dcache_clean, &page->flags);
86} 38}
87 39
40#include <asm-generic/hugetlb.h>
41
88#endif /* _ASM_SH_HUGETLB_H */ 42#endif /* _ASM_SH_HUGETLB_H */
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 300557c66698..3963f80d1cb3 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -3,7 +3,6 @@
3#define _ASM_SPARC64_HUGETLB_H 3#define _ASM_SPARC64_HUGETLB_H
4 4
5#include <asm/page.h> 5#include <asm/page.h>
6#include <asm-generic/hugetlb.h>
7 6
8#ifdef CONFIG_HUGETLB_PAGE 7#ifdef CONFIG_HUGETLB_PAGE
9struct pud_huge_patch_entry { 8struct pud_huge_patch_entry {
@@ -13,9 +12,11 @@ struct pud_huge_patch_entry {
13extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; 12extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end;
14#endif 13#endif
15 14
15#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
16void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 16void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
17 pte_t *ptep, pte_t pte); 17 pte_t *ptep, pte_t pte);
18 18
19#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
19pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, 20pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
20 pte_t *ptep); 21 pte_t *ptep);
21 22
@@ -25,37 +26,13 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
25 return 0; 26 return 0;
26} 27}
27 28
28/* 29#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
29 * If the arch doesn't supply something else, assume that hugepage
30 * size aligned regions are ok without further preparation.
31 */
32static inline int prepare_hugepage_range(struct file *file,
33 unsigned long addr, unsigned long len)
34{
35 struct hstate *h = hstate_file(file);
36
37 if (len & ~huge_page_mask(h))
38 return -EINVAL;
39 if (addr & ~huge_page_mask(h))
40 return -EINVAL;
41 return 0;
42}
43
44static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, 30static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
45 unsigned long addr, pte_t *ptep) 31 unsigned long addr, pte_t *ptep)
46{ 32{
47} 33}
48 34
49static inline int huge_pte_none(pte_t pte) 35#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
50{
51 return pte_none(pte);
52}
53
54static inline pte_t huge_pte_wrprotect(pte_t pte)
55{
56 return pte_wrprotect(pte);
57}
58
59static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, 36static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
60 unsigned long addr, pte_t *ptep) 37 unsigned long addr, pte_t *ptep)
61{ 38{
@@ -63,6 +40,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
63 set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); 40 set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
64} 41}
65 42
43#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
66static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, 44static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
67 unsigned long addr, pte_t *ptep, 45 unsigned long addr, pte_t *ptep,
68 pte_t pte, int dirty) 46 pte_t pte, int dirty)
@@ -75,17 +53,15 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
75 return changed; 53 return changed;
76} 54}
77 55
78static inline pte_t huge_ptep_get(pte_t *ptep)
79{
80 return *ptep;
81}
82
83static inline void arch_clear_hugepage_flags(struct page *page) 56static inline void arch_clear_hugepage_flags(struct page *page)
84{ 57{
85} 58}
86 59
60#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
87void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 61void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
88 unsigned long end, unsigned long floor, 62 unsigned long end, unsigned long floor,
89 unsigned long ceiling); 63 unsigned long ceiling);
90 64
65#include <asm-generic/hugetlb.h>
66
91#endif /* _ASM_SPARC64_HUGETLB_H */ 67#endif /* _ASM_SPARC64_HUGETLB_H */
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 6b9938919f0b..10c15b8853ae 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -12,6 +12,8 @@ config UML
12 select HAVE_UID16 12 select HAVE_UID16
13 select HAVE_FUTEX_CMPXCHG if FUTEX 13 select HAVE_FUTEX_CMPXCHG if FUTEX
14 select HAVE_DEBUG_KMEMLEAK 14 select HAVE_DEBUG_KMEMLEAK
15 select HAVE_MEMBLOCK
16 select NO_BOOTMEM
15 select GENERIC_IRQ_SHOW 17 select GENERIC_IRQ_SHOW
16 select GENERIC_CPU_DEVICES 18 select GENERIC_CPU_DEVICES
17 select GENERIC_CLOCKEVENTS 19 select GENERIC_CLOCKEVENTS
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index f02596e9931d..296a91a04598 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/memblock.h>
8#include <linux/mm.h> 9#include <linux/mm.h>
9#include <linux/pfn.h> 10#include <linux/pfn.h>
10#include <asm/page.h> 11#include <asm/page.h>
@@ -80,28 +81,23 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
80 unsigned long len, unsigned long long highmem) 81 unsigned long len, unsigned long long highmem)
81{ 82{
82 unsigned long reserve = reserve_end - start; 83 unsigned long reserve = reserve_end - start;
83 unsigned long pfn = PFN_UP(__pa(reserve_end)); 84 long map_size = len - reserve;
84 unsigned long delta = (len - reserve) >> PAGE_SHIFT;
85 unsigned long offset, bootmap_size;
86 long map_size;
87 int err; 85 int err;
88 86
89 offset = uml_reserved - uml_physmem;
90 map_size = len - offset;
91 if(map_size <= 0) { 87 if(map_size <= 0) {
92 os_warn("Too few physical memory! Needed=%lu, given=%lu\n", 88 os_warn("Too few physical memory! Needed=%lu, given=%lu\n",
93 offset, len); 89 reserve, len);
94 exit(1); 90 exit(1);
95 } 91 }
96 92
97 physmem_fd = create_mem_file(len + highmem); 93 physmem_fd = create_mem_file(len + highmem);
98 94
99 err = os_map_memory((void *) uml_reserved, physmem_fd, offset, 95 err = os_map_memory((void *) reserve_end, physmem_fd, reserve,
100 map_size, 1, 1, 1); 96 map_size, 1, 1, 1);
101 if (err < 0) { 97 if (err < 0) {
102 os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " 98 os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p "
103 "failed - errno = %d\n", map_size, 99 "failed - errno = %d\n", map_size,
104 (void *) uml_reserved, err); 100 (void *) reserve_end, err);
105 exit(1); 101 exit(1);
106 } 102 }
107 103
@@ -113,9 +109,11 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
113 os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); 109 os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE);
114 os_fsync_file(physmem_fd); 110 os_fsync_file(physmem_fd);
115 111
116 bootmap_size = init_bootmem(pfn, pfn + delta); 112 memblock_add(__pa(start), len + highmem);
117 free_bootmem(__pa(reserve_end) + bootmap_size, 113 memblock_reserve(__pa(start), reserve);
118 len - bootmap_size - reserve); 114
115 min_low_pfn = PFN_UP(__pa(reserve_end));
116 max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT);
119} 117}
120 118
121int phys_mapping(unsigned long phys, unsigned long long *offset_out) 119int phys_mapping(unsigned long phys, unsigned long long *offset_out)
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 3a3b40f79558..0c5111b206bd 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -6,6 +6,7 @@ config UNICORE32
6 select ARCH_MIGHT_HAVE_PC_SERIO 6 select ARCH_MIGHT_HAVE_PC_SERIO
7 select DMA_DIRECT_OPS 7 select DMA_DIRECT_OPS
8 select HAVE_MEMBLOCK 8 select HAVE_MEMBLOCK
9 select NO_BOOTMEM
9 select HAVE_GENERIC_DMA_COHERENT 10 select HAVE_GENERIC_DMA_COHERENT
10 select HAVE_KERNEL_GZIP 11 select HAVE_KERNEL_GZIP
11 select HAVE_KERNEL_BZIP2 12 select HAVE_KERNEL_BZIP2
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 5f72a8d1d953..8f8699e62bd5 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -84,58 +84,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low,
84 } 84 }
85} 85}
86 86
87static void __init uc32_bootmem_init(unsigned long start_pfn,
88 unsigned long end_pfn)
89{
90 struct memblock_region *reg;
91 unsigned int boot_pages;
92 phys_addr_t bitmap;
93 pg_data_t *pgdat;
94
95 /*
96 * Allocate the bootmem bitmap page. This must be in a region
97 * of memory which has already been mapped.
98 */
99 boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
100 bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
101 __pfn_to_phys(end_pfn));
102
103 /*
104 * Initialise the bootmem allocator, handing the
105 * memory banks over to bootmem.
106 */
107 node_set_online(0);
108 pgdat = NODE_DATA(0);
109 init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);
110
111 /* Free the lowmem regions from memblock into bootmem. */
112 for_each_memblock(memory, reg) {
113 unsigned long start = memblock_region_memory_base_pfn(reg);
114 unsigned long end = memblock_region_memory_end_pfn(reg);
115
116 if (end >= end_pfn)
117 end = end_pfn;
118 if (start >= end)
119 break;
120
121 free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
122 }
123
124 /* Reserve the lowmem memblock reserved regions in bootmem. */
125 for_each_memblock(reserved, reg) {
126 unsigned long start = memblock_region_reserved_base_pfn(reg);
127 unsigned long end = memblock_region_reserved_end_pfn(reg);
128
129 if (end >= end_pfn)
130 end = end_pfn;
131 if (start >= end)
132 break;
133
134 reserve_bootmem(__pfn_to_phys(start),
135 (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
136 }
137}
138
139static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low, 87static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low,
140 unsigned long max_high) 88 unsigned long max_high)
141{ 89{
@@ -232,7 +180,7 @@ void __init bootmem_init(void)
232 180
233 find_limits(&min, &max_low, &max_high); 181 find_limits(&min, &max_low, &max_high);
234 182
235 uc32_bootmem_init(min, max_low); 183 node_set_online(0);
236 184
237 /* 185 /*
238 * Sparsemem tries to allocate bootmem in memory_present(), 186 * Sparsemem tries to allocate bootmem in memory_present(),
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 3f9d43f26f63..7eb878561910 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -39,7 +39,7 @@ void __init init_vdso_image(const struct vdso_image *image)
39 39
40struct linux_binprm; 40struct linux_binprm;
41 41
42static int vdso_fault(const struct vm_special_mapping *sm, 42static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
43 struct vm_area_struct *vma, struct vm_fault *vmf) 43 struct vm_area_struct *vma, struct vm_fault *vmf)
44{ 44{
45 const struct vdso_image *image = vma->vm_mm->context.vdso_image; 45 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
@@ -84,12 +84,11 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
84 return 0; 84 return 0;
85} 85}
86 86
87static int vvar_fault(const struct vm_special_mapping *sm, 87static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
88 struct vm_area_struct *vma, struct vm_fault *vmf) 88 struct vm_area_struct *vma, struct vm_fault *vmf)
89{ 89{
90 const struct vdso_image *image = vma->vm_mm->context.vdso_image; 90 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
91 long sym_offset; 91 long sym_offset;
92 int ret = -EFAULT;
93 92
94 if (!image) 93 if (!image)
95 return VM_FAULT_SIGBUS; 94 return VM_FAULT_SIGBUS;
@@ -108,29 +107,24 @@ static int vvar_fault(const struct vm_special_mapping *sm,
108 return VM_FAULT_SIGBUS; 107 return VM_FAULT_SIGBUS;
109 108
110 if (sym_offset == image->sym_vvar_page) { 109 if (sym_offset == image->sym_vvar_page) {
111 ret = vm_insert_pfn(vma, vmf->address, 110 return vmf_insert_pfn(vma, vmf->address,
112 __pa_symbol(&__vvar_page) >> PAGE_SHIFT); 111 __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
113 } else if (sym_offset == image->sym_pvclock_page) { 112 } else if (sym_offset == image->sym_pvclock_page) {
114 struct pvclock_vsyscall_time_info *pvti = 113 struct pvclock_vsyscall_time_info *pvti =
115 pvclock_get_pvti_cpu0_va(); 114 pvclock_get_pvti_cpu0_va();
116 if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { 115 if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
117 ret = vm_insert_pfn_prot( 116 return vmf_insert_pfn_prot(vma, vmf->address,
118 vma, 117 __pa(pvti) >> PAGE_SHIFT,
119 vmf->address, 118 pgprot_decrypted(vma->vm_page_prot));
120 __pa(pvti) >> PAGE_SHIFT,
121 pgprot_decrypted(vma->vm_page_prot));
122 } 119 }
123 } else if (sym_offset == image->sym_hvclock_page) { 120 } else if (sym_offset == image->sym_hvclock_page) {
124 struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); 121 struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
125 122
126 if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) 123 if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
127 ret = vm_insert_pfn(vma, vmf->address, 124 return vmf_insert_pfn(vma, vmf->address,
128 vmalloc_to_pfn(tsc_pg)); 125 vmalloc_to_pfn(tsc_pg));
129 } 126 }
130 127
131 if (ret == 0 || ret == -EBUSY)
132 return VM_FAULT_NOPAGE;
133
134 return VM_FAULT_SIGBUS; 128 return VM_FAULT_SIGBUS;
135} 129}
136 130
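
The handlers above now carry the vm_fault_t return type, and vmf_insert_pfn()/vmf_insert_pfn_prot() already return a vm_fault_t, so the old bookkeeping that mapped 0 or -EBUSY from vm_insert_pfn() onto VM_FAULT_NOPAGE is folded into the helpers. The general shape of such a conversion, shown on a hypothetical .fault handler (example_lookup_pfn() is invented for illustration):

static vm_fault_t example_fault(struct vm_fault *vmf)
{
	unsigned long pfn = example_lookup_pfn(vmf->pgoff);	/* hypothetical */

	if (!pfn)
		return VM_FAULT_SIGBUS;

	/* VM_FAULT_NOPAGE on success, a VM_FAULT_* error code otherwise */
	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}
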
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 5ed826da5e07..7469d321f072 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -13,75 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
13 return 0; 13 return 0;
14} 14}
15 15
16/*
17 * If the arch doesn't supply something else, assume that hugepage
18 * size aligned regions are ok without further preparation.
19 */
20static inline int prepare_hugepage_range(struct file *file,
21 unsigned long addr, unsigned long len)
22{
23 struct hstate *h = hstate_file(file);
24 if (len & ~huge_page_mask(h))
25 return -EINVAL;
26 if (addr & ~huge_page_mask(h))
27 return -EINVAL;
28 return 0;
29}
30
31static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
32 unsigned long addr, unsigned long end,
33 unsigned long floor,
34 unsigned long ceiling)
35{
36 free_pgd_range(tlb, addr, end, floor, ceiling);
37}
38
39static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
40 pte_t *ptep, pte_t pte)
41{
42 set_pte_at(mm, addr, ptep, pte);
43}
44
45static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
46 unsigned long addr, pte_t *ptep)
47{
48 return ptep_get_and_clear(mm, addr, ptep);
49}
50
51static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
52 unsigned long addr, pte_t *ptep)
53{
54 ptep_clear_flush(vma, addr, ptep);
55}
56
57static inline int huge_pte_none(pte_t pte)
58{
59 return pte_none(pte);
60}
61
62static inline pte_t huge_pte_wrprotect(pte_t pte)
63{
64 return pte_wrprotect(pte);
65}
66
67static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
68 unsigned long addr, pte_t *ptep)
69{
70 ptep_set_wrprotect(mm, addr, ptep);
71}
72
73static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
74 unsigned long addr, pte_t *ptep,
75 pte_t pte, int dirty)
76{
77 return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
78}
79
80static inline pte_t huge_ptep_get(pte_t *ptep)
81{
82 return *ptep;
83}
84
85static inline void arch_clear_hugepage_flags(struct page *page) 16static inline void arch_clear_hugepage_flags(struct page *page)
86{ 17{
87} 18}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c88c23c658c1..d1f25c831447 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1248,7 +1248,6 @@ void __init e820__memblock_setup(void)
1248{ 1248{
1249 int i; 1249 int i;
1250 u64 end; 1250 u64 end;
1251 u64 addr = 0;
1252 1251
1253 /* 1252 /*
1254 * The bootstrap memblock region count maximum is 128 entries 1253 * The bootstrap memblock region count maximum is 128 entries
@@ -1265,21 +1264,13 @@ void __init e820__memblock_setup(void)
1265 struct e820_entry *entry = &e820_table->entries[i]; 1264 struct e820_entry *entry = &e820_table->entries[i];
1266 1265
1267 end = entry->addr + entry->size; 1266 end = entry->addr + entry->size;
1268 if (addr < entry->addr)
1269 memblock_reserve(addr, entry->addr - addr);
1270 addr = end;
1271 if (end != (resource_size_t)end) 1267 if (end != (resource_size_t)end)
1272 continue; 1268 continue;
1273 1269
1274 /*
1275 * all !E820_TYPE_RAM ranges (including gap ranges) are put
1276 * into memblock.reserved to make sure that struct pages in
1277 * such regions are not left uninitialized after bootup.
1278 */
1279 if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) 1270 if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1280 memblock_reserve(entry->addr, entry->size); 1271 continue;
1281 else 1272
1282 memblock_add(entry->addr, entry->size); 1273 memblock_add(entry->addr, entry->size);
1283 } 1274 }
1284 1275
1285 /* Throw away partial pages: */ 1276 /* Throw away partial pages: */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 82c756431b49..3310adecafb0 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -26,5 +26,6 @@ generic-y += rwsem.h
26generic-y += sections.h 26generic-y += sections.h
27generic-y += topology.h 27generic-y += topology.h
28generic-y += trace_clock.h 28generic-y += trace_clock.h
29generic-y += vga.h
29generic-y += word-at-a-time.h 30generic-y += word-at-a-time.h
30generic-y += xor.h 31generic-y += xor.h
diff --git a/arch/xtensa/include/asm/vga.h b/arch/xtensa/include/asm/vga.h
deleted file mode 100644
index 1fd8cab3a297..000000000000
--- a/arch/xtensa/include/asm/vga.h
+++ /dev/null
@@ -1,19 +0,0 @@
1/*
2 * include/asm-xtensa/vga.h
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 2001 - 2005 Tensilica Inc.
9 */
10
11#ifndef _XTENSA_VGA_H
12#define _XTENSA_VGA_H
13
14#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x)
15
16#define vga_readb(x) (*(x))
17#define vga_writeb(x,y) (*(y) = (x))
18
19#endif
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 35c48d7b8f78..28f80d227528 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -153,7 +153,7 @@ struct iolatency_grp {
153#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC 153#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
154/* 154/*
155 * These are the constants used to fake the fixed-point moving average 155 * These are the constants used to fake the fixed-point moving average
156 * calculation just like load average. The call to CALC_LOAD folds 156 * calculation just like load average. The call to calc_load() folds
157 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling 157 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
158 * window size is bucketed to try to approximately calculate average 158 * window size is bucketed to try to approximately calculate average
159 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows 159 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
@@ -248,7 +248,7 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
248 return; 248 return;
249 249
250 /* 250 /*
251 * CALC_LOAD takes in a number stored in fixed point representation. 251 * calc_load() takes in a number stored in fixed point representation.
252 * Because we are using this for IO time in ns, the values stored 252 * Because we are using this for IO time in ns, the values stored
253 * are significantly larger than the FIXED_1 denominator (2048). 253 * are significantly larger than the FIXED_1 denominator (2048).
254 * Therefore, rounding errors in the calculation are negligible and 254 * Therefore, rounding errors in the calculation are negligible and
@@ -257,7 +257,9 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
257 exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, 257 exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
258 div64_u64(iolat->cur_win_nsec, 258 div64_u64(iolat->cur_win_nsec,
259 BLKIOLATENCY_EXP_BUCKET_SIZE)); 259 BLKIOLATENCY_EXP_BUCKET_SIZE));
260 CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); 260 iolat->lat_avg = calc_load(iolat->lat_avg,
261 iolatency_exp_factors[exp_idx],
262 stat->rqs.mean);
261} 263}
262 264
263static inline bool iolatency_may_queue(struct iolatency_grp *iolat, 265static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 1ac4c36e13bb..86d6cd92ce3d 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -67,8 +67,11 @@ static ssize_t node_read_meminfo(struct device *dev,
67 int nid = dev->id; 67 int nid = dev->id;
68 struct pglist_data *pgdat = NODE_DATA(nid); 68 struct pglist_data *pgdat = NODE_DATA(nid);
69 struct sysinfo i; 69 struct sysinfo i;
70 unsigned long sreclaimable, sunreclaimable;
70 71
71 si_meminfo_node(&i, nid); 72 si_meminfo_node(&i, nid);
73 sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
74 sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE);
72 n = sprintf(buf, 75 n = sprintf(buf,
73 "Node %d MemTotal: %8lu kB\n" 76 "Node %d MemTotal: %8lu kB\n"
74 "Node %d MemFree: %8lu kB\n" 77 "Node %d MemFree: %8lu kB\n"
@@ -118,6 +121,7 @@ static ssize_t node_read_meminfo(struct device *dev,
118 "Node %d NFS_Unstable: %8lu kB\n" 121 "Node %d NFS_Unstable: %8lu kB\n"
119 "Node %d Bounce: %8lu kB\n" 122 "Node %d Bounce: %8lu kB\n"
120 "Node %d WritebackTmp: %8lu kB\n" 123 "Node %d WritebackTmp: %8lu kB\n"
124 "Node %d KReclaimable: %8lu kB\n"
121 "Node %d Slab: %8lu kB\n" 125 "Node %d Slab: %8lu kB\n"
122 "Node %d SReclaimable: %8lu kB\n" 126 "Node %d SReclaimable: %8lu kB\n"
123 "Node %d SUnreclaim: %8lu kB\n" 127 "Node %d SUnreclaim: %8lu kB\n"
@@ -138,20 +142,21 @@ static ssize_t node_read_meminfo(struct device *dev,
138 nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), 142 nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
139 nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), 143 nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
140 nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), 144 nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
141 nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + 145 nid, K(sreclaimable +
142 node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), 146 node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
143 nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), 147 nid, K(sreclaimable + sunreclaimable),
148 nid, K(sreclaimable),
149 nid, K(sunreclaimable)
144#ifdef CONFIG_TRANSPARENT_HUGEPAGE 150#ifdef CONFIG_TRANSPARENT_HUGEPAGE
145 nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), 151 ,
146 nid, K(node_page_state(pgdat, NR_ANON_THPS) * 152 nid, K(node_page_state(pgdat, NR_ANON_THPS) *
147 HPAGE_PMD_NR), 153 HPAGE_PMD_NR),
148 nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * 154 nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
149 HPAGE_PMD_NR), 155 HPAGE_PMD_NR),
150 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * 156 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
151 HPAGE_PMD_NR)); 157 HPAGE_PMD_NR)
152#else
153 nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)));
154#endif 158#endif
159 );
155 n += hugetlb_report_node_meminfo(nid, buf + n); 160 n += hugetlb_report_node_meminfo(nid, buf + n);
156 return n; 161 return n;
157} 162}
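
The new "KReclaimable" line reports reclaimable slab plus NR_KERNEL_MISC_RECLAIMABLE, the page-based per-node counter that replaces NR_INDIRECTLY_RECLAIMABLE_BYTES elsewhere in this series, and caching the two slab counts avoids re-reading node_page_state() for every printed field. A sketch of the reported quantity (helper name is hypothetical; the shift is the usual pages-to-kB K() conversion):

static unsigned long example_kreclaimable_kb(struct pglist_data *pgdat)
{
	unsigned long pages;

	pages  = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
	pages += node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE);

	return pages << (PAGE_SHIFT - 10);	/* pages -> kB */
}
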
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 575a68f31761..71979605246e 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -130,10 +130,6 @@ struct menu_device {
130 int interval_ptr; 130 int interval_ptr;
131}; 131};
132 132
133
134#define LOAD_INT(x) ((x) >> FSHIFT)
135#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
136
137static inline int get_loadavg(unsigned long load) 133static inline int get_loadavg(unsigned long load)
138{ 134{
139 return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; 135 return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index e1c7996c018e..475b769e120c 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler,
77static void handle_remove(struct work_struct *work); 77static void handle_remove(struct work_struct *work);
78 78
79static const struct mmu_notifier_ops mn_opts = { 79static const struct mmu_notifier_ops mn_opts = {
80 .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
81 .invalidate_range_start = mmu_notifier_range_start, 80 .invalidate_range_start = mmu_notifier_range_start,
82}; 81};
83 82
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 58da65df03f5..fd552235bd13 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -427,7 +427,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
427} 427}
428 428
429static const struct mmu_notifier_ops iommu_mn = { 429static const struct mmu_notifier_ops iommu_mn = {
430 .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
431 .release = mn_release, 430 .release = mn_release,
432 .clear_flush_young = mn_clear_flush_young, 431 .clear_flush_young = mn_clear_flush_young,
433 .invalidate_range = mn_invalidate_range, 432 .invalidate_range = mn_invalidate_range,
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 4a03e5090952..db301efe126d 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -273,7 +273,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
273} 273}
274 274
275static const struct mmu_notifier_ops intel_mmuops = { 275static const struct mmu_notifier_ops intel_mmuops = {
276 .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
277 .release = intel_mm_release, 276 .release = intel_mm_release,
278 .change_pte = intel_change_pte, 277 .change_pte = intel_change_pte,
279 .invalidate_range = intel_invalidate_range, 278 .invalidate_range = intel_invalidate_range,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index be28f05bfafa..03b49d52092e 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -261,7 +261,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
261 261
262 262
263static const struct mmu_notifier_ops gru_mmuops = { 263static const struct mmu_notifier_ops gru_mmuops = {
264 .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
265 .invalidate_range_start = gru_invalidate_range_start, 264 .invalidate_range_start = gru_invalidate_range_start,
266 .invalidate_range_end = gru_invalidate_range_end, 265 .invalidate_range_end = gru_invalidate_range_end,
267 .release = gru_release, 266 .release = gru_release,
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 800ad252cf9c..76c83c1ffeda 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1127,12 +1127,13 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
1127{ 1127{
1128 const u64 phys_offset = MIN_MEMBLOCK_ADDR; 1128 const u64 phys_offset = MIN_MEMBLOCK_ADDR;
1129 1129
1130 if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
1131 pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
1132 base, base + size);
1133 return;
1134 }
1135
1130 if (!PAGE_ALIGNED(base)) { 1136 if (!PAGE_ALIGNED(base)) {
1131 if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
1132 pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
1133 base, base + size);
1134 return;
1135 }
1136 size -= PAGE_SIZE - (base & ~PAGE_MASK); 1137 size -= PAGE_SIZE - (base & ~PAGE_MASK);
1137 base = PAGE_ALIGN(base); 1138 base = PAGE_ALIGN(base);
1138 } 1139 }
diff --git a/drivers/staging/android/ion/ion_page_pool.c b/drivers/staging/android/ion/ion_page_pool.c
index 9bc56eb48d2a..0d2a95957ee8 100644
--- a/drivers/staging/android/ion/ion_page_pool.c
+++ b/drivers/staging/android/ion/ion_page_pool.c
@@ -33,8 +33,8 @@ static void ion_page_pool_add(struct ion_page_pool *pool, struct page *page)
33 pool->low_count++; 33 pool->low_count++;
34 } 34 }
35 35
36 mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, 36 mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
37 (1 << (PAGE_SHIFT + pool->order))); 37 1 << pool->order);
38 mutex_unlock(&pool->mutex); 38 mutex_unlock(&pool->mutex);
39} 39}
40 40
@@ -53,8 +53,8 @@ static struct page *ion_page_pool_remove(struct ion_page_pool *pool, bool high)
53 } 53 }
54 54
55 list_del(&page->lru); 55 list_del(&page->lru);
56 mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, 56 mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
57 -(1 << (PAGE_SHIFT + pool->order))); 57 -(1 << pool->order));
58 return page; 58 return page;
59} 59}
60 60
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f408994fc632..0c35e62f108d 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
418 int i; 418 int i;
419 vma->vm_flags |= VM_MIXEDMAP; 419 vma->vm_flags |= VM_MIXEDMAP;
420 for (i = 0; i < pages && !ret; i++) { 420 for (i = 0; i < pages && !ret; i++) {
421 vm_fault_t vmf;
421 unsigned long off = i * PAGE_SIZE; 422 unsigned long off = i * PAGE_SIZE;
422 pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); 423 pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
423 ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); 424 vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
425 if (vmf & VM_FAULT_ERROR)
426 ret = vm_fault_to_errno(vmf, 0);
424 } 427 }
425 } 428 }
426 429
diff --git a/fs/dcache.c b/fs/dcache.c
index 2e7e8d85e9b4..c2e443fb76ae 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head)
257 kmem_cache_free(dentry_cache, dentry); 257 kmem_cache_free(dentry_cache, dentry);
258} 258}
259 259
260static void __d_free_external_name(struct rcu_head *head)
261{
262 struct external_name *name = container_of(head, struct external_name,
263 u.head);
264
265 mod_node_page_state(page_pgdat(virt_to_page(name)),
266 NR_INDIRECTLY_RECLAIMABLE_BYTES,
267 -ksize(name));
268
269 kfree(name);
270}
271
272static void __d_free_external(struct rcu_head *head) 260static void __d_free_external(struct rcu_head *head)
273{ 261{
274 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); 262 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
275 263 kfree(external_name(dentry));
276 __d_free_external_name(&external_name(dentry)->u.head);
277
278 kmem_cache_free(dentry_cache, dentry); 264 kmem_cache_free(dentry_cache, dentry);
279} 265}
280 266
@@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name)
306 struct external_name *p; 292 struct external_name *p;
307 p = container_of(name->name, struct external_name, name[0]); 293 p = container_of(name->name, struct external_name, name[0]);
308 if (unlikely(atomic_dec_and_test(&p->u.count))) 294 if (unlikely(atomic_dec_and_test(&p->u.count)))
309 call_rcu(&p->u.head, __d_free_external_name); 295 kfree_rcu(p, u.head);
310 } 296 }
311} 297}
312EXPORT_SYMBOL(release_dentry_name_snapshot); 298EXPORT_SYMBOL(release_dentry_name_snapshot);
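
With the NR_INDIRECTLY_RECLAIMABLE_BYTES bookkeeping gone (external names are instead allocated with __GFP_RECLAIMABLE, as a later hunk in this file shows), the dedicated RCU callback reduces to a plain kfree(), and kfree_rcu() handles that case without any callback. The general shape of the simplification, on a hypothetical structure:

struct example_name {
	struct rcu_head head;
	char name[];
};

static void example_release(struct example_name *p)
{
	/*
	 * before: call_rcu(&p->head, example_free_cb) with a callback that
	 * only called kfree(); after: free directly once a grace period
	 * has elapsed, no callback needed.
	 */
	kfree_rcu(p, head);
}
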
@@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate);
1606 1592
1607struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) 1593struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
1608{ 1594{
1609 struct external_name *ext = NULL;
1610 struct dentry *dentry; 1595 struct dentry *dentry;
1611 char *dname; 1596 char *dname;
1612 int err; 1597 int err;
@@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
1627 dname = dentry->d_iname; 1612 dname = dentry->d_iname;
1628 } else if (name->len > DNAME_INLINE_LEN-1) { 1613 } else if (name->len > DNAME_INLINE_LEN-1) {
1629 size_t size = offsetof(struct external_name, name[1]); 1614 size_t size = offsetof(struct external_name, name[1]);
1630 1615 struct external_name *p = kmalloc(size + name->len,
1631 ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); 1616 GFP_KERNEL_ACCOUNT |
1632 if (!ext) { 1617 __GFP_RECLAIMABLE);
1618 if (!p) {
1633 kmem_cache_free(dentry_cache, dentry); 1619 kmem_cache_free(dentry_cache, dentry);
1634 return NULL; 1620 return NULL;
1635 } 1621 }
1636 atomic_set(&ext->u.count, 1); 1622 atomic_set(&p->u.count, 1);
1637 dname = ext->name; 1623 dname = p->name;
1638 } else { 1624 } else {
1639 dname = dentry->d_iname; 1625 dname = dentry->d_iname;
1640 } 1626 }
@@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
1673 } 1659 }
1674 } 1660 }
1675 1661
1676 if (unlikely(ext)) {
1677 pg_data_t *pgdat = page_pgdat(virt_to_page(ext));
1678 mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
1679 ksize(ext));
1680 }
1681
1682 this_cpu_inc(nr_dentry); 1662 this_cpu_inc(nr_dentry);
1683 1663
1684 return dentry; 1664 return dentry;
@@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
2707 dentry->d_name.hash_len = target->d_name.hash_len; 2687 dentry->d_name.hash_len = target->d_name.hash_len;
2708 } 2688 }
2709 if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) 2689 if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
2710 call_rcu(&old_name->u.head, __d_free_external_name); 2690 kfree_rcu(old_name, u.head);
2711} 2691}
2712 2692
2713/* 2693/*
diff --git a/fs/iomap.c b/fs/iomap.c
index ec15cf2ec696..90c2febc93ac 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
1057 return length; 1057 return length;
1058} 1058}
1059 1059
1060int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) 1060vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
1061{ 1061{
1062 struct page *page = vmf->page; 1062 struct page *page = vmf->page;
1063 struct inode *inode = file_inode(vmf->vma->vm_file); 1063 struct inode *inode = file_inode(vmf->vma->vm_file);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index ff2716f9322e..fdf527b6d79c 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
236 sb->s_export_op = &kernfs_export_ops; 236 sb->s_export_op = &kernfs_export_ops;
237 sb->s_time_gran = 1; 237 sb->s_time_gran = 1;
238 238
239 /* sysfs dentries and inodes don't require IO to create */
240 sb->s_shrink.seeks = 0;
241
239 /* get root inode, initialize and unlock it */ 242 /* get root inode, initialize and unlock it */
240 mutex_lock(&kernfs_mutex); 243 mutex_lock(&kernfs_mutex);
241 inode = kernfs_get_inode(sb, info->root->kn); 244 inode = kernfs_get_inode(sb, info->root->kn);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a342f008e42f..d1cbb27808e2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle,
5106 * rightmost extent list. 5106 * rightmost extent list.
5107 */ 5107 */
5108 if (path->p_tree_depth) { 5108 if (path->p_tree_depth) {
5109 struct ocfs2_extent_block *eb;
5110
5111 ret = ocfs2_read_extent_block(et->et_ci, 5109 ret = ocfs2_read_extent_block(et->et_ci,
5112 ocfs2_et_get_last_eb_blk(et), 5110 ocfs2_et_get_last_eb_blk(et),
5113 &last_eb_bh); 5111 &last_eb_bh);
@@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle,
5115 mlog_errno(ret); 5113 mlog_errno(ret);
5116 goto out; 5114 goto out;
5117 } 5115 }
5118
5119 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5120 } 5116 }
5121 5117
5122 if (rec->e_cpos == split_rec->e_cpos && 5118 if (rec->e_cpos == split_rec->e_cpos &&
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 302cd7caa4a7..da578ad4c08f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1392,8 +1392,7 @@ retry:
1392unlock: 1392unlock:
1393 spin_unlock(&oi->ip_lock); 1393 spin_unlock(&oi->ip_lock);
1394out: 1394out:
1395 if (new) 1395 kfree(new);
1396 kfree(new);
1397 return ret; 1396 return ret;
1398} 1397}
1399 1398
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 9b984cae4c4e..1d6dc8422899 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
329{ 329{
330 char *buf; 330 char *buf;
331 331
332 buf = (char *) get_zeroed_page(GFP_NOFS); 332 buf = (char *) get_zeroed_page(GFP_ATOMIC);
333 if (buf) { 333 if (buf) {
334 dump_mle(mle, buf, PAGE_SIZE - 1); 334 dump_mle(mle, buf, PAGE_SIZE - 1);
335 free_page((unsigned long)buf); 335 free_page((unsigned long)buf);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 838a06d4066a..074d5de17bb2 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
531 assert_spin_locked(&res->spinlock); 531 assert_spin_locked(&res->spinlock);
532 532
533 /* don't shuffle secondary queues */ 533 /* don't shuffle secondary queues */
534 if ((res->owner == dlm->node_num)) { 534 if (res->owner == dlm->node_num) {
535 if (res->state & (DLM_LOCK_RES_MIGRATING | 535 if (res->state & (DLM_LOCK_RES_MIGRATING |
536 DLM_LOCK_RES_BLOCK_DIRTY)) 536 DLM_LOCK_RES_BLOCK_DIRTY))
537 return; 537 return;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee145c733..1114ef02e780 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
4135 struct buffer_head *ref_root_bh = NULL; 4135 struct buffer_head *ref_root_bh = NULL;
4136 struct ocfs2_cached_dealloc_ctxt dealloc; 4136 struct ocfs2_cached_dealloc_ctxt dealloc;
4137 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); 4137 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4138 struct ocfs2_refcount_block *rb;
4139 struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; 4138 struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
4140 struct ocfs2_refcount_tree *ref_tree; 4139 struct ocfs2_refcount_tree *ref_tree;
4141 4140
@@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
4162 mlog_errno(ret); 4161 mlog_errno(ret);
4163 goto out; 4162 goto out;
4164 } 4163 }
4165 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4166 4164
4167 ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, 4165 ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4168 &ref_tree->rf_ci, ref_root_bh, 4166 &ref_tree->rf_ci, ref_root_bh,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fc5306a31a1d..5792f9e39466 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
516 */ 516 */
517 s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; 517 s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
518 518
519 /* procfs dentries and inodes don't require IO to create */
520 s->s_shrink.seeks = 0;
521
519 pde_get(&proc_root); 522 pde_get(&proc_root);
520 root_inode = proc_get_inode(s, &proc_root); 523 root_inode = proc_get_inode(s, &proc_root);
521 if (!root_inode) { 524 if (!root_inode) {
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index d06694757201..8468baee951d 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -10,9 +10,6 @@
10#include <linux/seqlock.h> 10#include <linux/seqlock.h>
11#include <linux/time.h> 11#include <linux/time.h>
12 12
13#define LOAD_INT(x) ((x) >> FSHIFT)
14#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
15
16static int loadavg_proc_show(struct seq_file *m, void *v) 13static int loadavg_proc_show(struct seq_file *m, void *v)
17{ 14{
18 unsigned long avnrun[3]; 15 unsigned long avnrun[3];
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index edda898714eb..568d90e17c17 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
38 long cached; 38 long cached;
39 long available; 39 long available;
40 unsigned long pages[NR_LRU_LISTS]; 40 unsigned long pages[NR_LRU_LISTS];
41 unsigned long sreclaimable, sunreclaim;
41 int lru; 42 int lru;
42 43
43 si_meminfo(&i); 44 si_meminfo(&i);
@@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
53 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 54 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
54 55
55 available = si_mem_available(); 56 available = si_mem_available();
57 sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
58 sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
56 59
57 show_val_kb(m, "MemTotal: ", i.totalram); 60 show_val_kb(m, "MemTotal: ", i.totalram);
58 show_val_kb(m, "MemFree: ", i.freeram); 61 show_val_kb(m, "MemFree: ", i.freeram);
@@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
94 show_val_kb(m, "Mapped: ", 97 show_val_kb(m, "Mapped: ",
95 global_node_page_state(NR_FILE_MAPPED)); 98 global_node_page_state(NR_FILE_MAPPED));
96 show_val_kb(m, "Shmem: ", i.sharedram); 99 show_val_kb(m, "Shmem: ", i.sharedram);
97 show_val_kb(m, "Slab: ", 100 show_val_kb(m, "KReclaimable: ", sreclaimable +
98 global_node_page_state(NR_SLAB_RECLAIMABLE) + 101 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
99 global_node_page_state(NR_SLAB_UNRECLAIMABLE)); 102 show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
100 103 show_val_kb(m, "SReclaimable: ", sreclaimable);
101 show_val_kb(m, "SReclaimable: ", 104 show_val_kb(m, "SUnreclaim: ", sunreclaim);
102 global_node_page_state(NR_SLAB_RECLAIMABLE));
103 show_val_kb(m, "SUnreclaim: ",
104 global_node_page_state(NR_SLAB_UNRECLAIMABLE));
105 seq_printf(m, "KernelStack: %8lu kB\n", 105 seq_printf(m, "KernelStack: %8lu kB\n",
106 global_zone_page_state(NR_KERNEL_STACK_KB)); 106 global_zone_page_state(NR_KERNEL_STACK_KB));
107 show_val_kb(m, "PageTables: ", 107 show_val_kb(m, "PageTables: ",
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64cb0b4..a027473561c6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma,
713 smaps_walk.private = mss; 713 smaps_walk.private = mss;
714 714
715#ifdef CONFIG_SHMEM 715#ifdef CONFIG_SHMEM
716 /* In case of smaps_rollup, reset the value from previous vma */
717 mss->check_shmem_swap = false;
716 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { 718 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
717 /* 719 /*
718 * For shared or readonly shmem mappings we know that all 720 * For shared or readonly shmem mappings we know that all
@@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
728 730
729 if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || 731 if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
730 !(vma->vm_flags & VM_WRITE)) { 732 !(vma->vm_flags & VM_WRITE)) {
731 mss->swap = shmem_swapped; 733 mss->swap += shmem_swapped;
732 } else { 734 } else {
733 mss->check_shmem_swap = true; 735 mss->check_shmem_swap = true;
734 smaps_walk.pte_hole = smaps_pte_hole; 736 smaps_walk.pte_hole = smaps_pte_hole;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bfa0ec69f924..356d2b8568c1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1026 struct userfaultfd_ctx *fork_nctx = NULL; 1026 struct userfaultfd_ctx *fork_nctx = NULL;
1027 1027
1028 /* always take the fd_wqh lock before the fault_pending_wqh lock */ 1028 /* always take the fd_wqh lock before the fault_pending_wqh lock */
1029 spin_lock(&ctx->fd_wqh.lock); 1029 spin_lock_irq(&ctx->fd_wqh.lock);
1030 __add_wait_queue(&ctx->fd_wqh, &wait); 1030 __add_wait_queue(&ctx->fd_wqh, &wait);
1031 for (;;) { 1031 for (;;) {
1032 set_current_state(TASK_INTERRUPTIBLE); 1032 set_current_state(TASK_INTERRUPTIBLE);
@@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1112 ret = -EAGAIN; 1112 ret = -EAGAIN;
1113 break; 1113 break;
1114 } 1114 }
1115 spin_unlock(&ctx->fd_wqh.lock); 1115 spin_unlock_irq(&ctx->fd_wqh.lock);
1116 schedule(); 1116 schedule();
1117 spin_lock(&ctx->fd_wqh.lock); 1117 spin_lock_irq(&ctx->fd_wqh.lock);
1118 } 1118 }
1119 __remove_wait_queue(&ctx->fd_wqh, &wait); 1119 __remove_wait_queue(&ctx->fd_wqh, &wait);
1120 __set_current_state(TASK_RUNNING); 1120 __set_current_state(TASK_RUNNING);
1121 spin_unlock(&ctx->fd_wqh.lock); 1121 spin_unlock_irq(&ctx->fd_wqh.lock);
1122 1122
1123 if (!ret && msg->event == UFFD_EVENT_FORK) { 1123 if (!ret && msg->event == UFFD_EVENT_FORK) {
1124 ret = resolve_userfault_fork(ctx, fork_nctx, msg); 1124 ret = resolve_userfault_fork(ctx, fork_nctx, msg);
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 9d0cde8ab716..71d7b77eea50 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -32,7 +32,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
32 return pte_modify(pte, newprot); 32 return pte_modify(pte, newprot);
33} 33}
34 34
35#ifndef huge_pte_clear 35#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR
36static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, 36static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
37 pte_t *ptep, unsigned long sz) 37 pte_t *ptep, unsigned long sz)
38{ 38{
@@ -40,4 +40,90 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
40} 40}
41#endif 41#endif
42 42
43#ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
44static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
45 unsigned long addr, unsigned long end,
46 unsigned long floor, unsigned long ceiling)
47{
48 free_pgd_range(tlb, addr, end, floor, ceiling);
49}
50#endif
51
52#ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
53static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
54 pte_t *ptep, pte_t pte)
55{
56 set_pte_at(mm, addr, ptep, pte);
57}
58#endif
59
60#ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
61static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
62 unsigned long addr, pte_t *ptep)
63{
64 return ptep_get_and_clear(mm, addr, ptep);
65}
66#endif
67
68#ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
69static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
70 unsigned long addr, pte_t *ptep)
71{
72 ptep_clear_flush(vma, addr, ptep);
73}
74#endif
75
76#ifndef __HAVE_ARCH_HUGE_PTE_NONE
77static inline int huge_pte_none(pte_t pte)
78{
79 return pte_none(pte);
80}
81#endif
82
83#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
84static inline pte_t huge_pte_wrprotect(pte_t pte)
85{
86 return pte_wrprotect(pte);
87}
88#endif
89
90#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
91static inline int prepare_hugepage_range(struct file *file,
92 unsigned long addr, unsigned long len)
93{
94 struct hstate *h = hstate_file(file);
95
96 if (len & ~huge_page_mask(h))
97 return -EINVAL;
98 if (addr & ~huge_page_mask(h))
99 return -EINVAL;
100
101 return 0;
102}
103#endif
104
105#ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
106static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
107 unsigned long addr, pte_t *ptep)
108{
109 ptep_set_wrprotect(mm, addr, ptep);
110}
111#endif
112
113#ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
114static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
115 unsigned long addr, pte_t *ptep,
116 pte_t pte, int dirty)
117{
118 return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
119}
120#endif
121
122#ifndef __HAVE_ARCH_HUGE_PTEP_GET
123static inline pte_t huge_ptep_get(pte_t *ptep)
124{
125 return *ptep;
126}
127#endif
128
43#endif /* _ASM_GENERIC_HUGETLB_H */ 129#endif /* _ASM_GENERIC_HUGETLB_H */
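Annotation: the generic hugetlb helpers added above only take effect when an architecture has not defined the corresponding __HAVE_ARCH_* macro; an arch header that needs its own behaviour defines the guard and supplies its own function. The standalone sketch below only illustrates that #ifndef-guard override pattern; HAVE_ARCH_WIDGET_IS_EMPTY and widget_is_empty() are invented names, not kernel symbols.

#include <stdio.h>

/* "arch" side: opt out of the generic helper and provide a custom one. */
#define HAVE_ARCH_WIDGET_IS_EMPTY
static inline int widget_is_empty(int w)
{
        return w <= 0;          /* arch-specific definition */
}

/* "generic" side: only compiled in when the arch did not define the guard. */
#ifndef HAVE_ARCH_WIDGET_IS_EMPTY
static inline int widget_is_empty(int w)
{
        return w == 0;          /* generic fallback */
}
#endif

int main(void)
{
        printf("%d\n", widget_is_empty(-1));    /* prints 1: the "arch" version won */
        return 0;
}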
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 88ebc6102c7c..5657a20e0c59 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -757,7 +757,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
757/* 757/*
758 * Interfaces that can be used by architecture code to keep track of 758 * Interfaces that can be used by architecture code to keep track of
759 * memory type of pfn mappings specified by the remap_pfn_range, 759 * memory type of pfn mappings specified by the remap_pfn_range,
760 * vm_insert_pfn. 760 * vmf_insert_pfn.
761 */ 761 */
762 762
763/* 763/*
@@ -773,7 +773,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
773 773
774/* 774/*
775 * track_pfn_insert is called when a _new_ single pfn is established 775 * track_pfn_insert is called when a _new_ single pfn is established
776 * by vm_insert_pfn(). 776 * by vmf_insert_pfn().
777 */ 777 */
778static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, 778static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
779 pfn_t pfn) 779 pfn_t pfn)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 22254c1fe1c5..5e1694fe035b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -20,6 +20,7 @@
20#include <linux/u64_stats_sync.h> 20#include <linux/u64_stats_sync.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <linux/bpf-cgroup.h> 22#include <linux/bpf-cgroup.h>
23#include <linux/psi_types.h>
23 24
24#ifdef CONFIG_CGROUPS 25#ifdef CONFIG_CGROUPS
25 26
@@ -436,6 +437,9 @@ struct cgroup {
436 /* used to schedule release agent */ 437 /* used to schedule release agent */
437 struct work_struct release_agent_work; 438 struct work_struct release_agent_work;
438 439
440 /* used to track pressure stalls */
441 struct psi_group psi;
442
439 /* used to store eBPF programs */ 443 /* used to store eBPF programs */
440 struct cgroup_bpf bpf; 444 struct cgroup_bpf bpf;
441 445
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b622d6608605..9968332cceed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
650 pr_cont_kernfs_path(cgrp->kn); 650 pr_cont_kernfs_path(cgrp->kn);
651} 651}
652 652
653static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
654{
655 return &cgrp->psi;
656}
657
653static inline void cgroup_init_kthreadd(void) 658static inline void cgroup_init_kthreadd(void)
654{ 659{
655 /* 660 /*
@@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
703 return NULL; 708 return NULL;
704} 709}
705 710
711static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
712{
713 return NULL;
714}
715
716static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
717{
718 return NULL;
719}
720
706static inline bool task_under_cgroup_hierarchy(struct task_struct *task, 721static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
707 struct cgroup *ancestor) 722 struct cgroup *ancestor)
708{ 723{
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 31c865d1842e..577d1b25fccd 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -57,7 +57,12 @@ struct task_delay_info {
57 57
58 u64 freepages_start; 58 u64 freepages_start;
59 u64 freepages_delay; /* wait for memory reclaim */ 59 u64 freepages_delay; /* wait for memory reclaim */
60
61 u64 thrashing_start;
62 u64 thrashing_delay; /* wait for thrashing page */
63
60 u32 freepages_count; /* total count of memory reclaim */ 64 u32 freepages_count; /* total count of memory reclaim */
65 u32 thrashing_count; /* total count of thrash waits */
61}; 66};
62#endif 67#endif
63 68
@@ -76,6 +81,8 @@ extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
76extern __u64 __delayacct_blkio_ticks(struct task_struct *); 81extern __u64 __delayacct_blkio_ticks(struct task_struct *);
77extern void __delayacct_freepages_start(void); 82extern void __delayacct_freepages_start(void);
78extern void __delayacct_freepages_end(void); 83extern void __delayacct_freepages_end(void);
84extern void __delayacct_thrashing_start(void);
85extern void __delayacct_thrashing_end(void);
79 86
80static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) 87static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
81{ 88{
@@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void)
156 __delayacct_freepages_end(); 163 __delayacct_freepages_end();
157} 164}
158 165
166static inline void delayacct_thrashing_start(void)
167{
168 if (current->delays)
169 __delayacct_thrashing_start();
170}
171
172static inline void delayacct_thrashing_end(void)
173{
174 if (current->delays)
175 __delayacct_thrashing_end();
176}
177
159#else 178#else
160static inline void delayacct_set_flag(int flag) 179static inline void delayacct_set_flag(int flag)
161{} 180{}
@@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void)
182{} 201{}
183static inline void delayacct_freepages_end(void) 202static inline void delayacct_freepages_end(void)
184{} 203{}
204static inline void delayacct_thrashing_start(void)
205{}
206static inline void delayacct_thrashing_end(void)
207{}
185 208
186#endif /* CONFIG_TASK_DELAY_ACCT */ 209#endif /* CONFIG_TASK_DELAY_ACCT */
187 210
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4c92e3ba3e16..dde947083d4e 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -107,7 +107,7 @@ enum hmm_pfn_flag_e {
107 * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory 107 * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
108 * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() 108 * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
109 * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the 109 * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
110 * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not 110 * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
111 * be mirrored by a device, because the entry will never have HMM_PFN_VALID 111 * be mirrored by a device, because the entry will never have HMM_PFN_VALID
112 * set and the pfn value is undefined. 112 * set and the pfn value is undefined.
113 * 113 *
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fdcb45999b26..4663ee96cf59 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page)
213} 213}
214 214
215struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, 215struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
216 pmd_t *pmd, int flags); 216 pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
217struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, 217struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
218 pud_t *pud, int flags); 218 pud_t *pud, int flags, struct dev_pagemap **pgmap);
219 219
220extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); 220extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
221 221
@@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm)
344} 344}
345 345
346static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, 346static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
347 unsigned long addr, pmd_t *pmd, int flags) 347 unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
348{ 348{
349 return NULL; 349 return NULL;
350} 350}
351 351
352static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, 352static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
353 unsigned long addr, pud_t *pud, int flags) 353 unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
354{ 354{
355 return NULL; 355 return NULL;
356} 356}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3555d54bf79a..9a4258154b25 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -6,6 +6,7 @@
6#include <linux/bitmap.h> 6#include <linux/bitmap.h>
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/mm_types.h>
9 10
10struct address_space; 11struct address_space;
11struct fiemap_extent_info; 12struct fiemap_extent_info;
@@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
141 bool *did_zero, const struct iomap_ops *ops); 142 bool *did_zero, const struct iomap_ops *ops);
142int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 143int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
143 const struct iomap_ops *ops); 144 const struct iomap_ops *ops);
144int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); 145vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
146 const struct iomap_ops *ops);
145int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 147int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
146 loff_t start, loff_t len, const struct iomap_ops *ops); 148 loff_t start, loff_t len, const struct iomap_ops *ops);
147loff_t iomap_seek_hole(struct inode *inode, loff_t offset, 149loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index d7618c41f74c..7c47b1a471d4 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -90,6 +90,7 @@
90#ifndef WEAK 90#ifndef WEAK
91#define WEAK(name) \ 91#define WEAK(name) \
92 .weak name ASM_NL \ 92 .weak name ASM_NL \
93 ALIGN ASM_NL \
93 name: 94 name:
94#endif 95#endif
95 96
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 837f2f2d1d34..bb2c84afb80c 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -281,4 +281,7 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
281} 281}
282#endif /* mul_u64_u32_div */ 282#endif /* mul_u64_u32_div */
283 283
284#define DIV64_U64_ROUND_UP(ll, d) \
285 ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
286
284#endif /* _LINUX_MATH64_H */ 287#endif /* _LINUX_MATH64_H */
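Annotation: DIV64_U64_ROUND_UP(ll, d) added above evaluates the divisor once and rounds the 64-bit division up instead of down. A minimal userspace sketch of the same rounding rule, using plain C division in place of the kernel's div64_u64() helper:

#include <stdio.h>
#include <stdint.h>

/* Same round-up rule as DIV64_U64_ROUND_UP, in plain C for illustration only. */
static uint64_t div64_round_up(uint64_t ll, uint64_t d)
{
        return (ll + d - 1) / d;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)div64_round_up(10, 4));   /* 3 */
        printf("%llu\n", (unsigned long long)div64_round_up(12, 4));   /* 3 */
        return 0;
}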
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 516920549378..2acdd046df2d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
265 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ 265 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
266 nid, flags, p_start, p_end, p_nid) 266 nid, flags, p_start, p_end, p_nid)
267 267
268/**
269 * for_each_resv_unavail_range - iterate through reserved and unavailable memory
270 * @i: u64 used as loop variable
271 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
272 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
273 *
274 * Walks over unavailable but reserved (reserved && !memory) areas of memblock.
275 * Available as soon as memblock is initialized.
276 * Note: because this memory does not belong to any physical node, flags and
277 * nid arguments do not make sense and thus not exported as arguments.
278 */
279#define for_each_resv_unavail_range(i, p_start, p_end) \
280 for_each_mem_range(i, &memblock.reserved, &memblock.memory, \
281 NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
282
283static inline void memblock_set_region_flags(struct memblock_region *r, 268static inline void memblock_set_region_flags(struct memblock_region *r,
284 enum memblock_flags flags) 269 enum memblock_flags flags)
285{ 270{
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 652f602167df..7ab2120155a4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -78,7 +78,7 @@ struct mem_cgroup_reclaim_cookie {
78 78
79struct mem_cgroup_id { 79struct mem_cgroup_id {
80 int id; 80 int id;
81 atomic_t ref; 81 refcount_t ref;
82}; 82};
83 83
84/* 84/*
@@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
1268void memcg_kmem_put_cache(struct kmem_cache *cachep); 1268void memcg_kmem_put_cache(struct kmem_cache *cachep);
1269int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 1269int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
1270 struct mem_cgroup *memcg); 1270 struct mem_cgroup *memcg);
1271
1272#ifdef CONFIG_MEMCG_KMEM
1271int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); 1273int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
1272void memcg_kmem_uncharge(struct page *page, int order); 1274void memcg_kmem_uncharge(struct page *page, int order);
1273 1275
1274#ifdef CONFIG_MEMCG_KMEM
1275extern struct static_key_false memcg_kmem_enabled_key; 1276extern struct static_key_false memcg_kmem_enabled_key;
1276extern struct workqueue_struct *memcg_kmem_cache_wq; 1277extern struct workqueue_struct *memcg_kmem_cache_wq;
1277 1278
@@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id);
1307extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, 1308extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
1308 int nid, int shrinker_id); 1309 int nid, int shrinker_id);
1309#else 1310#else
1311
1312static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
1313{
1314 return 0;
1315}
1316
1317static inline void memcg_kmem_uncharge(struct page *page, int order)
1318{
1319}
1320
1310#define for_each_memcg_cache_index(_idx) \ 1321#define for_each_memcg_cache_index(_idx) \
1311 for (; NULL; ) 1322 for (; NULL; )
1312 1323
diff --git a/include/linux/mm.h b/include/linux/mm.h
index daa2b8f1e9a8..1e52b8fd1685 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page)
848{ 848{
849 return page_zonenum(page) == ZONE_DEVICE; 849 return page_zonenum(page) == ZONE_DEVICE;
850} 850}
851extern void memmap_init_zone_device(struct zone *, unsigned long,
852 unsigned long, struct dev_pagemap *);
851#else 853#else
852static inline bool is_zone_device_page(const struct page *page) 854static inline bool is_zone_device_page(const struct page *page)
853{ 855{
@@ -2304,6 +2306,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
2304 unsigned long len, unsigned long prot, unsigned long flags, 2306 unsigned long len, unsigned long prot, unsigned long flags,
2305 vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, 2307 vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
2306 struct list_head *uf); 2308 struct list_head *uf);
2309extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
2310 struct list_head *uf, bool downgrade);
2307extern int do_munmap(struct mm_struct *, unsigned long, size_t, 2311extern int do_munmap(struct mm_struct *, unsigned long, size_t,
2308 struct list_head *uf); 2312 struct list_head *uf);
2309 2313
@@ -2502,11 +2506,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
2502int remap_pfn_range(struct vm_area_struct *, unsigned long addr, 2506int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
2503 unsigned long pfn, unsigned long size, pgprot_t); 2507 unsigned long pfn, unsigned long size, pgprot_t);
2504int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); 2508int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
2505int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2509vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2506 unsigned long pfn); 2510 unsigned long pfn);
2507int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, 2511vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2508 unsigned long pfn, pgprot_t pgprot); 2512 unsigned long pfn, pgprot_t pgprot);
2509int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2513vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2510 pfn_t pfn); 2514 pfn_t pfn);
2511vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, 2515vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2512 unsigned long addr, pfn_t pfn); 2516 unsigned long addr, pfn_t pfn);
@@ -2525,32 +2529,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
2525 return VM_FAULT_NOPAGE; 2529 return VM_FAULT_NOPAGE;
2526} 2530}
2527 2531
2528static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma,
2529 unsigned long addr, pfn_t pfn)
2530{
2531 int err = vm_insert_mixed(vma, addr, pfn);
2532
2533 if (err == -ENOMEM)
2534 return VM_FAULT_OOM;
2535 if (err < 0 && err != -EBUSY)
2536 return VM_FAULT_SIGBUS;
2537
2538 return VM_FAULT_NOPAGE;
2539}
2540
2541static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
2542 unsigned long addr, unsigned long pfn)
2543{
2544 int err = vm_insert_pfn(vma, addr, pfn);
2545
2546 if (err == -ENOMEM)
2547 return VM_FAULT_OOM;
2548 if (err < 0 && err != -EBUSY)
2549 return VM_FAULT_SIGBUS;
2550
2551 return VM_FAULT_NOPAGE;
2552}
2553
2554static inline vm_fault_t vmf_error(int err) 2532static inline vm_fault_t vmf_error(int err)
2555{ 2533{
2556 if (err == -ENOMEM) 2534 if (err == -ENOMEM)
@@ -2558,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err)
2558 return VM_FAULT_SIGBUS; 2536 return VM_FAULT_SIGBUS;
2559} 2537}
2560 2538
2561struct page *follow_page_mask(struct vm_area_struct *vma, 2539struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
2562 unsigned long address, unsigned int foll_flags, 2540 unsigned int foll_flags);
2563 unsigned int *page_mask);
2564
2565static inline struct page *follow_page(struct vm_area_struct *vma,
2566 unsigned long address, unsigned int foll_flags)
2567{
2568 unsigned int unused_page_mask;
2569 return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
2570}
2571 2541
2572#define FOLL_WRITE 0x01 /* check pte is writable */ 2542#define FOLL_WRITE 0x01 /* check pte is writable */
2573#define FOLL_TOUCH 0x02 /* mark page accessed */ 2543#define FOLL_TOUCH 0x02 /* mark page accessed */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 133ba78820ee..9893a6432adf 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -2,7 +2,6 @@
2#ifndef _LINUX_MMU_NOTIFIER_H 2#ifndef _LINUX_MMU_NOTIFIER_H
3#define _LINUX_MMU_NOTIFIER_H 3#define _LINUX_MMU_NOTIFIER_H
4 4
5#include <linux/types.h>
6#include <linux/list.h> 5#include <linux/list.h>
7#include <linux/spinlock.h> 6#include <linux/spinlock.h>
8#include <linux/mm_types.h> 7#include <linux/mm_types.h>
@@ -11,9 +10,6 @@
11struct mmu_notifier; 10struct mmu_notifier;
12struct mmu_notifier_ops; 11struct mmu_notifier_ops;
13 12
14/* mmu_notifier_ops flags */
15#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
16
17#ifdef CONFIG_MMU_NOTIFIER 13#ifdef CONFIG_MMU_NOTIFIER
18 14
19/* 15/*
@@ -31,15 +27,6 @@ struct mmu_notifier_mm {
31 27
32struct mmu_notifier_ops { 28struct mmu_notifier_ops {
33 /* 29 /*
34 * Flags to specify behavior of callbacks for this MMU notifier.
35 * Used to determine which context an operation may be called.
36 *
37 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
38 * block
39 */
40 int flags;
41
42 /*
43 * Called either by mmu_notifier_unregister or when the mm is 30 * Called either by mmu_notifier_unregister or when the mm is
44 * being destroyed by exit_mmap, always before all pages are 31 * being destroyed by exit_mmap, always before all pages are
45 * freed. This can run concurrently with other mmu notifier 32 * freed. This can run concurrently with other mmu notifier
@@ -153,7 +140,9 @@ struct mmu_notifier_ops {
153 * 140 *
154 * If blockable argument is set to false then the callback cannot 141 * If blockable argument is set to false then the callback cannot
155 * sleep and has to return with -EAGAIN. 0 should be returned 142 * sleep and has to return with -EAGAIN. 0 should be returned
156 * otherwise. 143 * otherwise. Please note that if invalidate_range_start approves
144 * a non-blocking behavior then the same applies to
145 * invalidate_range_end.
157 * 146 *
158 */ 147 */
159 int (*invalidate_range_start)(struct mmu_notifier *mn, 148 int (*invalidate_range_start)(struct mmu_notifier *mn,
@@ -181,10 +170,6 @@ struct mmu_notifier_ops {
181 * Note that this function might be called with just a sub-range 170 * Note that this function might be called with just a sub-range
182 * of what was passed to invalidate_range_start()/end(), if 171 * of what was passed to invalidate_range_start()/end(), if
183 * called between those functions. 172 * called between those functions.
184 *
185 * If this callback cannot block, and invalidate_range_{start,end}
186 * cannot block, mmu_notifier_ops.flags should have
187 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
188 */ 173 */
189 void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, 174 void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
190 unsigned long start, unsigned long end); 175 unsigned long start, unsigned long end);
@@ -239,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
239 bool only_end); 224 bool only_end);
240extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, 225extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
241 unsigned long start, unsigned long end); 226 unsigned long start, unsigned long end);
242extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
243 227
244static inline void mmu_notifier_release(struct mm_struct *mm) 228static inline void mmu_notifier_release(struct mm_struct *mm)
245{ 229{
@@ -493,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
493{ 477{
494} 478}
495 479
496static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
497{
498 return false;
499}
500
501static inline void mmu_notifier_mm_init(struct mm_struct *mm) 480static inline void mmu_notifier_mm_init(struct mm_struct *mm)
502{ 481{
503} 482}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d4b0c79d2924..9f0caccd5833 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -161,8 +161,10 @@ enum node_stat_item {
161 NR_SLAB_UNRECLAIMABLE, 161 NR_SLAB_UNRECLAIMABLE,
162 NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ 162 NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
163 NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ 163 NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
164 WORKINGSET_NODES,
164 WORKINGSET_REFAULT, 165 WORKINGSET_REFAULT,
165 WORKINGSET_ACTIVATE, 166 WORKINGSET_ACTIVATE,
167 WORKINGSET_RESTORE,
166 WORKINGSET_NODERECLAIM, 168 WORKINGSET_NODERECLAIM,
167 NR_ANON_MAPPED, /* Mapped anonymous pages */ 169 NR_ANON_MAPPED, /* Mapped anonymous pages */
168 NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. 170 NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
@@ -180,7 +182,7 @@ enum node_stat_item {
180 NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ 182 NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
181 NR_DIRTIED, /* page dirtyings since bootup */ 183 NR_DIRTIED, /* page dirtyings since bootup */
182 NR_WRITTEN, /* page writings since bootup */ 184 NR_WRITTEN, /* page writings since bootup */
183 NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ 185 NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
184 NR_VM_NODE_STAT_ITEMS 186 NR_VM_NODE_STAT_ITEMS
185}; 187};
186 188
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74bee8cecf4c..50ce1bddaf56 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -69,13 +69,14 @@
69 */ 69 */
70enum pageflags { 70enum pageflags {
71 PG_locked, /* Page is locked. Don't touch. */ 71 PG_locked, /* Page is locked. Don't touch. */
72 PG_error,
73 PG_referenced, 72 PG_referenced,
74 PG_uptodate, 73 PG_uptodate,
75 PG_dirty, 74 PG_dirty,
76 PG_lru, 75 PG_lru,
77 PG_active, 76 PG_active,
77 PG_workingset,
78 PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ 78 PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
79 PG_error,
79 PG_slab, 80 PG_slab,
80 PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ 81 PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
81 PG_arch_1, 82 PG_arch_1,
@@ -162,6 +163,14 @@ static inline int PagePoisoned(const struct page *page)
162 return page->flags == PAGE_POISON_PATTERN; 163 return page->flags == PAGE_POISON_PATTERN;
163} 164}
164 165
166#ifdef CONFIG_DEBUG_VM
167void page_init_poison(struct page *page, size_t size);
168#else
169static inline void page_init_poison(struct page *page, size_t size)
170{
171}
172#endif
173
165/* 174/*
166 * Page flags policies wrt compound pages 175 * Page flags policies wrt compound pages
167 * 176 *
@@ -280,6 +289,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
280PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) 289PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
281PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) 290PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
282 TESTCLEARFLAG(Active, active, PF_HEAD) 291 TESTCLEARFLAG(Active, active, PF_HEAD)
292PAGEFLAG(Workingset, workingset, PF_HEAD)
293 TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
283__PAGEFLAG(Slab, slab, PF_NO_TAIL) 294__PAGEFLAG(Slab, slab, PF_NO_TAIL)
284__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) 295__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
285PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ 296PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
@@ -292,6 +303,7 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
292 303
293PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) 304PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
294 __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) 305 __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
306 __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
295PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) 307PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
296 __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) 308 __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
297 __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) 309 __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 21713dc14ce2..7bb77850c65a 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -9,8 +9,10 @@
9 * PFN_SG_LAST - pfn references a page and is the last scatterlist entry 9 * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
10 * PFN_DEV - pfn is not covered by system memmap by default 10 * PFN_DEV - pfn is not covered by system memmap by default
11 * PFN_MAP - pfn has a dynamic page mapping established by a device driver 11 * PFN_MAP - pfn has a dynamic page mapping established by a device driver
12 * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not
13 * get_user_pages
12 */ 14 */
13#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) 15#define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
14#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) 16#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1))
15#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) 17#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2))
16#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) 18#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
diff --git a/include/linux/psi.h b/include/linux/psi.h
new file mode 100644
index 000000000000..8e0725aac0aa
--- /dev/null
+++ b/include/linux/psi.h
@@ -0,0 +1,53 @@
1#ifndef _LINUX_PSI_H
2#define _LINUX_PSI_H
3
4#include <linux/psi_types.h>
5#include <linux/sched.h>
6
7struct seq_file;
8struct css_set;
9
10#ifdef CONFIG_PSI
11
12extern bool psi_disabled;
13
14void psi_init(void);
15
16void psi_task_change(struct task_struct *task, int clear, int set);
17
18void psi_memstall_tick(struct task_struct *task, int cpu);
19void psi_memstall_enter(unsigned long *flags);
20void psi_memstall_leave(unsigned long *flags);
21
22int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
23
24#ifdef CONFIG_CGROUPS
25int psi_cgroup_alloc(struct cgroup *cgrp);
26void psi_cgroup_free(struct cgroup *cgrp);
27void cgroup_move_task(struct task_struct *p, struct css_set *to);
28#endif
29
30#else /* CONFIG_PSI */
31
32static inline void psi_init(void) {}
33
34static inline void psi_memstall_enter(unsigned long *flags) {}
35static inline void psi_memstall_leave(unsigned long *flags) {}
36
37#ifdef CONFIG_CGROUPS
38static inline int psi_cgroup_alloc(struct cgroup *cgrp)
39{
40 return 0;
41}
42static inline void psi_cgroup_free(struct cgroup *cgrp)
43{
44}
45static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
46{
47 rcu_assign_pointer(p->cgroups, to);
48}
49#endif
50
51#endif /* CONFIG_PSI */
52
53#endif /* _LINUX_PSI_H */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
new file mode 100644
index 000000000000..2cf422db5d18
--- /dev/null
+++ b/include/linux/psi_types.h
@@ -0,0 +1,92 @@
1#ifndef _LINUX_PSI_TYPES_H
2#define _LINUX_PSI_TYPES_H
3
4#include <linux/seqlock.h>
5#include <linux/types.h>
6
7#ifdef CONFIG_PSI
8
9/* Tracked task states */
10enum psi_task_count {
11 NR_IOWAIT,
12 NR_MEMSTALL,
13 NR_RUNNING,
14 NR_PSI_TASK_COUNTS,
15};
16
17/* Task state bitmasks */
18#define TSK_IOWAIT (1 << NR_IOWAIT)
19#define TSK_MEMSTALL (1 << NR_MEMSTALL)
20#define TSK_RUNNING (1 << NR_RUNNING)
21
22/* Resources that workloads could be stalled on */
23enum psi_res {
24 PSI_IO,
25 PSI_MEM,
26 PSI_CPU,
27 NR_PSI_RESOURCES,
28};
29
30/*
31 * Pressure states for each resource:
32 *
33 * SOME: Stalled tasks & working tasks
34 * FULL: Stalled tasks & no working tasks
35 */
36enum psi_states {
37 PSI_IO_SOME,
38 PSI_IO_FULL,
39 PSI_MEM_SOME,
40 PSI_MEM_FULL,
41 PSI_CPU_SOME,
42 /* Only per-CPU, to weigh the CPU in the global average: */
43 PSI_NONIDLE,
44 NR_PSI_STATES,
45};
46
47struct psi_group_cpu {
48 /* 1st cacheline updated by the scheduler */
49
50 /* Aggregator needs to know of concurrent changes */
51 seqcount_t seq ____cacheline_aligned_in_smp;
52
53 /* States of the tasks belonging to this group */
54 unsigned int tasks[NR_PSI_TASK_COUNTS];
55
56 /* Period time sampling buckets for each state of interest (ns) */
57 u32 times[NR_PSI_STATES];
58
59 /* Time of last task change in this group (rq_clock) */
60 u64 state_start;
61
62 /* 2nd cacheline updated by the aggregator */
63
64 /* Delta detection against the sampling buckets */
65 u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
66};
67
68struct psi_group {
69 /* Protects data updated during an aggregation */
70 struct mutex stat_lock;
71
72 /* Per-cpu task state & time tracking */
73 struct psi_group_cpu __percpu *pcpu;
74
75 /* Periodic aggregation state */
76 u64 total_prev[NR_PSI_STATES - 1];
77 u64 last_update;
78 u64 next_update;
79 struct delayed_work clock_work;
80
81 /* Total stall times and sampled pressure averages */
82 u64 total[NR_PSI_STATES - 1];
83 unsigned long avg[NR_PSI_STATES - 1][3];
84};
85
86#else /* CONFIG_PSI */
87
88struct psi_group { };
89
90#endif /* CONFIG_PSI */
91
92#endif /* _LINUX_PSI_TYPES_H */
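Annotation: based only on the comment in the new header above ("SOME: Stalled tasks & working tasks, FULL: Stalled tasks & no working tasks"), the sketch below shows one way the per-CPU tasks[] counts could be folded into the memory SOME/FULL states. The helper names and the aggregation rule are assumptions for illustration, not the kernel's implementation.

#include <stdio.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_PSI_TASK_COUNTS };

/* Assumed reading of the SOME/FULL comment in psi_types.h, not kernel code. */
static int mem_some(const unsigned int tasks[])
{
        return tasks[NR_MEMSTALL] > 0;
}

static int mem_full(const unsigned int tasks[])
{
        return tasks[NR_MEMSTALL] > 0 && tasks[NR_RUNNING] == 0;
}

int main(void)
{
        unsigned int tasks[NR_PSI_TASK_COUNTS] = { 0, 2, 1 };   /* 2 stalled, 1 running */

        printf("some=%d full=%d\n", mem_some(tasks), mem_full(tasks)); /* some=1 full=0 */
        return 0;
}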
diff --git a/include/linux/sched.h b/include/linux/sched.h
index adfb3f9a7597..8f8a5418b627 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,6 +25,7 @@
25#include <linux/latencytop.h> 25#include <linux/latencytop.h>
26#include <linux/sched/prio.h> 26#include <linux/sched/prio.h>
27#include <linux/signal_types.h> 27#include <linux/signal_types.h>
28#include <linux/psi_types.h>
28#include <linux/mm_types_task.h> 29#include <linux/mm_types_task.h>
29#include <linux/task_io_accounting.h> 30#include <linux/task_io_accounting.h>
30#include <linux/rseq.h> 31#include <linux/rseq.h>
@@ -706,6 +707,10 @@ struct task_struct {
706 unsigned sched_contributes_to_load:1; 707 unsigned sched_contributes_to_load:1;
707 unsigned sched_migrated:1; 708 unsigned sched_migrated:1;
708 unsigned sched_remote_wakeup:1; 709 unsigned sched_remote_wakeup:1;
710#ifdef CONFIG_PSI
711 unsigned sched_psi_wake_requeue:1;
712#endif
713
709 /* Force alignment to the next boundary: */ 714 /* Force alignment to the next boundary: */
710 unsigned :0; 715 unsigned :0;
711 716
@@ -719,9 +724,6 @@ struct task_struct {
719#endif 724#endif
720#ifdef CONFIG_MEMCG 725#ifdef CONFIG_MEMCG
721 unsigned in_user_fault:1; 726 unsigned in_user_fault:1;
722#ifdef CONFIG_MEMCG_KMEM
723 unsigned memcg_kmem_skip_account:1;
724#endif
725#endif 727#endif
726#ifdef CONFIG_COMPAT_BRK 728#ifdef CONFIG_COMPAT_BRK
727 unsigned brk_randomized:1; 729 unsigned brk_randomized:1;
@@ -965,6 +967,10 @@ struct task_struct {
965 kernel_siginfo_t *last_siginfo; 967 kernel_siginfo_t *last_siginfo;
966 968
967 struct task_io_accounting ioac; 969 struct task_io_accounting ioac;
970#ifdef CONFIG_PSI
971 /* Pressure stall state */
972 unsigned int psi_flags;
973#endif
968#ifdef CONFIG_TASK_XACCT 974#ifdef CONFIG_TASK_XACCT
969 /* Accumulated RSS usage: */ 975 /* Accumulated RSS usage: */
970 u64 acct_rss_mem1; 976 u64 acct_rss_mem1;
@@ -1391,6 +1397,7 @@ extern struct pid *cad_pid;
1391#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1397#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
1392#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ 1398#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
1393#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1399#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1400#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
1394#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ 1401#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
1395#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1402#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1396#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1403#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 80bc84ba5d2a..4859bea47a7b 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -22,10 +22,26 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
22#define EXP_5 2014 /* 1/exp(5sec/5min) */ 22#define EXP_5 2014 /* 1/exp(5sec/5min) */
23#define EXP_15 2037 /* 1/exp(5sec/15min) */ 23#define EXP_15 2037 /* 1/exp(5sec/15min) */
24 24
25#define CALC_LOAD(load,exp,n) \ 25/*
26 load *= exp; \ 26 * a1 = a0 * e + a * (1 - e)
27 load += n*(FIXED_1-exp); \ 27 */
28 load >>= FSHIFT; 28static inline unsigned long
29calc_load(unsigned long load, unsigned long exp, unsigned long active)
30{
31 unsigned long newload;
32
33 newload = load * exp + active * (FIXED_1 - exp);
34 if (active >= load)
35 newload += FIXED_1-1;
36
37 return newload / FIXED_1;
38}
39
40extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
41 unsigned long active, unsigned int n);
42
43#define LOAD_INT(x) ((x) >> FSHIFT)
44#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
29 45
30extern void calc_global_load(unsigned long ticks); 46extern void calc_global_load(unsigned long ticks);
31 47
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ed9cbddeb4a6..918f374e7156 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -295,12 +295,43 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
295#define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ 295#define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \
296 (KMALLOC_MIN_SIZE) : 16) 296 (KMALLOC_MIN_SIZE) : 16)
297 297
298/*
299 * Whenever changing this, take care of that kmalloc_type() and
300 * create_kmalloc_caches() still work as intended.
301 */
302enum kmalloc_cache_type {
303 KMALLOC_NORMAL = 0,
304 KMALLOC_RECLAIM,
305#ifdef CONFIG_ZONE_DMA
306 KMALLOC_DMA,
307#endif
308 NR_KMALLOC_TYPES
309};
310
298#ifndef CONFIG_SLOB 311#ifndef CONFIG_SLOB
299extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; 312extern struct kmem_cache *
313kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
314
315static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
316{
317 int is_dma = 0;
318 int type_dma = 0;
319 int is_reclaimable;
320
300#ifdef CONFIG_ZONE_DMA 321#ifdef CONFIG_ZONE_DMA
301extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; 322 is_dma = !!(flags & __GFP_DMA);
323 type_dma = is_dma * KMALLOC_DMA;
302#endif 324#endif
303 325
326 is_reclaimable = !!(flags & __GFP_RECLAIMABLE);
327
328 /*
329 * If an allocation is both __GFP_DMA and __GFP_RECLAIMABLE, return
330 * KMALLOC_DMA and effectively ignore __GFP_RECLAIMABLE
331 */
332 return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM;
333}
334
304/* 335/*
305 * Figure out which kmalloc slab an allocation of a certain size 336 * Figure out which kmalloc slab an allocation of a certain size
306 * belongs to. 337 * belongs to.
@@ -501,18 +532,20 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
501static __always_inline void *kmalloc(size_t size, gfp_t flags) 532static __always_inline void *kmalloc(size_t size, gfp_t flags)
502{ 533{
503 if (__builtin_constant_p(size)) { 534 if (__builtin_constant_p(size)) {
535#ifndef CONFIG_SLOB
536 unsigned int index;
537#endif
504 if (size > KMALLOC_MAX_CACHE_SIZE) 538 if (size > KMALLOC_MAX_CACHE_SIZE)
505 return kmalloc_large(size, flags); 539 return kmalloc_large(size, flags);
506#ifndef CONFIG_SLOB 540#ifndef CONFIG_SLOB
507 if (!(flags & GFP_DMA)) { 541 index = kmalloc_index(size);
508 unsigned int index = kmalloc_index(size);
509 542
510 if (!index) 543 if (!index)
511 return ZERO_SIZE_PTR; 544 return ZERO_SIZE_PTR;
512 545
513 return kmem_cache_alloc_trace(kmalloc_caches[index], 546 return kmem_cache_alloc_trace(
514 flags, size); 547 kmalloc_caches[kmalloc_type(flags)][index],
515 } 548 flags, size);
516#endif 549#endif
517 } 550 }
518 return __kmalloc(size, flags); 551 return __kmalloc(size, flags);
@@ -542,13 +575,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
542{ 575{
543#ifndef CONFIG_SLOB 576#ifndef CONFIG_SLOB
544 if (__builtin_constant_p(size) && 577 if (__builtin_constant_p(size) &&
545 size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { 578 size <= KMALLOC_MAX_CACHE_SIZE) {
546 unsigned int i = kmalloc_index(size); 579 unsigned int i = kmalloc_index(size);
547 580
548 if (!i) 581 if (!i)
549 return ZERO_SIZE_PTR; 582 return ZERO_SIZE_PTR;
550 583
551 return kmem_cache_alloc_node_trace(kmalloc_caches[i], 584 return kmem_cache_alloc_node_trace(
585 kmalloc_caches[kmalloc_type(flags)][i],
552 flags, node, size); 586 flags, node, size);
553 } 587 }
554#endif 588#endif
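Annotation: kmalloc_type() added above maps a kmalloc() call's gfp flags to one of the new kmalloc cache arrays: __GFP_DMA dominates, __GFP_RECLAIMABLE (as now used for external dentry names earlier in this diff) selects KMALLOC_RECLAIM, and everything else stays in KMALLOC_NORMAL. The standalone sketch below reproduces that branchless selection with stand-in flag bits; the real __GFP_* values live in gfp.h and are not shown here.

#include <stdio.h>

/* Stand-in flag bits for illustration; not the real __GFP_* values. */
#define GFP_DMA_BIT          0x01u
#define GFP_RECLAIMABLE_BIT  0x02u

enum kmalloc_cache_type { KMALLOC_NORMAL = 0, KMALLOC_RECLAIM, KMALLOC_DMA, NR_KMALLOC_TYPES };

/* Same structure as kmalloc_type(): DMA wins, otherwise reclaimable, otherwise normal. */
static enum kmalloc_cache_type kmalloc_type(unsigned int flags)
{
        int is_dma = !!(flags & GFP_DMA_BIT);
        int is_reclaimable = !!(flags & GFP_RECLAIMABLE_BIT);
        int type_dma = is_dma * KMALLOC_DMA;

        return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM;
}

int main(void)
{
        printf("%d %d %d %d\n",
               kmalloc_type(0),                                  /* 0: KMALLOC_NORMAL  */
               kmalloc_type(GFP_RECLAIMABLE_BIT),                /* 1: KMALLOC_RECLAIM */
               kmalloc_type(GFP_DMA_BIT),                        /* 2: KMALLOC_DMA     */
               kmalloc_type(GFP_DMA_BIT | GFP_RECLAIMABLE_BIT)); /* 2: DMA wins        */
        return 0;
}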
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8e2c11e692ba..38195f5c96b1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -167,13 +167,14 @@ enum {
167 SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ 167 SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
168 SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ 168 SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
169 SWP_BLKDEV = (1 << 6), /* its a block device */ 169 SWP_BLKDEV = (1 << 6), /* its a block device */
170 SWP_FILE = (1 << 7), /* set after swap_activate success */ 170 SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
171 SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ 171 SWP_FS = (1 << 8), /* swap file goes through fs */
172 SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ 172 SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
173 SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ 173 SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
174 SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ 174 SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
175 SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
175 /* add others here before... */ 176 /* add others here before... */
176 SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ 177 SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
177}; 178};
178 179
179#define SWAP_CLUSTER_MAX 32UL 180#define SWAP_CLUSTER_MAX 32UL
@@ -296,7 +297,7 @@ struct vma_swap_readahead {
296 297
297/* linux/mm/workingset.c */ 298/* linux/mm/workingset.c */
298void *workingset_eviction(struct address_space *mapping, struct page *page); 299void *workingset_eviction(struct address_space *mapping, struct page *page);
299bool workingset_refault(void *shadow); 300void workingset_refault(struct page *page, void *shadow);
300void workingset_activation(struct page *page); 301void workingset_activation(struct page *page);
301 302
302/* Do not use directly, use workingset_lookup_update */ 303/* Do not use directly, use workingset_lookup_update */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a81cffb76d89..a1675d43777e 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -88,6 +88,7 @@
88 {1UL << PG_dirty, "dirty" }, \ 88 {1UL << PG_dirty, "dirty" }, \
89 {1UL << PG_lru, "lru" }, \ 89 {1UL << PG_lru, "lru" }, \
90 {1UL << PG_active, "active" }, \ 90 {1UL << PG_active, "active" }, \
91 {1UL << PG_workingset, "workingset" }, \
91 {1UL << PG_slab, "slab" }, \ 92 {1UL << PG_slab, "slab" }, \
92 {1UL << PG_owner_priv_1, "owner_priv_1" }, \ 93 {1UL << PG_owner_priv_1, "owner_priv_1" }, \
93 {1UL << PG_arch_1, "arch_1" }, \ 94 {1UL << PG_arch_1, "arch_1" }, \
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index b7aa7bb2349f..5e8ca16a9079 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
34 */ 34 */
35 35
36 36
37#define TASKSTATS_VERSION 8 37#define TASKSTATS_VERSION 9
38#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN 38#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
39 * in linux/sched.h */ 39 * in linux/sched.h */
40 40
@@ -164,6 +164,10 @@ struct taskstats {
164 /* Delay waiting for memory reclaim */ 164 /* Delay waiting for memory reclaim */
165 __u64 freepages_count; 165 __u64 freepages_count;
166 __u64 freepages_delay_total; 166 __u64 freepages_delay_total;
167
168 /* Delay waiting for thrashing page */
169 __u64 thrashing_count;
170 __u64 thrashing_delay_total;
167}; 171};
168 172
169 173
diff --git a/init/Kconfig b/init/Kconfig
index 317d5ccb5191..a4112e95724a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -490,6 +490,25 @@ config TASK_IO_ACCOUNTING
490 490
491 Say N if unsure. 491 Say N if unsure.
492 492
493config PSI
494 bool "Pressure stall information tracking"
495 help
496 Collect metrics that indicate how overcommitted the CPU, memory,
497 and IO capacity are in the system.
498
499 If you say Y here, the kernel will create /proc/pressure/ with the
500 pressure statistics files cpu, memory, and io. These will indicate
501 the share of walltime in which some or all tasks in the system are
502 delayed due to contention of the respective resource.
503
504 In kernels with cgroup support, cgroups (cgroup2 only) will
505 have cpu.pressure, memory.pressure, and io.pressure files,
506 which aggregate pressure stalls for the grouped tasks only.
507
508 For more details see Documentation/accounting/psi.txt.
509
510 Say N if unsure.
511
493endmenu # "CPU/Task time and stats accounting" 512endmenu # "CPU/Task time and stats accounting"
494 513
495config CPU_ISOLATION 514config CPU_ISOLATION
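
The help text above describes the interface this series adds: with CONFIG_PSI=y the kernel creates /proc/pressure/cpu, /proc/pressure/memory and /proc/pressure/io. As a rough userspace illustration (not part of the patch; the line format follows the psi_show() formatter added in kernel/sched/psi.c further down, "some avg10=... avg60=... avg300=... total=..."), a reader of the memory file might look like this:

/* Illustrative sketch only -- not kernel code. Parses the "some"/"full"
 * lines produced by psi_show(): three decaying averages in percent and
 * a cumulative stall total in microseconds. */
#include <stdio.h>

int main(void)
{
        char kind[8];
        float avg10, avg60, avg300;
        unsigned long long total;
        FILE *f = fopen("/proc/pressure/memory", "r");

        if (!f) {
                perror("fopen");        /* CONFIG_PSI=n or psi_disabled */
                return 1;
        }
        while (fscanf(f, "%7s avg10=%f avg60=%f avg300=%f total=%llu",
                      kind, &avg10, &avg60, &avg300, &total) == 5)
                printf("%s: %.2f%% of the last 10s stalled, %llu us total\n",
                       kind, avg10, total);
        fclose(f);
        return 0;
}

The io and cpu files use the same format; cpu.pressure carries only a "some" line because, as the model comment in psi.c notes, FULL is not defined for the CPU resource.
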
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4c1cf0969a80..8b79318810ad 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
55#include <linux/nsproxy.h> 55#include <linux/nsproxy.h>
56#include <linux/file.h> 56#include <linux/file.h>
57#include <linux/sched/cputime.h> 57#include <linux/sched/cputime.h>
58#include <linux/psi.h>
58#include <net/sock.h> 59#include <net/sock.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
@@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task,
862 */ 863 */
863 WARN_ON_ONCE(task->flags & PF_EXITING); 864 WARN_ON_ONCE(task->flags & PF_EXITING);
864 865
865 rcu_assign_pointer(task->cgroups, to_cset); 866 cgroup_move_task(task, to_cset);
866 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : 867 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
867 &to_cset->tasks); 868 &to_cset->tasks);
868 } 869 }
@@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
3446 return ret; 3447 return ret;
3447} 3448}
3448 3449
3450#ifdef CONFIG_PSI
3451static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3452{
3453 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
3454}
3455static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3456{
3457 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
3458}
3459static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3460{
3461 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
3462}
3463#endif
3464
3449static int cgroup_file_open(struct kernfs_open_file *of) 3465static int cgroup_file_open(struct kernfs_open_file *of)
3450{ 3466{
3451 struct cftype *cft = of->kn->priv; 3467 struct cftype *cft = of->kn->priv;
@@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = {
4576 .flags = CFTYPE_NOT_ON_ROOT, 4592 .flags = CFTYPE_NOT_ON_ROOT,
4577 .seq_show = cpu_stat_show, 4593 .seq_show = cpu_stat_show,
4578 }, 4594 },
4595#ifdef CONFIG_PSI
4596 {
4597 .name = "io.pressure",
4598 .flags = CFTYPE_NOT_ON_ROOT,
4599 .seq_show = cgroup_io_pressure_show,
4600 },
4601 {
4602 .name = "memory.pressure",
4603 .flags = CFTYPE_NOT_ON_ROOT,
4604 .seq_show = cgroup_memory_pressure_show,
4605 },
4606 {
4607 .name = "cpu.pressure",
4608 .flags = CFTYPE_NOT_ON_ROOT,
4609 .seq_show = cgroup_cpu_pressure_show,
4610 },
4611#endif
4579 { } /* terminate */ 4612 { } /* terminate */
4580}; 4613};
4581 4614
@@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work)
4636 */ 4669 */
4637 cgroup_put(cgroup_parent(cgrp)); 4670 cgroup_put(cgroup_parent(cgrp));
4638 kernfs_put(cgrp->kn); 4671 kernfs_put(cgrp->kn);
4672 psi_cgroup_free(cgrp);
4639 if (cgroup_on_dfl(cgrp)) 4673 if (cgroup_on_dfl(cgrp))
4640 cgroup_rstat_exit(cgrp); 4674 cgroup_rstat_exit(cgrp);
4641 kfree(cgrp); 4675 kfree(cgrp);
@@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4892 cgrp->self.parent = &parent->self; 4926 cgrp->self.parent = &parent->self;
4893 cgrp->root = root; 4927 cgrp->root = root;
4894 cgrp->level = level; 4928 cgrp->level = level;
4895 ret = cgroup_bpf_inherit(cgrp); 4929
4930 ret = psi_cgroup_alloc(cgrp);
4896 if (ret) 4931 if (ret)
4897 goto out_idr_free; 4932 goto out_idr_free;
4898 4933
4934 ret = cgroup_bpf_inherit(cgrp);
4935 if (ret)
4936 goto out_psi_free;
4937
4899 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { 4938 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
4900 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; 4939 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
4901 4940
@@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4933 4972
4934 return cgrp; 4973 return cgrp;
4935 4974
4975out_psi_free:
4976 psi_cgroup_free(cgrp);
4936out_idr_free: 4977out_idr_free:
4937 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4978 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4938out_stat_exit: 4979out_stat_exit:
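
Alongside the /proc/pressure files, the cftype entries above add io.pressure, memory.pressure and cpu.pressure to every non-root cgroup2 directory, aggregating stalls for the grouped tasks only. A hedged sketch of dumping them for one group (the mount point and the group name "workload" are examples, nothing mandated by the patch):

/* Illustrative sketch only: read the per-cgroup pressure files added
 * above; they share the psi_show() format with /proc/pressure/. */
#include <stdio.h>

int main(void)
{
        static const char *files[] = {
                "io.pressure", "memory.pressure", "cpu.pressure"
        };
        char path[256], line[128];
        int i;

        for (i = 0; i < 3; i++) {
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/fs/cgroup/workload/%s", files[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;       /* no cgroup2 mount, or PSI disabled */
                printf("%s:\n", files[i]);
                while (fgets(line, sizeof(line), f))
                        printf("  %s", line);
                fclose(f);
        }
        return 0;
}
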
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2ddfce8f1e8f..bb4fe4e1a601 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv)
2556 } 2556 }
2557 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); 2557 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
2558 2558
2559 /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
2560
2561#define LOAD_INT(x) ((x) >> FSHIFT)
2562#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
2563 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", 2559 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
2564 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), 2560 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
2565 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), 2561 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
2566 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); 2562 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
2567#undef LOAD_INT 2563
2568#undef LOAD_FRAC
2569 /* Display in kilobytes */ 2564 /* Display in kilobytes */
2570#define K(x) ((x) << (PAGE_SHIFT - 10)) 2565#define K(x) ((x) << (PAGE_SHIFT - 10))
2571 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" 2566 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
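
kdb drops its private copies of LOAD_INT() and LOAD_FRAC() and relies on the shared definitions consolidated elsewhere in this series; psi_show() below reuses the same macros to print the pressure averages. A small standalone sketch of the fixed-point split they perform (FSHIFT and FIXED_1 values as in the kernel's loadavg header):

/* Illustrative sketch only: LOAD_INT()/LOAD_FRAC() turn a value scaled
 * by FIXED_1 (11 fractional bits) into the "X.YY" shown by kdb, the
 * load average, and the new pressure files. */
#include <stdio.h>

#define FSHIFT          11
#define FIXED_1         (1 << FSHIFT)
#define LOAD_INT(x)     ((x) >> FSHIFT)
#define LOAD_FRAC(x)    LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long samples[] = { 0, 563, FIXED_1, 3 * FIXED_1 / 2 };
        unsigned int i;

        for (i = 0; i < 4; i++)
                printf("raw %4lu -> %lu.%02lu\n", samples[i],
                       LOAD_INT(samples[i]), LOAD_FRAC(samples[i]));
        return 0;       /* 563 prints as 0.27, 3072 as 1.50 */
}
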
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ca8ac2824f0b..2a12b988c717 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
135 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 135 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
136 tmp = d->freepages_delay_total + tsk->delays->freepages_delay; 136 tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
137 d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; 137 d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
138 tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
139 d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
138 d->blkio_count += tsk->delays->blkio_count; 140 d->blkio_count += tsk->delays->blkio_count;
139 d->swapin_count += tsk->delays->swapin_count; 141 d->swapin_count += tsk->delays->swapin_count;
140 d->freepages_count += tsk->delays->freepages_count; 142 d->freepages_count += tsk->delays->freepages_count;
143 d->thrashing_count += tsk->delays->thrashing_count;
141 raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); 144 raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
142 145
143 return 0; 146 return 0;
@@ -169,3 +172,15 @@ void __delayacct_freepages_end(void)
169 &current->delays->freepages_count); 172 &current->delays->freepages_count);
170} 173}
171 174
175void __delayacct_thrashing_start(void)
176{
177 current->delays->thrashing_start = ktime_get_ns();
178}
179
180void __delayacct_thrashing_end(void)
181{
182 delayacct_end(&current->delays->lock,
183 &current->delays->thrashing_start,
184 &current->delays->thrashing_delay,
185 &current->delays->thrashing_count);
186}
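
The aggregation above follows the existing delayacct pattern for the new thrashing fields: each total is accumulated through a temporary so a u64 wraparound (tmp < d->thrashing_delay_total) clamps the sum to zero instead of letting it wrap, and a matching event counter is kept alongside it. Since __delayacct_thrashing_start() stamps ktime_get_ns(), the totals are nanoseconds, and a consumer can derive a mean delay per thrashing event; a trivial sketch with made-up values:

/* Illustrative sketch only: averaging the thrashing fields a taskstats
 * consumer would receive (the numbers here are invented). */
#include <stdio.h>
#include <inttypes.h>

struct thrashing_sample {
        uint64_t count;                 /* thrashing_count */
        uint64_t delay_total_ns;        /* thrashing_delay_total */
};

int main(void)
{
        struct thrashing_sample s = { .count = 12,
                                      .delay_total_ns = 3600000 };

        if (s.count)
                printf("avg delay per thrashing event: %" PRIu64 " ns\n",
                       s.delay_total_ns / s.count);
        return 0;
}
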
diff --git a/kernel/fork.c b/kernel/fork.c
index f0b58479534f..8f82a3bdcb8f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
223 return s->addr; 223 return s->addr;
224 } 224 }
225 225
226 /*
227 * Allocated stacks are cached and later reused by new threads,
228 * so memcg accounting is performed manually on assigning/releasing
229 * stacks to tasks. Drop __GFP_ACCOUNT.
230 */
226 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, 231 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
227 VMALLOC_START, VMALLOC_END, 232 VMALLOC_START, VMALLOC_END,
228 THREADINFO_GFP, 233 THREADINFO_GFP & ~__GFP_ACCOUNT,
229 PAGE_KERNEL, 234 PAGE_KERNEL,
230 0, node, __builtin_return_address(0)); 235 0, node, __builtin_return_address(0));
231 236
@@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
248static inline void free_thread_stack(struct task_struct *tsk) 253static inline void free_thread_stack(struct task_struct *tsk)
249{ 254{
250#ifdef CONFIG_VMAP_STACK 255#ifdef CONFIG_VMAP_STACK
251 if (task_stack_vm_area(tsk)) { 256 struct vm_struct *vm = task_stack_vm_area(tsk);
257
258 if (vm) {
252 int i; 259 int i;
253 260
261 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
262 mod_memcg_page_state(vm->pages[i],
263 MEMCG_KERNEL_STACK_KB,
264 -(int)(PAGE_SIZE / 1024));
265
266 memcg_kmem_uncharge(vm->pages[i], 0);
267 }
268
254 for (i = 0; i < NR_CACHED_STACKS; i++) { 269 for (i = 0; i < NR_CACHED_STACKS; i++) {
255 if (this_cpu_cmpxchg(cached_stacks[i], 270 if (this_cpu_cmpxchg(cached_stacks[i],
256 NULL, tsk->stack_vm_area) != NULL) 271 NULL, tsk->stack_vm_area) != NULL)
@@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
351 NR_KERNEL_STACK_KB, 366 NR_KERNEL_STACK_KB,
352 PAGE_SIZE / 1024 * account); 367 PAGE_SIZE / 1024 * account);
353 } 368 }
354
355 /* All stack pages belong to the same memcg. */
356 mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
357 account * (THREAD_SIZE / 1024));
358 } else { 369 } else {
359 /* 370 /*
360 * All stack pages are in the same zone and belong to the 371 * All stack pages are in the same zone and belong to the
@@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
370 } 381 }
371} 382}
372 383
384static int memcg_charge_kernel_stack(struct task_struct *tsk)
385{
386#ifdef CONFIG_VMAP_STACK
387 struct vm_struct *vm = task_stack_vm_area(tsk);
388 int ret;
389
390 if (vm) {
391 int i;
392
393 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
394 /*
395 * If memcg_kmem_charge() fails, page->mem_cgroup
396 * pointer is NULL, and both memcg_kmem_uncharge()
397 * and mod_memcg_page_state() in free_thread_stack()
398 * will ignore this page. So it's safe.
399 */
400 ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
401 if (ret)
402 return ret;
403
404 mod_memcg_page_state(vm->pages[i],
405 MEMCG_KERNEL_STACK_KB,
406 PAGE_SIZE / 1024);
407 }
408 }
409#endif
410 return 0;
411}
412
373static void release_task_stack(struct task_struct *tsk) 413static void release_task_stack(struct task_struct *tsk)
374{ 414{
375 if (WARN_ON(tsk->state != TASK_DEAD)) 415 if (WARN_ON(tsk->state != TASK_DEAD))
@@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
807 if (!stack) 847 if (!stack)
808 goto free_tsk; 848 goto free_tsk;
809 849
850 if (memcg_charge_kernel_stack(tsk))
851 goto free_stack;
852
810 stack_vm_area = task_stack_vm_area(tsk); 853 stack_vm_area = task_stack_vm_area(tsk);
811 854
812 err = arch_dup_task_struct(tsk, orig); 855 err = arch_dup_task_struct(tsk, orig);
@@ -1779,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process(
1779 1822
1780 p->default_timer_slack_ns = current->timer_slack_ns; 1823 p->default_timer_slack_ns = current->timer_slack_ns;
1781 1824
1825#ifdef CONFIG_PSI
1826 p->psi_flags = 0;
1827#endif
1828
1782 task_io_accounting_init(&p->ioac); 1829 task_io_accounting_init(&p->ioac);
1783 acct_clear_integrals(p); 1830 acct_clear_integrals(p);
1784 1831
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5b8600d39931..620fc4d2559a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
175 struct vmem_altmap *altmap = pgmap->altmap_valid ? 175 struct vmem_altmap *altmap = pgmap->altmap_valid ?
176 &pgmap->altmap : NULL; 176 &pgmap->altmap : NULL;
177 struct resource *res = &pgmap->res; 177 struct resource *res = &pgmap->res;
178 unsigned long pfn, pgoff, order; 178 struct dev_pagemap *conflict_pgmap;
179 pgprot_t pgprot = PAGE_KERNEL; 179 pgprot_t pgprot = PAGE_KERNEL;
180 unsigned long pgoff, order;
180 int error, nid, is_ram; 181 int error, nid, is_ram;
181 struct dev_pagemap *conflict_pgmap;
182 182
183 align_start = res->start & ~(SECTION_SIZE - 1); 183 align_start = res->start & ~(SECTION_SIZE - 1);
184 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 184 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -256,19 +256,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
256 if (error) 256 if (error)
257 goto err_add_memory; 257 goto err_add_memory;
258 258
259 for_each_device_pfn(pfn, pgmap) { 259 /*
260 struct page *page = pfn_to_page(pfn); 260 * Initialization of the pages has been deferred until now in order
261 261 * to allow us to do the work while not holding the hotplug lock.
262 /* 262 */
263 * ZONE_DEVICE pages union ->lru with a ->pgmap back 263 memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
264 * pointer. It is a bug if a ZONE_DEVICE page is ever 264 align_start >> PAGE_SHIFT,
265 * freed or placed on a driver-private list. Seed the 265 align_size >> PAGE_SHIFT, pgmap);
266 * storage with LIST_POISON* values. 266 percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
267 */
268 list_del(&page->lru);
269 page->pgmap = pgmap;
270 percpu_ref_get(pgmap->ref);
271 }
272 267
273 devm_add_action(dev, devm_memremap_pages_release, pgmap); 268 devm_add_action(dev, devm_memremap_pages_release, pgmap);
274 269
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe183404c38..21fb5a5662b5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
29obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o 29obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
30obj-$(CONFIG_MEMBARRIER) += membarrier.o 30obj-$(CONFIG_MEMBARRIER) += membarrier.o
31obj-$(CONFIG_CPU_ISOLATION) += isolation.o 31obj-$(CONFIG_CPU_ISOLATION) += isolation.o
32obj-$(CONFIG_PSI) += psi.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e696b03e99d..fd2fce8a001b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
722 if (!(flags & ENQUEUE_NOCLOCK)) 722 if (!(flags & ENQUEUE_NOCLOCK))
723 update_rq_clock(rq); 723 update_rq_clock(rq);
724 724
725 if (!(flags & ENQUEUE_RESTORE)) 725 if (!(flags & ENQUEUE_RESTORE)) {
726 sched_info_queued(rq, p); 726 sched_info_queued(rq, p);
727 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
728 }
727 729
728 p->sched_class->enqueue_task(rq, p, flags); 730 p->sched_class->enqueue_task(rq, p, flags);
729} 731}
@@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
733 if (!(flags & DEQUEUE_NOCLOCK)) 735 if (!(flags & DEQUEUE_NOCLOCK))
734 update_rq_clock(rq); 736 update_rq_clock(rq);
735 737
736 if (!(flags & DEQUEUE_SAVE)) 738 if (!(flags & DEQUEUE_SAVE)) {
737 sched_info_dequeued(rq, p); 739 sched_info_dequeued(rq, p);
740 psi_dequeue(p, flags & DEQUEUE_SLEEP);
741 }
738 742
739 p->sched_class->dequeue_task(rq, p, flags); 743 p->sched_class->dequeue_task(rq, p, flags);
740} 744}
@@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2037 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 2041 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2038 if (task_cpu(p) != cpu) { 2042 if (task_cpu(p) != cpu) {
2039 wake_flags |= WF_MIGRATED; 2043 wake_flags |= WF_MIGRATED;
2044 psi_ttwu_dequeue(p);
2040 set_task_cpu(p, cpu); 2045 set_task_cpu(p, cpu);
2041 } 2046 }
2042 2047
@@ -3051,6 +3056,7 @@ void scheduler_tick(void)
3051 curr->sched_class->task_tick(rq, curr, 0); 3056 curr->sched_class->task_tick(rq, curr, 0);
3052 cpu_load_update_active(rq); 3057 cpu_load_update_active(rq);
3053 calc_global_load_tick(rq); 3058 calc_global_load_tick(rq);
3059 psi_task_tick(rq);
3054 3060
3055 rq_unlock(rq, &rf); 3061 rq_unlock(rq, &rf);
3056 3062
@@ -4933,9 +4939,7 @@ static void do_sched_yield(void)
4933 struct rq_flags rf; 4939 struct rq_flags rf;
4934 struct rq *rq; 4940 struct rq *rq;
4935 4941
4936 local_irq_disable(); 4942 rq = this_rq_lock_irq(&rf);
4937 rq = this_rq();
4938 rq_lock(rq, &rf);
4939 4943
4940 schedstat_inc(rq->yld_count); 4944 schedstat_inc(rq->yld_count);
4941 current->sched_class->yield_task(rq); 4945 current->sched_class->yield_task(rq);
@@ -6069,6 +6073,8 @@ void __init sched_init(void)
6069 6073
6070 init_schedstats(); 6074 init_schedstats();
6071 6075
6076 psi_init();
6077
6072 scheduler_running = 1; 6078 scheduler_running = 1;
6073} 6079}
6074 6080
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a171c1258109..28a516575c18 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
91 return delta; 91 return delta;
92} 92}
93 93
94/* 94/**
95 * a1 = a0 * e + a * (1 - e) 95 * fixed_power_int - compute: x^n, in O(log n) time
96 *
97 * @x: base of the power
98 * @frac_bits: fractional bits of @x
99 * @n: power to raise @x to.
100 *
101 * By exploiting the relation between the definition of the natural power
102 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
103 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
104 * (where: n_i \elem {0, 1}, the binary vector representing n),
105 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
106 * of course trivially computable in O(log_2 n), the length of our binary
107 * vector.
96 */ 108 */
97static unsigned long 109static unsigned long
98calc_load(unsigned long load, unsigned long exp, unsigned long active) 110fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
99{ 111{
100 unsigned long newload; 112 unsigned long result = 1UL << frac_bits;
113
114 if (n) {
115 for (;;) {
116 if (n & 1) {
117 result *= x;
118 result += 1UL << (frac_bits - 1);
119 result >>= frac_bits;
120 }
121 n >>= 1;
122 if (!n)
123 break;
124 x *= x;
125 x += 1UL << (frac_bits - 1);
126 x >>= frac_bits;
127 }
128 }
101 129
102 newload = load * exp + active * (FIXED_1 - exp); 130 return result;
103 if (active >= load) 131}
104 newload += FIXED_1-1;
105 132
106 return newload / FIXED_1; 133/*
134 * a1 = a0 * e + a * (1 - e)
135 *
136 * a2 = a1 * e + a * (1 - e)
137 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
138 * = a0 * e^2 + a * (1 - e) * (1 + e)
139 *
140 * a3 = a2 * e + a * (1 - e)
141 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
142 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
143 *
144 * ...
145 *
146 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
147 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
148 * = a0 * e^n + a * (1 - e^n)
149 *
150 * [1] application of the geometric series:
151 *
152 * n 1 - x^(n+1)
153 * S_n := \Sum x^i = -------------
154 * i=0 1 - x
155 */
156unsigned long
157calc_load_n(unsigned long load, unsigned long exp,
158 unsigned long active, unsigned int n)
159{
160 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
107} 161}
108 162
109#ifdef CONFIG_NO_HZ_COMMON 163#ifdef CONFIG_NO_HZ_COMMON
@@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void)
225 return delta; 279 return delta;
226} 280}
227 281
228/**
229 * fixed_power_int - compute: x^n, in O(log n) time
230 *
231 * @x: base of the power
232 * @frac_bits: fractional bits of @x
233 * @n: power to raise @x to.
234 *
235 * By exploiting the relation between the definition of the natural power
236 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
237 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
238 * (where: n_i \elem {0, 1}, the binary vector representing n),
239 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
240 * of course trivially computable in O(log_2 n), the length of our binary
241 * vector.
242 */
243static unsigned long
244fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
245{
246 unsigned long result = 1UL << frac_bits;
247
248 if (n) {
249 for (;;) {
250 if (n & 1) {
251 result *= x;
252 result += 1UL << (frac_bits - 1);
253 result >>= frac_bits;
254 }
255 n >>= 1;
256 if (!n)
257 break;
258 x *= x;
259 x += 1UL << (frac_bits - 1);
260 x >>= frac_bits;
261 }
262 }
263
264 return result;
265}
266
267/*
268 * a1 = a0 * e + a * (1 - e)
269 *
270 * a2 = a1 * e + a * (1 - e)
271 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
272 * = a0 * e^2 + a * (1 - e) * (1 + e)
273 *
274 * a3 = a2 * e + a * (1 - e)
275 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
276 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
277 *
278 * ...
279 *
280 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
281 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
282 * = a0 * e^n + a * (1 - e^n)
283 *
284 * [1] application of the geometric series:
285 *
286 * n 1 - x^(n+1)
287 * S_n := \Sum x^i = -------------
288 * i=0 1 - x
289 */
290static unsigned long
291calc_load_n(unsigned long load, unsigned long exp,
292 unsigned long active, unsigned int n)
293{
294 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
295}
296
297/* 282/*
298 * NO_HZ can leave us missing all per-CPU ticks calling 283 * NO_HZ can leave us missing all per-CPU ticks calling
299 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into 284 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
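
calc_load_n() loses its static and moves out of the NO_HZ-only block so that PSI's calc_avgs() can fold an arbitrary number of missed sampling periods into a single update, with fixed_power_int() raising the decay factor to the n-th power in O(log n). A standalone sketch (the two helpers copied into userspace; FSHIFT, FIXED_1 and EXP_10s as in this series) showing that one folded step matches n successive calc_load() iterations:

/* Illustrative sketch only: a0*e^n + a*(1 - e^n) folding versus
 * iterating a_{k+1} = a_k*e + a*(1 - e) one period at a time. */
#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1 << FSHIFT)
#define EXP_10s 1677                    /* 1/exp(2s/10s), fixed-point */

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
                newload += FIXED_1 - 1;
        return newload / FIXED_1;
}

int main(void)
{
        unsigned long iter = FIXED_1, a = 0;    /* start at 100%, decay */
        unsigned int n = 5, i;

        for (i = 0; i < n; i++)
                iter = calc_load(iter, EXP_10s, a);

        printf("iterated: %lu  folded: %lu\n", iter,
               calc_load(FIXED_1, fixed_power_int(EXP_10s, FSHIFT, n), a));
        return 0;       /* both computations print 753 here */
}
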
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
new file mode 100644
index 000000000000..7cdecfc010af
--- /dev/null
+++ b/kernel/sched/psi.c
@@ -0,0 +1,759 @@
1/*
2 * Pressure stall information for CPU, memory and IO
3 *
4 * Copyright (c) 2018 Facebook, Inc.
5 * Author: Johannes Weiner <hannes@cmpxchg.org>
6 *
7 * When CPU, memory and IO are contended, tasks experience delays that
8 * reduce throughput and introduce latencies into the workload. Memory
9 * and IO contention, in addition, can cause a full loss of forward
10 * progress in which the CPU goes idle.
11 *
12 * This code aggregates individual task delays into resource pressure
13 * metrics that indicate problems with both workload health and
14 * resource utilization.
15 *
16 * Model
17 *
18 * The time in which a task can execute on a CPU is our baseline for
19 * productivity. Pressure expresses the amount of time in which this
20 * potential cannot be realized due to resource contention.
21 *
22 * This concept of productivity has two components: the workload and
23 * the CPU. To measure the impact of pressure on both, we define two
24 * contention states for a resource: SOME and FULL.
25 *
26 * In the SOME state of a given resource, one or more tasks are
27 * delayed on that resource. This affects the workload's ability to
28 * perform work, but the CPU may still be executing other tasks.
29 *
30 * In the FULL state of a given resource, all non-idle tasks are
31 * delayed on that resource such that nobody is advancing and the CPU
32 * goes idle. This leaves both workload and CPU unproductive.
33 *
34 * (Naturally, the FULL state doesn't exist for the CPU resource.)
35 *
36 * SOME = nr_delayed_tasks != 0
37 * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
38 *
39 * The percentage of wallclock time spent in those compound stall
40 * states gives pressure numbers between 0 and 100 for each resource,
41 * where the SOME percentage indicates workload slowdowns and the FULL
42 * percentage indicates reduced CPU utilization:
43 *
44 * %SOME = time(SOME) / period
45 * %FULL = time(FULL) / period
46 *
47 * Multiple CPUs
48 *
49 * The more tasks and available CPUs there are, the more work can be
50 * performed concurrently. This means that the potential that can go
51 * unrealized due to resource contention *also* scales with non-idle
52 * tasks and CPUs.
53 *
54 * Consider a scenario where 257 number crunching tasks are trying to
55 * run concurrently on 256 CPUs. If we simply aggregated the task
56 * states, we would have to conclude a CPU SOME pressure number of
57 * 100%, since *somebody* is waiting on a runqueue at all
58 * times. However, that is clearly not the amount of contention the
59 * workload is experiencing: only one out of 256 possible execution
60 * threads will be contended at any given time, or about 0.4%.
61 *
62 * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
63 * given time *one* of the tasks is delayed due to a lack of memory.
64 * Again, looking purely at the task state would yield a memory FULL
65 * pressure number of 0%, since *somebody* is always making forward
66 * progress. But again this wouldn't capture the amount of execution
67 * potential lost, which is 1 out of 4 CPUs, or 25%.
68 *
69 * To calculate wasted potential (pressure) with multiple processors,
70 * we have to base our calculation on the number of non-idle tasks in
71 * conjunction with the number of available CPUs, which is the number
72 * of potential execution threads. SOME becomes then the proportion of
73 * delayed tasks to possible threads, and FULL is the share of possible
74 * threads that are unproductive due to delays:
75 *
76 * threads = min(nr_nonidle_tasks, nr_cpus)
77 * SOME = min(nr_delayed_tasks / threads, 1)
78 * FULL = (threads - min(nr_running_tasks, threads)) / threads
79 *
80 * For the 257 number crunchers on 256 CPUs, this yields:
81 *
82 * threads = min(257, 256)
83 * SOME = min(1 / 256, 1) = 0.4%
84 * FULL = (256 - min(257, 256)) / 256 = 0%
85 *
86 * For the 1 out of 4 memory-delayed tasks, this yields:
87 *
88 * threads = min(4, 4)
89 * SOME = min(1 / 4, 1) = 25%
90 * FULL = (4 - min(3, 4)) / 4 = 25%
91 *
92 * [ Substitute nr_cpus with 1, and you can see that it's a natural
93 * extension of the single-CPU model. ]
94 *
95 * Implementation
96 *
97 * To assess the precise time spent in each such state, we would have
98 * to freeze the system on task changes and start/stop the state
99 * clocks accordingly. Obviously that doesn't scale in practice.
100 *
101 * Because the scheduler aims to distribute the compute load evenly
102 * among the available CPUs, we can track task state locally to each
103 * CPU and, at much lower frequency, extrapolate the global state for
104 * the cumulative stall times and the running averages.
105 *
106 * For each runqueue, we track:
107 *
108 * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
109 * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
110 * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
111 *
112 * and then periodically aggregate:
113 *
114 * tNONIDLE = sum(tNONIDLE[i])
115 *
116 * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
117 * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
118 *
119 * %SOME = tSOME / period
120 * %FULL = tFULL / period
121 *
122 * This gives us an approximation of pressure that is practical
123 * cost-wise, yet way more sensitive and accurate than periodic
124 * sampling of the aggregate task states would be.
125 */
126
127#include <linux/sched/loadavg.h>
128#include <linux/seq_file.h>
129#include <linux/proc_fs.h>
130#include <linux/seqlock.h>
131#include <linux/cgroup.h>
132#include <linux/module.h>
133#include <linux/sched.h>
134#include <linux/psi.h>
135#include "sched.h"
136
137static int psi_bug __read_mostly;
138
139bool psi_disabled __read_mostly;
140core_param(psi_disabled, psi_disabled, bool, 0644);
141
142/* Running averages - we need to be higher-res than loadavg */
143#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
144#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
145#define EXP_60s 1981 /* 1/exp(2s/60s) */
146#define EXP_300s 2034 /* 1/exp(2s/300s) */
147
148/* Sampling frequency in nanoseconds */
149static u64 psi_period __read_mostly;
150
151/* System-level pressure and stall tracking */
152static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
153static struct psi_group psi_system = {
154 .pcpu = &system_group_pcpu,
155};
156
157static void psi_update_work(struct work_struct *work);
158
159static void group_init(struct psi_group *group)
160{
161 int cpu;
162
163 for_each_possible_cpu(cpu)
164 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
165 group->next_update = sched_clock() + psi_period;
166 INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
167 mutex_init(&group->stat_lock);
168}
169
170void __init psi_init(void)
171{
172 if (psi_disabled)
173 return;
174
175 psi_period = jiffies_to_nsecs(PSI_FREQ);
176 group_init(&psi_system);
177}
178
179static bool test_state(unsigned int *tasks, enum psi_states state)
180{
181 switch (state) {
182 case PSI_IO_SOME:
183 return tasks[NR_IOWAIT];
184 case PSI_IO_FULL:
185 return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
186 case PSI_MEM_SOME:
187 return tasks[NR_MEMSTALL];
188 case PSI_MEM_FULL:
189 return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
190 case PSI_CPU_SOME:
191 return tasks[NR_RUNNING] > 1;
192 case PSI_NONIDLE:
193 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
194 tasks[NR_RUNNING];
195 default:
196 return false;
197 }
198}
199
200static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
201{
202 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
203 unsigned int tasks[NR_PSI_TASK_COUNTS];
204 u64 now, state_start;
205 unsigned int seq;
206 int s;
207
208 /* Snapshot a coherent view of the CPU state */
209 do {
210 seq = read_seqcount_begin(&groupc->seq);
211 now = cpu_clock(cpu);
212 memcpy(times, groupc->times, sizeof(groupc->times));
213 memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
214 state_start = groupc->state_start;
215 } while (read_seqcount_retry(&groupc->seq, seq));
216
217 /* Calculate state time deltas against the previous snapshot */
218 for (s = 0; s < NR_PSI_STATES; s++) {
219 u32 delta;
220 /*
221 * In addition to already concluded states, we also
222 * incorporate currently active states on the CPU,
223 * since states may last for many sampling periods.
224 *
225 * This way we keep our delta sampling buckets small
226 * (u32) and our reported pressure close to what's
227 * actually happening.
228 */
229 if (test_state(tasks, s))
230 times[s] += now - state_start;
231
232 delta = times[s] - groupc->times_prev[s];
233 groupc->times_prev[s] = times[s];
234
235 times[s] = delta;
236 }
237}
238
239static void calc_avgs(unsigned long avg[3], int missed_periods,
240 u64 time, u64 period)
241{
242 unsigned long pct;
243
244 /* Fill in zeroes for periods of no activity */
245 if (missed_periods) {
246 avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
247 avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
248 avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
249 }
250
251 /* Sample the most recent active period */
252 pct = div_u64(time * 100, period);
253 pct *= FIXED_1;
254 avg[0] = calc_load(avg[0], EXP_10s, pct);
255 avg[1] = calc_load(avg[1], EXP_60s, pct);
256 avg[2] = calc_load(avg[2], EXP_300s, pct);
257}
258
259static bool update_stats(struct psi_group *group)
260{
261 u64 deltas[NR_PSI_STATES - 1] = { 0, };
262 unsigned long missed_periods = 0;
263 unsigned long nonidle_total = 0;
264 u64 now, expires, period;
265 int cpu;
266 int s;
267
268 mutex_lock(&group->stat_lock);
269
270 /*
271 * Collect the per-cpu time buckets and average them into a
272 * single time sample that is normalized to wallclock time.
273 *
274 * For averaging, each CPU is weighted by its non-idle time in
275 * the sampling period. This eliminates artifacts from uneven
276 * loading, or even entirely idle CPUs.
277 */
278 for_each_possible_cpu(cpu) {
279 u32 times[NR_PSI_STATES];
280 u32 nonidle;
281
282 get_recent_times(group, cpu, times);
283
284 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
285 nonidle_total += nonidle;
286
287 for (s = 0; s < PSI_NONIDLE; s++)
288 deltas[s] += (u64)times[s] * nonidle;
289 }
290
291 /*
292 * Integrate the sample into the running statistics that are
293 * reported to userspace: the cumulative stall times and the
294 * decaying averages.
295 *
296 * Pressure percentages are sampled at PSI_FREQ. We might be
297 * called more often when the user polls more frequently than
298 * that; we might be called less often when there is no task
299 * activity, thus no data, and clock ticks are sporadic. The
300 * below handles both.
301 */
302
303 /* total= */
304 for (s = 0; s < NR_PSI_STATES - 1; s++)
305 group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
306
307 /* avgX= */
308 now = sched_clock();
309 expires = group->next_update;
310 if (now < expires)
311 goto out;
312 if (now - expires > psi_period)
313 missed_periods = div_u64(now - expires, psi_period);
314
315 /*
316 * The periodic clock tick can get delayed for various
317 * reasons, especially on loaded systems. To avoid clock
318 * drift, we schedule the clock in fixed psi_period intervals.
319 * But the deltas we sample out of the per-cpu buckets above
320 * are based on the actual time elapsing between clock ticks.
321 */
322 group->next_update = expires + ((1 + missed_periods) * psi_period);
323 period = now - (group->last_update + (missed_periods * psi_period));
324 group->last_update = now;
325
326 for (s = 0; s < NR_PSI_STATES - 1; s++) {
327 u32 sample;
328
329 sample = group->total[s] - group->total_prev[s];
330 /*
331 * Due to the lockless sampling of the time buckets,
332 * recorded time deltas can slip into the next period,
333 * which under full pressure can result in samples in
334 * excess of the period length.
335 *
336 * We don't want to report non-sensical pressures in
337 * excess of 100%, nor do we want to drop such events
338 * on the floor. Instead we punt any overage into the
339 * future until pressure subsides. By doing this we
340 * don't underreport the occurring pressure curve, we
341 * just report it delayed by one period length.
342 *
343 * The error isn't cumulative. As soon as another
344 * delta slips from a period P to P+1, by definition
345 * it frees up its time T in P.
346 */
347 if (sample > period)
348 sample = period;
349 group->total_prev[s] += sample;
350 calc_avgs(group->avg[s], missed_periods, sample, period);
351 }
352out:
353 mutex_unlock(&group->stat_lock);
354 return nonidle_total;
355}
356
357static void psi_update_work(struct work_struct *work)
358{
359 struct delayed_work *dwork;
360 struct psi_group *group;
361 bool nonidle;
362
363 dwork = to_delayed_work(work);
364 group = container_of(dwork, struct psi_group, clock_work);
365
366 /*
367 * If there is task activity, periodically fold the per-cpu
368 * times and feed samples into the running averages. If things
369 * are idle and there is no data to process, stop the clock.
370 * Once restarted, we'll catch up the running averages in one
371 * go - see calc_avgs() and missed_periods.
372 */
373
374 nonidle = update_stats(group);
375
376 if (nonidle) {
377 unsigned long delay = 0;
378 u64 now;
379
380 now = sched_clock();
381 if (group->next_update > now)
382 delay = nsecs_to_jiffies(group->next_update - now) + 1;
383 schedule_delayed_work(dwork, delay);
384 }
385}
386
387static void record_times(struct psi_group_cpu *groupc, int cpu,
388 bool memstall_tick)
389{
390 u32 delta;
391 u64 now;
392
393 now = cpu_clock(cpu);
394 delta = now - groupc->state_start;
395 groupc->state_start = now;
396
397 if (test_state(groupc->tasks, PSI_IO_SOME)) {
398 groupc->times[PSI_IO_SOME] += delta;
399 if (test_state(groupc->tasks, PSI_IO_FULL))
400 groupc->times[PSI_IO_FULL] += delta;
401 }
402
403 if (test_state(groupc->tasks, PSI_MEM_SOME)) {
404 groupc->times[PSI_MEM_SOME] += delta;
405 if (test_state(groupc->tasks, PSI_MEM_FULL))
406 groupc->times[PSI_MEM_FULL] += delta;
407 else if (memstall_tick) {
408 u32 sample;
409 /*
410 * Since we care about lost potential, a
411 * memstall is FULL when there are no other
412 * working tasks, but also when the CPU is
413 * actively reclaiming and nothing productive
414 * could run even if it were runnable.
415 *
416 * When the timer tick sees a reclaiming CPU,
417 * regardless of runnable tasks, sample a FULL
418 * tick (or less if it hasn't been a full tick
419 * since the last state change).
420 */
421 sample = min(delta, (u32)jiffies_to_nsecs(1));
422 groupc->times[PSI_MEM_FULL] += sample;
423 }
424 }
425
426 if (test_state(groupc->tasks, PSI_CPU_SOME))
427 groupc->times[PSI_CPU_SOME] += delta;
428
429 if (test_state(groupc->tasks, PSI_NONIDLE))
430 groupc->times[PSI_NONIDLE] += delta;
431}
432
433static void psi_group_change(struct psi_group *group, int cpu,
434 unsigned int clear, unsigned int set)
435{
436 struct psi_group_cpu *groupc;
437 unsigned int t, m;
438
439 groupc = per_cpu_ptr(group->pcpu, cpu);
440
441 /*
442 * First we assess the aggregate resource states this CPU's
443 * tasks have been in since the last change, and account any
444 * SOME and FULL time these may have resulted in.
445 *
446 * Then we update the task counts according to the state
447 * change requested through the @clear and @set bits.
448 */
449 write_seqcount_begin(&groupc->seq);
450
451 record_times(groupc, cpu, false);
452
453 for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
454 if (!(m & (1 << t)))
455 continue;
456 if (groupc->tasks[t] == 0 && !psi_bug) {
457 printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
458 cpu, t, groupc->tasks[0],
459 groupc->tasks[1], groupc->tasks[2],
460 clear, set);
461 psi_bug = 1;
462 }
463 groupc->tasks[t]--;
464 }
465
466 for (t = 0; set; set &= ~(1 << t), t++)
467 if (set & (1 << t))
468 groupc->tasks[t]++;
469
470 write_seqcount_end(&groupc->seq);
471
472 if (!delayed_work_pending(&group->clock_work))
473 schedule_delayed_work(&group->clock_work, PSI_FREQ);
474}
475
476static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
477{
478#ifdef CONFIG_CGROUPS
479 struct cgroup *cgroup = NULL;
480
481 if (!*iter)
482 cgroup = task->cgroups->dfl_cgrp;
483 else if (*iter == &psi_system)
484 return NULL;
485 else
486 cgroup = cgroup_parent(*iter);
487
488 if (cgroup && cgroup_parent(cgroup)) {
489 *iter = cgroup;
490 return cgroup_psi(cgroup);
491 }
492#else
493 if (*iter)
494 return NULL;
495#endif
496 *iter = &psi_system;
497 return &psi_system;
498}
499
500void psi_task_change(struct task_struct *task, int clear, int set)
501{
502 int cpu = task_cpu(task);
503 struct psi_group *group;
504 void *iter = NULL;
505
506 if (!task->pid)
507 return;
508
509 if (((task->psi_flags & set) ||
510 (task->psi_flags & clear) != clear) &&
511 !psi_bug) {
512 printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
513 task->pid, task->comm, cpu,
514 task->psi_flags, clear, set);
515 psi_bug = 1;
516 }
517
518 task->psi_flags &= ~clear;
519 task->psi_flags |= set;
520
521 while ((group = iterate_groups(task, &iter)))
522 psi_group_change(group, cpu, clear, set);
523}
524
525void psi_memstall_tick(struct task_struct *task, int cpu)
526{
527 struct psi_group *group;
528 void *iter = NULL;
529
530 while ((group = iterate_groups(task, &iter))) {
531 struct psi_group_cpu *groupc;
532
533 groupc = per_cpu_ptr(group->pcpu, cpu);
534 write_seqcount_begin(&groupc->seq);
535 record_times(groupc, cpu, true);
536 write_seqcount_end(&groupc->seq);
537 }
538}
539
540/**
541 * psi_memstall_enter - mark the beginning of a memory stall section
542 * @flags: flags to handle nested sections
543 *
544 * Marks the calling task as being stalled due to a lack of memory,
545 * such as waiting for a refault or performing reclaim.
546 */
547void psi_memstall_enter(unsigned long *flags)
548{
549 struct rq_flags rf;
550 struct rq *rq;
551
552 if (psi_disabled)
553 return;
554
555 *flags = current->flags & PF_MEMSTALL;
556 if (*flags)
557 return;
558 /*
559 * PF_MEMSTALL setting & accounting needs to be atomic wrt
560 * changes to the task's scheduling state, otherwise we can
561 * race with CPU migration.
562 */
563 rq = this_rq_lock_irq(&rf);
564
565 current->flags |= PF_MEMSTALL;
566 psi_task_change(current, 0, TSK_MEMSTALL);
567
568 rq_unlock_irq(rq, &rf);
569}
570
571/**
572 * psi_memstall_leave - mark the end of a memory stall section
573 * @flags: flags to handle nested memdelay sections
574 *
575 * Marks the calling task as no longer stalled due to lack of memory.
576 */
577void psi_memstall_leave(unsigned long *flags)
578{
579 struct rq_flags rf;
580 struct rq *rq;
581
582 if (psi_disabled)
583 return;
584
585 if (*flags)
586 return;
587 /*
588 * PF_MEMSTALL clearing & accounting needs to be atomic wrt
589 * changes to the task's scheduling state, otherwise we could
590 * race with CPU migration.
591 */
592 rq = this_rq_lock_irq(&rf);
593
594 current->flags &= ~PF_MEMSTALL;
595 psi_task_change(current, TSK_MEMSTALL, 0);
596
597 rq_unlock_irq(rq, &rf);
598}
599
600#ifdef CONFIG_CGROUPS
601int psi_cgroup_alloc(struct cgroup *cgroup)
602{
603 if (psi_disabled)
604 return 0;
605
606 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
607 if (!cgroup->psi.pcpu)
608 return -ENOMEM;
609 group_init(&cgroup->psi);
610 return 0;
611}
612
613void psi_cgroup_free(struct cgroup *cgroup)
614{
615 if (psi_disabled)
616 return;
617
618 cancel_delayed_work_sync(&cgroup->psi.clock_work);
619 free_percpu(cgroup->psi.pcpu);
620}
621
622/**
623 * cgroup_move_task - move task to a different cgroup
624 * @task: the task
625 * @to: the target css_set
626 *
627 * Move task to a new cgroup and safely migrate its associated stall
628 * state between the different groups.
629 *
630 * This function acquires the task's rq lock to lock out concurrent
631 * changes to the task's scheduling state and - in case the task is
632 * running - concurrent changes to its stall state.
633 */
634void cgroup_move_task(struct task_struct *task, struct css_set *to)
635{
636 bool move_psi = !psi_disabled;
637 unsigned int task_flags = 0;
638 struct rq_flags rf;
639 struct rq *rq;
640
641 if (move_psi) {
642 rq = task_rq_lock(task, &rf);
643
644 if (task_on_rq_queued(task))
645 task_flags = TSK_RUNNING;
646 else if (task->in_iowait)
647 task_flags = TSK_IOWAIT;
648
649 if (task->flags & PF_MEMSTALL)
650 task_flags |= TSK_MEMSTALL;
651
652 if (task_flags)
653 psi_task_change(task, task_flags, 0);
654 }
655
656 /*
657 * Lame to do this here, but the scheduler cannot be locked
658 * from the outside, so we move cgroups from inside sched/.
659 */
660 rcu_assign_pointer(task->cgroups, to);
661
662 if (move_psi) {
663 if (task_flags)
664 psi_task_change(task, 0, task_flags);
665
666 task_rq_unlock(rq, task, &rf);
667 }
668}
669#endif /* CONFIG_CGROUPS */
670
671int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
672{
673 int full;
674
675 if (psi_disabled)
676 return -EOPNOTSUPP;
677
678 update_stats(group);
679
680 for (full = 0; full < 2 - (res == PSI_CPU); full++) {
681 unsigned long avg[3];
682 u64 total;
683 int w;
684
685 for (w = 0; w < 3; w++)
686 avg[w] = group->avg[res * 2 + full][w];
687 total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
688
689 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
690 full ? "full" : "some",
691 LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
692 LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
693 LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
694 total);
695 }
696
697 return 0;
698}
699
700static int psi_io_show(struct seq_file *m, void *v)
701{
702 return psi_show(m, &psi_system, PSI_IO);
703}
704
705static int psi_memory_show(struct seq_file *m, void *v)
706{
707 return psi_show(m, &psi_system, PSI_MEM);
708}
709
710static int psi_cpu_show(struct seq_file *m, void *v)
711{
712 return psi_show(m, &psi_system, PSI_CPU);
713}
714
715static int psi_io_open(struct inode *inode, struct file *file)
716{
717 return single_open(file, psi_io_show, NULL);
718}
719
720static int psi_memory_open(struct inode *inode, struct file *file)
721{
722 return single_open(file, psi_memory_show, NULL);
723}
724
725static int psi_cpu_open(struct inode *inode, struct file *file)
726{
727 return single_open(file, psi_cpu_show, NULL);
728}
729
730static const struct file_operations psi_io_fops = {
731 .open = psi_io_open,
732 .read = seq_read,
733 .llseek = seq_lseek,
734 .release = single_release,
735};
736
737static const struct file_operations psi_memory_fops = {
738 .open = psi_memory_open,
739 .read = seq_read,
740 .llseek = seq_lseek,
741 .release = single_release,
742};
743
744static const struct file_operations psi_cpu_fops = {
745 .open = psi_cpu_open,
746 .read = seq_read,
747 .llseek = seq_lseek,
748 .release = single_release,
749};
750
751static int __init psi_proc_init(void)
752{
753 proc_mkdir("pressure", NULL);
754 proc_create("pressure/io", 0, NULL, &psi_io_fops);
755 proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
756 proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
757 return 0;
758}
759module_init(psi_proc_init);
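
The long comment at the top of psi.c derives the SOME and FULL percentages from the number of non-idle, delayed and running tasks relative to the available CPUs. A standalone sketch (not kernel code; floating point is used only for readability) that reproduces the two worked examples from that comment:

/* Illustrative sketch only: the multi-CPU pressure model from the
 * psi.c header comment,
 *
 *   threads = min(nr_nonidle_tasks, nr_cpus)
 *   SOME    = min(nr_delayed_tasks / threads, 1)
 *   FULL    = (threads - min(nr_running_tasks, threads)) / threads
 */
#include <stdio.h>

static double min_d(double a, double b)
{
        return a < b ? a : b;
}

static void pressure(double nonidle, double delayed, double running,
                     double cpus)
{
        double threads = min_d(nonidle, cpus);
        double some = min_d(delayed / threads, 1.0);
        double full = (threads - min_d(running, threads)) / threads;

        printf("SOME=%.1f%%  FULL=%.1f%%\n", some * 100.0, full * 100.0);
}

int main(void)
{
        /* 257 runnable number crunchers on 256 CPUs: one always waits */
        pressure(257, 1, 257, 256);     /* SOME=0.4%  FULL=0.0% */

        /* 4 tasks on 4 CPUs, one stalled on memory at any given time */
        pressure(4, 1, 3, 4);           /* SOME=25.0%  FULL=25.0% */
        return 0;
}

The implementation avoids computing this directly: as described under "Implementation" in the same comment, each CPU only records how long its local task counts satisfy test_state(), and update_stats() later weights those per-CPU times by non-idle time to approximate the same quantities.
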
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b8c007713b3b..618577fc9aa8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -54,6 +54,7 @@
54#include <linux/proc_fs.h> 54#include <linux/proc_fs.h>
55#include <linux/prefetch.h> 55#include <linux/prefetch.h>
56#include <linux/profile.h> 56#include <linux/profile.h>
57#include <linux/psi.h>
57#include <linux/rcupdate_wait.h> 58#include <linux/rcupdate_wait.h>
58#include <linux/security.h> 59#include <linux/security.h>
59#include <linux/stop_machine.h> 60#include <linux/stop_machine.h>
@@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu);
319#ifdef CONFIG_CGROUP_SCHED 320#ifdef CONFIG_CGROUP_SCHED
320 321
321#include <linux/cgroup.h> 322#include <linux/cgroup.h>
323#include <linux/psi.h>
322 324
323struct cfs_rq; 325struct cfs_rq;
324struct rt_rq; 326struct rt_rq;
@@ -957,6 +959,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
957#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 959#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
958#define raw_rq() raw_cpu_ptr(&runqueues) 960#define raw_rq() raw_cpu_ptr(&runqueues)
959 961
962extern void update_rq_clock(struct rq *rq);
963
960static inline u64 __rq_clock_broken(struct rq *rq) 964static inline u64 __rq_clock_broken(struct rq *rq)
961{ 965{
962 return READ_ONCE(rq->clock); 966 return READ_ONCE(rq->clock);
@@ -1075,6 +1079,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
1075#endif 1079#endif
1076} 1080}
1077 1081
1082struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1083 __acquires(rq->lock);
1084
1085struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1086 __acquires(p->pi_lock)
1087 __acquires(rq->lock);
1088
1089static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
1090 __releases(rq->lock)
1091{
1092 rq_unpin_lock(rq, rf);
1093 raw_spin_unlock(&rq->lock);
1094}
1095
1096static inline void
1097task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1098 __releases(rq->lock)
1099 __releases(p->pi_lock)
1100{
1101 rq_unpin_lock(rq, rf);
1102 raw_spin_unlock(&rq->lock);
1103 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1104}
1105
1106static inline void
1107rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1108 __acquires(rq->lock)
1109{
1110 raw_spin_lock_irqsave(&rq->lock, rf->flags);
1111 rq_pin_lock(rq, rf);
1112}
1113
1114static inline void
1115rq_lock_irq(struct rq *rq, struct rq_flags *rf)
1116 __acquires(rq->lock)
1117{
1118 raw_spin_lock_irq(&rq->lock);
1119 rq_pin_lock(rq, rf);
1120}
1121
1122static inline void
1123rq_lock(struct rq *rq, struct rq_flags *rf)
1124 __acquires(rq->lock)
1125{
1126 raw_spin_lock(&rq->lock);
1127 rq_pin_lock(rq, rf);
1128}
1129
1130static inline void
1131rq_relock(struct rq *rq, struct rq_flags *rf)
1132 __acquires(rq->lock)
1133{
1134 raw_spin_lock(&rq->lock);
1135 rq_repin_lock(rq, rf);
1136}
1137
1138static inline void
1139rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1140 __releases(rq->lock)
1141{
1142 rq_unpin_lock(rq, rf);
1143 raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1144}
1145
1146static inline void
1147rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
1148 __releases(rq->lock)
1149{
1150 rq_unpin_lock(rq, rf);
1151 raw_spin_unlock_irq(&rq->lock);
1152}
1153
1154static inline void
1155rq_unlock(struct rq *rq, struct rq_flags *rf)
1156 __releases(rq->lock)
1157{
1158 rq_unpin_lock(rq, rf);
1159 raw_spin_unlock(&rq->lock);
1160}
1161
1162static inline struct rq *
1163this_rq_lock_irq(struct rq_flags *rf)
1164 __acquires(rq->lock)
1165{
1166 struct rq *rq;
1167
1168 local_irq_disable();
1169 rq = this_rq();
1170 rq_lock(rq, rf);
1171 return rq;
1172}
1173
1078#ifdef CONFIG_NUMA 1174#ifdef CONFIG_NUMA
1079enum numa_topology_type { 1175enum numa_topology_type {
1080 NUMA_DIRECT, 1176 NUMA_DIRECT,
@@ -1717,8 +1813,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
1717 sched_update_tick_dependency(rq); 1813 sched_update_tick_dependency(rq);
1718} 1814}
1719 1815
1720extern void update_rq_clock(struct rq *rq);
1721
1722extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1816extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
1723extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); 1817extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
1724 1818
@@ -1783,86 +1877,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
1783#endif 1877#endif
1784#endif 1878#endif
1785 1879
1786struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1787 __acquires(rq->lock);
1788
1789struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
1790 __acquires(p->pi_lock)
1791 __acquires(rq->lock);
1792
1793static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
1794 __releases(rq->lock)
1795{
1796 rq_unpin_lock(rq, rf);
1797 raw_spin_unlock(&rq->lock);
1798}
1799
1800static inline void
1801task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1802 __releases(rq->lock)
1803 __releases(p->pi_lock)
1804{
1805 rq_unpin_lock(rq, rf);
1806 raw_spin_unlock(&rq->lock);
1807 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
1808}
1809
1810static inline void
1811rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
1812 __acquires(rq->lock)
1813{
1814 raw_spin_lock_irqsave(&rq->lock, rf->flags);
1815 rq_pin_lock(rq, rf);
1816}
1817
1818static inline void
1819rq_lock_irq(struct rq *rq, struct rq_flags *rf)
1820 __acquires(rq->lock)
1821{
1822 raw_spin_lock_irq(&rq->lock);
1823 rq_pin_lock(rq, rf);
1824}
1825
1826static inline void
1827rq_lock(struct rq *rq, struct rq_flags *rf)
1828 __acquires(rq->lock)
1829{
1830 raw_spin_lock(&rq->lock);
1831 rq_pin_lock(rq, rf);
1832}
1833
1834static inline void
1835rq_relock(struct rq *rq, struct rq_flags *rf)
1836 __acquires(rq->lock)
1837{
1838 raw_spin_lock(&rq->lock);
1839 rq_repin_lock(rq, rf);
1840}
1841
1842static inline void
1843rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
1844 __releases(rq->lock)
1845{
1846 rq_unpin_lock(rq, rf);
1847 raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
1848}
1849
1850static inline void
1851rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
1852 __releases(rq->lock)
1853{
1854 rq_unpin_lock(rq, rf);
1855 raw_spin_unlock_irq(&rq->lock);
1856}
1857
1858static inline void
1859rq_unlock(struct rq *rq, struct rq_flags *rf)
1860 __releases(rq->lock)
1861{
1862 rq_unpin_lock(rq, rf);
1863 raw_spin_unlock(&rq->lock);
1864}
1865
1866#ifdef CONFIG_SMP 1880#ifdef CONFIG_SMP
1867#ifdef CONFIG_PREEMPT 1881#ifdef CONFIG_PREEMPT
1868 1882
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8aea199a39b4..4904c4677000 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
55# define schedstat_val_or_zero(var) 0 55# define schedstat_val_or_zero(var) 0
56#endif /* CONFIG_SCHEDSTATS */ 56#endif /* CONFIG_SCHEDSTATS */
57 57
58#ifdef CONFIG_PSI
59/*
60 * PSI tracks state that persists across sleeps, such as iowaits and
61 * memory stalls. As a result, it has to distinguish between sleeps,
62 * where a task's runnable state changes, and requeues, where a task
63 * and its state are being moved between CPUs and runqueues.
64 */
65static inline void psi_enqueue(struct task_struct *p, bool wakeup)
66{
67 int clear = 0, set = TSK_RUNNING;
68
69 if (psi_disabled)
70 return;
71
72 if (!wakeup || p->sched_psi_wake_requeue) {
73 if (p->flags & PF_MEMSTALL)
74 set |= TSK_MEMSTALL;
75 if (p->sched_psi_wake_requeue)
76 p->sched_psi_wake_requeue = 0;
77 } else {
78 if (p->in_iowait)
79 clear |= TSK_IOWAIT;
80 }
81
82 psi_task_change(p, clear, set);
83}
84
85static inline void psi_dequeue(struct task_struct *p, bool sleep)
86{
87 int clear = TSK_RUNNING, set = 0;
88
89 if (psi_disabled)
90 return;
91
92 if (!sleep) {
93 if (p->flags & PF_MEMSTALL)
94 clear |= TSK_MEMSTALL;
95 } else {
96 if (p->in_iowait)
97 set |= TSK_IOWAIT;
98 }
99
100 psi_task_change(p, clear, set);
101}
102
103static inline void psi_ttwu_dequeue(struct task_struct *p)
104{
105 if (psi_disabled)
106 return;
107 /*
108 * Is the task being migrated during a wakeup? Make sure to
109 * deregister its sleep-persistent psi states from the old
110 * queue, and let psi_enqueue() know it has to requeue.
111 */
112 if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) {
113 struct rq_flags rf;
114 struct rq *rq;
115 int clear = 0;
116
117 if (p->in_iowait)
118 clear |= TSK_IOWAIT;
119 if (p->flags & PF_MEMSTALL)
120 clear |= TSK_MEMSTALL;
121
122 rq = __task_rq_lock(p, &rf);
123 psi_task_change(p, clear, 0);
124 p->sched_psi_wake_requeue = 1;
125 __task_rq_unlock(rq, &rf);
126 }
127}
128
129static inline void psi_task_tick(struct rq *rq)
130{
131 if (psi_disabled)
132 return;
133
134 if (unlikely(rq->curr->flags & PF_MEMSTALL))
135 psi_memstall_tick(rq->curr, cpu_of(rq));
136}
137#else /* CONFIG_PSI */
138static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
139static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
140static inline void psi_ttwu_dequeue(struct task_struct *p) {}
141static inline void psi_task_tick(struct rq *rq) {}
142#endif /* CONFIG_PSI */
143
58#ifdef CONFIG_SCHED_INFO 144#ifdef CONFIG_SCHED_INFO
59static inline void sched_info_reset_dequeued(struct task_struct *t) 145static inline void sched_info_reset_dequeued(struct task_struct *t)
60{ 146{
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index ec657105edbf..51b78405bf24 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -579,6 +579,73 @@ static noinline void __init kmem_cache_invalid_free(void)
579 kmem_cache_destroy(cache); 579 kmem_cache_destroy(cache);
580} 580}
581 581
582static noinline void __init kasan_memchr(void)
583{
584 char *ptr;
585 size_t size = 24;
586
587 pr_info("out-of-bounds in memchr\n");
588 ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
589 if (!ptr)
590 return;
591
592 memchr(ptr, '1', size + 1);
593 kfree(ptr);
594}
595
596static noinline void __init kasan_memcmp(void)
597{
598 char *ptr;
599 size_t size = 24;
600 int arr[9];
601
602 pr_info("out-of-bounds in memcmp\n");
603 ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
604 if (!ptr)
605 return;
606
607 memset(arr, 0, sizeof(arr));
608 memcmp(ptr, arr, size+1);
609 kfree(ptr);
610}
611
612static noinline void __init kasan_strings(void)
613{
614 char *ptr;
615 size_t size = 24;
616
617 pr_info("use-after-free in strchr\n");
618 ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
619 if (!ptr)
620 return;
621
622 kfree(ptr);
623
624 /*
625 * Try to cause only 1 invalid access (less spam in dmesg).
626 * For that we need ptr to point to zeroed byte.
627 * Skip metadata that could be stored in freed object so ptr
628 * will likely point to zeroed byte.
629 */
630 ptr += 16;
631 strchr(ptr, '1');
632
633 pr_info("use-after-free in strrchr\n");
634 strrchr(ptr, '1');
635
636 pr_info("use-after-free in strcmp\n");
637 strcmp(ptr, "2");
638
639 pr_info("use-after-free in strncmp\n");
640 strncmp(ptr, "2", 1);
641
642 pr_info("use-after-free in strlen\n");
643 strlen(ptr);
644
645 pr_info("use-after-free in strnlen\n");
646 strnlen(ptr, 1);
647}
648
582static int __init kmalloc_tests_init(void) 649static int __init kmalloc_tests_init(void)
583{ 650{
584 /* 651 /*
@@ -618,6 +685,9 @@ static int __init kmalloc_tests_init(void)
618 use_after_scope_test(); 685 use_after_scope_test();
619 kmem_cache_double_free(); 686 kmem_cache_double_free();
620 kmem_cache_invalid_free(); 687 kmem_cache_invalid_free();
688 kasan_memchr();
689 kasan_memcmp();
690 kasan_strings();
621 691
622 kasan_restore_multi_shot(multishot); 692 kasan_restore_multi_shot(multishot);
623 693
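
These new negative tests follow the existing test_kasan pattern: allocate (or free) a buffer, perform one deliberately out-of-bounds or use-after-free access, and let KASAN generate the report. Assuming the usual workflow for this test module, they are exercised by building it as a module and loading it:

    CONFIG_KASAN=y
    CONFIG_TEST_KASAN=m
    # modprobe test_kasan    (each pr_info() above should be followed by a KASAN report in dmesg)
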
diff --git a/mm/compaction.c b/mm/compaction.c
index faca45ebe62d..7c607479de4a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -22,6 +22,7 @@
22#include <linux/kthread.h> 22#include <linux/kthread.h>
23#include <linux/freezer.h> 23#include <linux/freezer.h>
24#include <linux/page_owner.h> 24#include <linux/page_owner.h>
25#include <linux/psi.h>
25#include "internal.h" 26#include "internal.h"
26 27
27#ifdef CONFIG_COMPACTION 28#ifdef CONFIG_COMPACTION
@@ -2068,11 +2069,15 @@ static int kcompactd(void *p)
2068 pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; 2069 pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
2069 2070
2070 while (!kthread_should_stop()) { 2071 while (!kthread_should_stop()) {
2072 unsigned long pflags;
2073
2071 trace_mm_compaction_kcompactd_sleep(pgdat->node_id); 2074 trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
2072 wait_event_freezable(pgdat->kcompactd_wait, 2075 wait_event_freezable(pgdat->kcompactd_wait,
2073 kcompactd_work_requested(pgdat)); 2076 kcompactd_work_requested(pgdat));
2074 2077
2078 psi_memstall_enter(&pflags);
2075 kcompactd_do_work(pgdat); 2079 kcompactd_do_work(pgdat);
2080 psi_memstall_leave(&pflags);
2076 } 2081 }
2077 2082
2078 return 0; 2083 return 0;
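
kcompactd now carries the same pressure-stall annotation that this series adds to reclaim and thrashing paths. The general pattern for marking a section of work that only exists because memory is short looks roughly like this (a sketch; pflags is assumed to save and restore the task's prior PF_MEMSTALL state so the brackets can nest):

    static void sketch_memstall_section(void)
    {
            unsigned long pflags;

            psi_memstall_enter(&pflags);    /* account this task as stalled on memory */
            /* ... reclaim/compaction work that forward progress is waiting on ... */
            psi_memstall_leave(&pflags);    /* restore the previous memstall state */
    }
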
diff --git a/mm/debug.c b/mm/debug.c
index bd10aad8539a..cdacba12e09a 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -13,6 +13,7 @@
13#include <trace/events/mmflags.h> 13#include <trace/events/mmflags.h>
14#include <linux/migrate.h> 14#include <linux/migrate.h>
15#include <linux/page_owner.h> 15#include <linux/page_owner.h>
16#include <linux/ctype.h>
16 17
17#include "internal.h" 18#include "internal.h"
18 19
@@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm)
175 ); 176 );
176} 177}
177 178
179static bool page_init_poisoning __read_mostly = true;
180
181static int __init setup_vm_debug(char *str)
182{
183 bool __page_init_poisoning = true;
184
185 /*
186 * Calling vm_debug with no arguments is equivalent to requesting
187 * to enable all debugging options we can control.
188 */
189 if (*str++ != '=' || !*str)
190 goto out;
191
192 __page_init_poisoning = false;
193 if (*str == '-')
194 goto out;
195
196 while (*str) {
197 switch (tolower(*str)) {
198 case 'p':
199 __page_init_poisoning = true;
200 break;
201 default:
202 pr_err("vm_debug option '%c' unknown. skipped\n",
203 *str);
204 }
205
206 str++;
207 }
208out:
209 if (page_init_poisoning && !__page_init_poisoning)
210 pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n");
211
212 page_init_poisoning = __page_init_poisoning;
213
214 return 1;
215}
216__setup("vm_debug", setup_vm_debug);
217
218void page_init_poison(struct page *page, size_t size)
219{
220 if (page_init_poisoning)
221 memset(page, PAGE_POISON_PATTERN, size);
222}
223EXPORT_SYMBOL_GPL(page_init_poison);
178#endif /* CONFIG_DEBUG_VM */ 224#endif /* CONFIG_DEBUG_VM */
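
Taken together with the parsing logic above, the new boot parameter behaves as follows (examples):

    vm_debug        (or vm_debug=P)  keep page struct poisoning enabled, the default
    vm_debug=-      disable every vm_debug option, including page struct poisoning
    vm_debug=xP     unknown characters are reported and skipped; 'P' still enables poisoning
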
diff --git a/mm/filemap.c b/mm/filemap.c
index 52517f28e6f4..3968da1f7f5a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -36,6 +36,8 @@
36#include <linux/cleancache.h> 36#include <linux/cleancache.h>
37#include <linux/shmem_fs.h> 37#include <linux/shmem_fs.h>
38#include <linux/rmap.h> 38#include <linux/rmap.h>
39#include <linux/delayacct.h>
40#include <linux/psi.h>
39#include "internal.h" 41#include "internal.h"
40 42
41#define CREATE_TRACE_POINTS 43#define CREATE_TRACE_POINTS
@@ -915,12 +917,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
915 * data from the working set, only to cache data that will 917 * data from the working set, only to cache data that will
916 * get overwritten with something else, is a waste of memory. 918 * get overwritten with something else, is a waste of memory.
917 */ 919 */
918 if (!(gfp_mask & __GFP_WRITE) && 920 WARN_ON_ONCE(PageActive(page));
919 shadow && workingset_refault(shadow)) { 921 if (!(gfp_mask & __GFP_WRITE) && shadow)
920 SetPageActive(page); 922 workingset_refault(page, shadow);
921 workingset_activation(page);
922 } else
923 ClearPageActive(page);
924 lru_cache_add(page); 923 lru_cache_add(page);
925 } 924 }
926 return ret; 925 return ret;
@@ -1076,8 +1075,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1076{ 1075{
1077 struct wait_page_queue wait_page; 1076 struct wait_page_queue wait_page;
1078 wait_queue_entry_t *wait = &wait_page.wait; 1077 wait_queue_entry_t *wait = &wait_page.wait;
1078 bool thrashing = false;
1079 unsigned long pflags;
1079 int ret = 0; 1080 int ret = 0;
1080 1081
1082 if (bit_nr == PG_locked &&
1083 !PageUptodate(page) && PageWorkingset(page)) {
1084 if (!PageSwapBacked(page))
1085 delayacct_thrashing_start();
1086 psi_memstall_enter(&pflags);
1087 thrashing = true;
1088 }
1089
1081 init_wait(wait); 1090 init_wait(wait);
1082 wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; 1091 wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
1083 wait->func = wake_page_function; 1092 wait->func = wake_page_function;
@@ -1116,6 +1125,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1116 1125
1117 finish_wait(q, wait); 1126 finish_wait(q, wait);
1118 1127
1128 if (thrashing) {
1129 if (!PageSwapBacked(page))
1130 delayacct_thrashing_end();
1131 psi_memstall_leave(&pflags);
1132 }
1133
1119 /* 1134 /*
1120 * A signal could leave PageWaiters set. Clearing it here if 1135 * A signal could leave PageWaiters set. Clearing it here if
1121 * !waitqueue_active would be possible (by open-coding finish_wait), 1136 * !waitqueue_active would be possible (by open-coding finish_wait),
@@ -2581,9 +2596,7 @@ no_cached_page:
2581 * system is low on memory, or a problem occurs while trying 2596 * system is low on memory, or a problem occurs while trying
2582 * to schedule I/O. 2597 * to schedule I/O.
2583 */ 2598 */
2584 if (error == -ENOMEM) 2599 return vmf_error(error);
2585 return VM_FAULT_OOM;
2586 return VM_FAULT_SIGBUS;
2587 2600
2588page_not_uptodate: 2601page_not_uptodate:
2589 /* 2602 /*
@@ -2748,9 +2761,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
2748 return generic_file_mmap(file, vma); 2761 return generic_file_mmap(file, vma);
2749} 2762}
2750#else 2763#else
2751int filemap_page_mkwrite(struct vm_fault *vmf) 2764vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
2752{ 2765{
2753 return -ENOSYS; 2766 return VM_FAULT_SIGBUS;
2754} 2767}
2755int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2768int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2756{ 2769{
@@ -3012,7 +3025,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
3012 if (iocb->ki_flags & IOCB_NOWAIT) { 3025 if (iocb->ki_flags & IOCB_NOWAIT) {
3013 /* If there are pages to writeback, return */ 3026 /* If there are pages to writeback, return */
3014 if (filemap_range_has_page(inode->i_mapping, pos, 3027 if (filemap_range_has_page(inode->i_mapping, pos,
3015 pos + iov_iter_count(from))) 3028 pos + write_len))
3016 return -EAGAIN; 3029 return -EAGAIN;
3017 } else { 3030 } else {
3018 written = filemap_write_and_wait_range(mapping, pos, 3031 written = filemap_write_and_wait_range(mapping, pos,
diff --git a/mm/gup.c b/mm/gup.c
index 1abc8b4afff6..841d7ef53591 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -20,6 +20,11 @@
20 20
21#include "internal.h" 21#include "internal.h"
22 22
23struct follow_page_context {
24 struct dev_pagemap *pgmap;
25 unsigned int page_mask;
26};
27
23static struct page *no_page_table(struct vm_area_struct *vma, 28static struct page *no_page_table(struct vm_area_struct *vma,
24 unsigned int flags) 29 unsigned int flags)
25{ 30{
@@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
71} 76}
72 77
73static struct page *follow_page_pte(struct vm_area_struct *vma, 78static struct page *follow_page_pte(struct vm_area_struct *vma,
74 unsigned long address, pmd_t *pmd, unsigned int flags) 79 unsigned long address, pmd_t *pmd, unsigned int flags,
80 struct dev_pagemap **pgmap)
75{ 81{
76 struct mm_struct *mm = vma->vm_mm; 82 struct mm_struct *mm = vma->vm_mm;
77 struct dev_pagemap *pgmap = NULL;
78 struct page *page; 83 struct page *page;
79 spinlock_t *ptl; 84 spinlock_t *ptl;
80 pte_t *ptep, pte; 85 pte_t *ptep, pte;
@@ -116,8 +121,8 @@ retry:
116 * Only return device mapping pages in the FOLL_GET case since 121 * Only return device mapping pages in the FOLL_GET case since
117 * they are only valid while holding the pgmap reference. 122 * they are only valid while holding the pgmap reference.
118 */ 123 */
119 pgmap = get_dev_pagemap(pte_pfn(pte), NULL); 124 *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
120 if (pgmap) 125 if (*pgmap)
121 page = pte_page(pte); 126 page = pte_page(pte);
122 else 127 else
123 goto no_page; 128 goto no_page;
@@ -152,15 +157,8 @@ retry:
152 goto retry; 157 goto retry;
153 } 158 }
154 159
155 if (flags & FOLL_GET) { 160 if (flags & FOLL_GET)
156 get_page(page); 161 get_page(page);
157
158 /* drop the pgmap reference now that we hold the page */
159 if (pgmap) {
160 put_dev_pagemap(pgmap);
161 pgmap = NULL;
162 }
163 }
164 if (flags & FOLL_TOUCH) { 162 if (flags & FOLL_TOUCH) {
165 if ((flags & FOLL_WRITE) && 163 if ((flags & FOLL_WRITE) &&
166 !pte_dirty(pte) && !PageDirty(page)) 164 !pte_dirty(pte) && !PageDirty(page))
@@ -210,7 +208,8 @@ no_page:
210 208
211static struct page *follow_pmd_mask(struct vm_area_struct *vma, 209static struct page *follow_pmd_mask(struct vm_area_struct *vma,
212 unsigned long address, pud_t *pudp, 210 unsigned long address, pud_t *pudp,
213 unsigned int flags, unsigned int *page_mask) 211 unsigned int flags,
212 struct follow_page_context *ctx)
214{ 213{
215 pmd_t *pmd, pmdval; 214 pmd_t *pmd, pmdval;
216 spinlock_t *ptl; 215 spinlock_t *ptl;
@@ -258,13 +257,13 @@ retry:
258 } 257 }
259 if (pmd_devmap(pmdval)) { 258 if (pmd_devmap(pmdval)) {
260 ptl = pmd_lock(mm, pmd); 259 ptl = pmd_lock(mm, pmd);
261 page = follow_devmap_pmd(vma, address, pmd, flags); 260 page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
262 spin_unlock(ptl); 261 spin_unlock(ptl);
263 if (page) 262 if (page)
264 return page; 263 return page;
265 } 264 }
266 if (likely(!pmd_trans_huge(pmdval))) 265 if (likely(!pmd_trans_huge(pmdval)))
267 return follow_page_pte(vma, address, pmd, flags); 266 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
268 267
269 if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) 268 if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
270 return no_page_table(vma, flags); 269 return no_page_table(vma, flags);
@@ -284,7 +283,7 @@ retry_locked:
284 } 283 }
285 if (unlikely(!pmd_trans_huge(*pmd))) { 284 if (unlikely(!pmd_trans_huge(*pmd))) {
286 spin_unlock(ptl); 285 spin_unlock(ptl);
287 return follow_page_pte(vma, address, pmd, flags); 286 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
288 } 287 }
289 if (flags & FOLL_SPLIT) { 288 if (flags & FOLL_SPLIT) {
290 int ret; 289 int ret;
@@ -307,18 +306,18 @@ retry_locked:
307 } 306 }
308 307
309 return ret ? ERR_PTR(ret) : 308 return ret ? ERR_PTR(ret) :
310 follow_page_pte(vma, address, pmd, flags); 309 follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
311 } 310 }
312 page = follow_trans_huge_pmd(vma, address, pmd, flags); 311 page = follow_trans_huge_pmd(vma, address, pmd, flags);
313 spin_unlock(ptl); 312 spin_unlock(ptl);
314 *page_mask = HPAGE_PMD_NR - 1; 313 ctx->page_mask = HPAGE_PMD_NR - 1;
315 return page; 314 return page;
316} 315}
317 316
318
319static struct page *follow_pud_mask(struct vm_area_struct *vma, 317static struct page *follow_pud_mask(struct vm_area_struct *vma,
320 unsigned long address, p4d_t *p4dp, 318 unsigned long address, p4d_t *p4dp,
321 unsigned int flags, unsigned int *page_mask) 319 unsigned int flags,
320 struct follow_page_context *ctx)
322{ 321{
323 pud_t *pud; 322 pud_t *pud;
324 spinlock_t *ptl; 323 spinlock_t *ptl;
@@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
344 } 343 }
345 if (pud_devmap(*pud)) { 344 if (pud_devmap(*pud)) {
346 ptl = pud_lock(mm, pud); 345 ptl = pud_lock(mm, pud);
347 page = follow_devmap_pud(vma, address, pud, flags); 346 page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
348 spin_unlock(ptl); 347 spin_unlock(ptl);
349 if (page) 348 if (page)
350 return page; 349 return page;
@@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
352 if (unlikely(pud_bad(*pud))) 351 if (unlikely(pud_bad(*pud)))
353 return no_page_table(vma, flags); 352 return no_page_table(vma, flags);
354 353
355 return follow_pmd_mask(vma, address, pud, flags, page_mask); 354 return follow_pmd_mask(vma, address, pud, flags, ctx);
356} 355}
357 356
358
359static struct page *follow_p4d_mask(struct vm_area_struct *vma, 357static struct page *follow_p4d_mask(struct vm_area_struct *vma,
360 unsigned long address, pgd_t *pgdp, 358 unsigned long address, pgd_t *pgdp,
361 unsigned int flags, unsigned int *page_mask) 359 unsigned int flags,
360 struct follow_page_context *ctx)
362{ 361{
363 p4d_t *p4d; 362 p4d_t *p4d;
364 struct page *page; 363 struct page *page;
@@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
378 return page; 377 return page;
379 return no_page_table(vma, flags); 378 return no_page_table(vma, flags);
380 } 379 }
381 return follow_pud_mask(vma, address, p4d, flags, page_mask); 380 return follow_pud_mask(vma, address, p4d, flags, ctx);
382} 381}
383 382
384/** 383/**
@@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
396 */ 395 */
397struct page *follow_page_mask(struct vm_area_struct *vma, 396struct page *follow_page_mask(struct vm_area_struct *vma,
398 unsigned long address, unsigned int flags, 397 unsigned long address, unsigned int flags,
399 unsigned int *page_mask) 398 struct follow_page_context *ctx)
400{ 399{
401 pgd_t *pgd; 400 pgd_t *pgd;
402 struct page *page; 401 struct page *page;
403 struct mm_struct *mm = vma->vm_mm; 402 struct mm_struct *mm = vma->vm_mm;
404 403
405 *page_mask = 0; 404 ctx->page_mask = 0;
406 405
407 /* make this handle hugepd */ 406 /* make this handle hugepd */
408 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 407 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
@@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
431 return no_page_table(vma, flags); 430 return no_page_table(vma, flags);
432 } 431 }
433 432
434 return follow_p4d_mask(vma, address, pgd, flags, page_mask); 433 return follow_p4d_mask(vma, address, pgd, flags, ctx);
434}
435
436struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
437 unsigned int foll_flags)
438{
439 struct follow_page_context ctx = { NULL };
440 struct page *page;
441
442 page = follow_page_mask(vma, address, foll_flags, &ctx);
443 if (ctx.pgmap)
444 put_dev_pagemap(ctx.pgmap);
445 return page;
435} 446}
436 447
437static int get_gate_page(struct mm_struct *mm, unsigned long address, 448static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
659 unsigned int gup_flags, struct page **pages, 670 unsigned int gup_flags, struct page **pages,
660 struct vm_area_struct **vmas, int *nonblocking) 671 struct vm_area_struct **vmas, int *nonblocking)
661{ 672{
662 long i = 0; 673 long ret = 0, i = 0;
663 unsigned int page_mask;
664 struct vm_area_struct *vma = NULL; 674 struct vm_area_struct *vma = NULL;
675 struct follow_page_context ctx = { NULL };
665 676
666 if (!nr_pages) 677 if (!nr_pages)
667 return 0; 678 return 0;
@@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
691 pages ? &pages[i] : NULL); 702 pages ? &pages[i] : NULL);
692 if (ret) 703 if (ret)
693 return i ? : ret; 704 return i ? : ret;
694 page_mask = 0; 705 ctx.page_mask = 0;
695 goto next_page; 706 goto next_page;
696 } 707 }
697 708
698 if (!vma || check_vma_flags(vma, gup_flags)) 709 if (!vma || check_vma_flags(vma, gup_flags)) {
699 return i ? : -EFAULT; 710 ret = -EFAULT;
711 goto out;
712 }
700 if (is_vm_hugetlb_page(vma)) { 713 if (is_vm_hugetlb_page(vma)) {
701 i = follow_hugetlb_page(mm, vma, pages, vmas, 714 i = follow_hugetlb_page(mm, vma, pages, vmas,
702 &start, &nr_pages, i, 715 &start, &nr_pages, i,
@@ -709,23 +722,26 @@ retry:
709 * If we have a pending SIGKILL, don't keep faulting pages and 722 * If we have a pending SIGKILL, don't keep faulting pages and
710 * potentially allocating memory. 723 * potentially allocating memory.
711 */ 724 */
712 if (unlikely(fatal_signal_pending(current))) 725 if (unlikely(fatal_signal_pending(current))) {
713 return i ? i : -ERESTARTSYS; 726 ret = -ERESTARTSYS;
727 goto out;
728 }
714 cond_resched(); 729 cond_resched();
715 page = follow_page_mask(vma, start, foll_flags, &page_mask); 730
731 page = follow_page_mask(vma, start, foll_flags, &ctx);
716 if (!page) { 732 if (!page) {
717 int ret;
718 ret = faultin_page(tsk, vma, start, &foll_flags, 733 ret = faultin_page(tsk, vma, start, &foll_flags,
719 nonblocking); 734 nonblocking);
720 switch (ret) { 735 switch (ret) {
721 case 0: 736 case 0:
722 goto retry; 737 goto retry;
738 case -EBUSY:
739 ret = 0;
740 /* FALLTHRU */
723 case -EFAULT: 741 case -EFAULT:
724 case -ENOMEM: 742 case -ENOMEM:
725 case -EHWPOISON: 743 case -EHWPOISON:
726 return i ? i : ret; 744 goto out;
727 case -EBUSY:
728 return i;
729 case -ENOENT: 745 case -ENOENT:
730 goto next_page; 746 goto next_page;
731 } 747 }
@@ -737,27 +753,31 @@ retry:
737 */ 753 */
738 goto next_page; 754 goto next_page;
739 } else if (IS_ERR(page)) { 755 } else if (IS_ERR(page)) {
740 return i ? i : PTR_ERR(page); 756 ret = PTR_ERR(page);
757 goto out;
741 } 758 }
742 if (pages) { 759 if (pages) {
743 pages[i] = page; 760 pages[i] = page;
744 flush_anon_page(vma, page, start); 761 flush_anon_page(vma, page, start);
745 flush_dcache_page(page); 762 flush_dcache_page(page);
746 page_mask = 0; 763 ctx.page_mask = 0;
747 } 764 }
748next_page: 765next_page:
749 if (vmas) { 766 if (vmas) {
750 vmas[i] = vma; 767 vmas[i] = vma;
751 page_mask = 0; 768 ctx.page_mask = 0;
752 } 769 }
753 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 770 page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
754 if (page_increm > nr_pages) 771 if (page_increm > nr_pages)
755 page_increm = nr_pages; 772 page_increm = nr_pages;
756 i += page_increm; 773 i += page_increm;
757 start += page_increm * PAGE_SIZE; 774 start += page_increm * PAGE_SIZE;
758 nr_pages -= page_increm; 775 nr_pages -= page_increm;
759 } while (nr_pages); 776 } while (nr_pages);
760 return i; 777out:
778 if (ctx.pgmap)
779 put_dev_pagemap(ctx.pgmap);
780 return i ? i : ret;
761} 781}
762 782
763static bool vma_permits_fault(struct vm_area_struct *vma, 783static bool vma_permits_fault(struct vm_area_struct *vma,
@@ -1780,12 +1800,11 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
1780int __get_user_pages_fast(unsigned long start, int nr_pages, int write, 1800int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1781 struct page **pages) 1801 struct page **pages)
1782{ 1802{
1783 unsigned long addr, len, end; 1803 unsigned long len, end;
1784 unsigned long flags; 1804 unsigned long flags;
1785 int nr = 0; 1805 int nr = 0;
1786 1806
1787 start &= PAGE_MASK; 1807 start &= PAGE_MASK;
1788 addr = start;
1789 len = (unsigned long) nr_pages << PAGE_SHIFT; 1808 len = (unsigned long) nr_pages << PAGE_SHIFT;
1790 end = start + len; 1809 end = start + len;
1791 1810
@@ -1807,7 +1826,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1807 1826
1808 if (gup_fast_permitted(start, nr_pages, write)) { 1827 if (gup_fast_permitted(start, nr_pages, write)) {
1809 local_irq_save(flags); 1828 local_irq_save(flags);
1810 gup_pgd_range(addr, end, write, pages, &nr); 1829 gup_pgd_range(start, end, write, pages, &nr);
1811 local_irq_restore(flags); 1830 local_irq_restore(flags);
1812 } 1831 }
1813 1832
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 7405c9d89d65..debf11388a60 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -6,13 +6,17 @@
6#include <linux/debugfs.h> 6#include <linux/debugfs.h>
7 7
8#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) 8#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
9#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
10#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
9 11
10struct gup_benchmark { 12struct gup_benchmark {
11 __u64 delta_usec; 13 __u64 get_delta_usec;
14 __u64 put_delta_usec;
12 __u64 addr; 15 __u64 addr;
13 __u64 size; 16 __u64 size;
14 __u32 nr_pages_per_call; 17 __u32 nr_pages_per_call;
15 __u32 flags; 18 __u32 flags;
19 __u64 expansion[10]; /* For future use */
16}; 20};
17 21
18static int __gup_benchmark_ioctl(unsigned int cmd, 22static int __gup_benchmark_ioctl(unsigned int cmd,
@@ -41,21 +45,40 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
41 nr = (next - addr) / PAGE_SIZE; 45 nr = (next - addr) / PAGE_SIZE;
42 } 46 }
43 47
44 nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); 48 switch (cmd) {
49 case GUP_FAST_BENCHMARK:
50 nr = get_user_pages_fast(addr, nr, gup->flags & 1,
51 pages + i);
52 break;
53 case GUP_LONGTERM_BENCHMARK:
54 nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
55 pages + i, NULL);
56 break;
57 case GUP_BENCHMARK:
58 nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
59 NULL);
60 break;
61 default:
62 return -1;
63 }
64
45 if (nr <= 0) 65 if (nr <= 0)
46 break; 66 break;
47 i += nr; 67 i += nr;
48 } 68 }
49 end_time = ktime_get(); 69 end_time = ktime_get();
50 70
51 gup->delta_usec = ktime_us_delta(end_time, start_time); 71 gup->get_delta_usec = ktime_us_delta(end_time, start_time);
52 gup->size = addr - gup->addr; 72 gup->size = addr - gup->addr;
53 73
74 start_time = ktime_get();
54 for (i = 0; i < nr_pages; i++) { 75 for (i = 0; i < nr_pages; i++) {
55 if (!pages[i]) 76 if (!pages[i])
56 break; 77 break;
57 put_page(pages[i]); 78 put_page(pages[i]);
58 } 79 }
80 end_time = ktime_get();
81 gup->put_delta_usec = ktime_us_delta(end_time, start_time);
59 82
60 kvfree(pages); 83 kvfree(pages);
61 return 0; 84 return 0;
@@ -67,8 +90,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
67 struct gup_benchmark gup; 90 struct gup_benchmark gup;
68 int ret; 91 int ret;
69 92
70 if (cmd != GUP_FAST_BENCHMARK) 93 switch (cmd) {
94 case GUP_FAST_BENCHMARK:
95 case GUP_LONGTERM_BENCHMARK:
96 case GUP_BENCHMARK:
97 break;
98 default:
71 return -EINVAL; 99 return -EINVAL;
100 }
72 101
73 if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) 102 if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
74 return -EFAULT; 103 return -EFAULT;
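
User space drives these ioctls through the debugfs file that gup_benchmark registers; the vm selftest updated elsewhere in this series does exactly that. A minimal sketch, assuming the conventional /sys/kernel/debug/gup_benchmark path and mirroring the structure layout shown above:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    struct gup_benchmark {                  /* must match the kernel-side layout above */
            uint64_t get_delta_usec;
            uint64_t put_delta_usec;
            uint64_t addr;
            uint64_t size;
            uint32_t nr_pages_per_call;
            uint32_t flags;
            uint64_t expansion[10];
    };

    #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)

    int main(void)
    {
            struct gup_benchmark gup = { .size = 1UL << 24, .nr_pages_per_call = 1024 };
            int fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
            void *p = mmap(NULL, gup.size, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (fd < 0 || p == MAP_FAILED)
                    return 1;
            gup.addr = (unsigned long)p;
            if (ioctl(fd, GUP_FAST_BENCHMARK, &gup))
                    return 1;
            printf("get: %llu us, put: %llu us\n",
                   (unsigned long long)gup.get_delta_usec,
                   (unsigned long long)gup.put_delta_usec);
            return 0;
    }
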
diff --git a/mm/hmm.c b/mm/hmm.c
index c968e49f7a0c..774d684fa2b4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
1024 resource_size_t key, align_start, align_size, align_end; 1024 resource_size_t key, align_start, align_size, align_end;
1025 struct device *device = devmem->device; 1025 struct device *device = devmem->device;
1026 int ret, nid, is_ram; 1026 int ret, nid, is_ram;
1027 unsigned long pfn;
1028 1027
1029 align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); 1028 align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
1030 align_size = ALIGN(devmem->resource->start + 1029 align_size = ALIGN(devmem->resource->start +
@@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
1109 align_size >> PAGE_SHIFT, NULL); 1108 align_size >> PAGE_SHIFT, NULL);
1110 mem_hotplug_done(); 1109 mem_hotplug_done();
1111 1110
1112 for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { 1111 /*
1113 struct page *page = pfn_to_page(pfn); 1112 * Initialization of the pages has been deferred until now in order
1113 * to allow us to do the work while not holding the hotplug lock.
1114 */
1115 memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
1116 align_start >> PAGE_SHIFT,
1117 align_size >> PAGE_SHIFT, &devmem->pagemap);
1114 1118
1115 page->pgmap = &devmem->pagemap;
1116 }
1117 return 0; 1119 return 0;
1118 1120
1119error_add_memory: 1121error_add_memory:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index deed97fba979..25ef59b7ee34 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
852} 852}
853 853
854struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, 854struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
855 pmd_t *pmd, int flags) 855 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
856{ 856{
857 unsigned long pfn = pmd_pfn(*pmd); 857 unsigned long pfn = pmd_pfn(*pmd);
858 struct mm_struct *mm = vma->vm_mm; 858 struct mm_struct *mm = vma->vm_mm;
859 struct dev_pagemap *pgmap;
860 struct page *page; 859 struct page *page;
861 860
862 assert_spin_locked(pmd_lockptr(mm, pmd)); 861 assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
886 return ERR_PTR(-EEXIST); 885 return ERR_PTR(-EEXIST);
887 886
888 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; 887 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
889 pgmap = get_dev_pagemap(pfn, NULL); 888 *pgmap = get_dev_pagemap(pfn, *pgmap);
890 if (!pgmap) 889 if (!*pgmap)
891 return ERR_PTR(-EFAULT); 890 return ERR_PTR(-EFAULT);
892 page = pfn_to_page(pfn); 891 page = pfn_to_page(pfn);
893 get_page(page); 892 get_page(page);
894 put_dev_pagemap(pgmap);
895 893
896 return page; 894 return page;
897} 895}
@@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1000} 998}
1001 999
1002struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, 1000struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1003 pud_t *pud, int flags) 1001 pud_t *pud, int flags, struct dev_pagemap **pgmap)
1004{ 1002{
1005 unsigned long pfn = pud_pfn(*pud); 1003 unsigned long pfn = pud_pfn(*pud);
1006 struct mm_struct *mm = vma->vm_mm; 1004 struct mm_struct *mm = vma->vm_mm;
1007 struct dev_pagemap *pgmap;
1008 struct page *page; 1005 struct page *page;
1009 1006
1010 assert_spin_locked(pud_lockptr(mm, pud)); 1007 assert_spin_locked(pud_lockptr(mm, pud));
@@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1028 return ERR_PTR(-EEXIST); 1025 return ERR_PTR(-EEXIST);
1029 1026
1030 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; 1027 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1031 pgmap = get_dev_pagemap(pfn, NULL); 1028 *pgmap = get_dev_pagemap(pfn, *pgmap);
1032 if (!pgmap) 1029 if (!*pgmap)
1033 return ERR_PTR(-EFAULT); 1030 return ERR_PTR(-EFAULT);
1034 page = pfn_to_page(pfn); 1031 page = pfn_to_page(pfn);
1035 get_page(page); 1032 get_page(page);
1036 put_dev_pagemap(pgmap);
1037 1033
1038 return page; 1034 return page;
1039} 1035}
@@ -1562,8 +1558,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1562 * We are not sure a pending tlb flush here is for a huge page 1558 * We are not sure a pending tlb flush here is for a huge page
1563 * mapping or not. Hence use the tlb range variant 1559 * mapping or not. Hence use the tlb range variant
1564 */ 1560 */
1565 if (mm_tlb_flush_pending(vma->vm_mm)) 1561 if (mm_tlb_flush_pending(vma->vm_mm)) {
1566 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); 1562 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
1563 /*
1564 * change_huge_pmd() released the pmd lock before
1565 * invalidating the secondary MMUs sharing the primary
1566 * MMU pagetables (with ->invalidate_range()). The
1567 * mmu_notifier_invalidate_range_end() (which
1568 * internally calls ->invalidate_range()) in
1569 * change_pmd_range() will run after us, so we can't
1570 * rely on it here and we need an explicit invalidate.
1571 */
1572 mmu_notifier_invalidate_range(vma->vm_mm, haddr,
1573 haddr + HPAGE_PMD_SIZE);
1574 }
1567 1575
1568 /* 1576 /*
1569 * Migrate the THP to the requested node, returns with page unlocked 1577 * Migrate the THP to the requested node, returns with page unlocked
@@ -2369,6 +2377,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
2369 (1L << PG_mlocked) | 2377 (1L << PG_mlocked) |
2370 (1L << PG_uptodate) | 2378 (1L << PG_uptodate) |
2371 (1L << PG_active) | 2379 (1L << PG_active) |
2380 (1L << PG_workingset) |
2372 (1L << PG_locked) | 2381 (1L << PG_locked) |
2373 (1L << PG_unevictable) | 2382 (1L << PG_unevictable) |
2374 (1L << PG_dirty))); 2383 (1L << PG_dirty)));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5c390f5a5207..7b5c0ad9a6bd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3690,6 +3690,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
3690 return err; 3690 return err;
3691 ClearPagePrivate(page); 3691 ClearPagePrivate(page);
3692 3692
3693 /*
3694 * set page dirty so that it will not be removed from cache/file
3695 * by non-hugetlbfs specific code paths.
3696 */
3697 set_page_dirty(page);
3698
3693 spin_lock(&inode->i_lock); 3699 spin_lock(&inode->i_lock);
3694 inode->i_blocks += blocks_per_huge_page(h); 3700 inode->i_blocks += blocks_per_huge_page(h);
3695 spin_unlock(&inode->i_lock); 3701 spin_unlock(&inode->i_lock);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 3a8ddf8baf7d..b209dbaefde8 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -103,7 +103,7 @@ static int quarantine_head;
103static int quarantine_tail; 103static int quarantine_tail;
104/* Total size of all objects in global_quarantine across all batches. */ 104/* Total size of all objects in global_quarantine across all batches. */
105static unsigned long quarantine_size; 105static unsigned long quarantine_size;
106static DEFINE_SPINLOCK(quarantine_lock); 106static DEFINE_RAW_SPINLOCK(quarantine_lock);
107DEFINE_STATIC_SRCU(remove_cache_srcu); 107DEFINE_STATIC_SRCU(remove_cache_srcu);
108 108
109/* Maximum size of the global queue. */ 109/* Maximum size of the global queue. */
@@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
190 if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { 190 if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
191 qlist_move_all(q, &temp); 191 qlist_move_all(q, &temp);
192 192
193 spin_lock(&quarantine_lock); 193 raw_spin_lock(&quarantine_lock);
194 WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); 194 WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
195 qlist_move_all(&temp, &global_quarantine[quarantine_tail]); 195 qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
196 if (global_quarantine[quarantine_tail].bytes >= 196 if (global_quarantine[quarantine_tail].bytes >=
@@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
203 if (new_tail != quarantine_head) 203 if (new_tail != quarantine_head)
204 quarantine_tail = new_tail; 204 quarantine_tail = new_tail;
205 } 205 }
206 spin_unlock(&quarantine_lock); 206 raw_spin_unlock(&quarantine_lock);
207 } 207 }
208 208
209 local_irq_restore(flags); 209 local_irq_restore(flags);
@@ -230,7 +230,7 @@ void quarantine_reduce(void)
230 * expected case). 230 * expected case).
231 */ 231 */
232 srcu_idx = srcu_read_lock(&remove_cache_srcu); 232 srcu_idx = srcu_read_lock(&remove_cache_srcu);
233 spin_lock_irqsave(&quarantine_lock, flags); 233 raw_spin_lock_irqsave(&quarantine_lock, flags);
234 234
235 /* 235 /*
236 * Update quarantine size in case of hotplug. Allocate a fraction of 236 * Update quarantine size in case of hotplug. Allocate a fraction of
@@ -254,7 +254,7 @@ void quarantine_reduce(void)
254 quarantine_head = 0; 254 quarantine_head = 0;
255 } 255 }
256 256
257 spin_unlock_irqrestore(&quarantine_lock, flags); 257 raw_spin_unlock_irqrestore(&quarantine_lock, flags);
258 258
259 qlist_free_all(&to_free, NULL); 259 qlist_free_all(&to_free, NULL);
260 srcu_read_unlock(&remove_cache_srcu, srcu_idx); 260 srcu_read_unlock(&remove_cache_srcu, srcu_idx);
@@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem_cache *cache)
310 */ 310 */
311 on_each_cpu(per_cpu_remove_cache, cache, 1); 311 on_each_cpu(per_cpu_remove_cache, cache, 1);
312 312
313 spin_lock_irqsave(&quarantine_lock, flags); 313 raw_spin_lock_irqsave(&quarantine_lock, flags);
314 for (i = 0; i < QUARANTINE_BATCHES; i++) { 314 for (i = 0; i < QUARANTINE_BATCHES; i++) {
315 if (qlist_empty(&global_quarantine[i])) 315 if (qlist_empty(&global_quarantine[i]))
316 continue; 316 continue;
317 qlist_move_cache(&global_quarantine[i], &to_free, cache); 317 qlist_move_cache(&global_quarantine[i], &to_free, cache);
318 /* Scanning whole quarantine can take a while. */ 318 /* Scanning whole quarantine can take a while. */
319 spin_unlock_irqrestore(&quarantine_lock, flags); 319 raw_spin_unlock_irqrestore(&quarantine_lock, flags);
320 cond_resched(); 320 cond_resched();
321 spin_lock_irqsave(&quarantine_lock, flags); 321 raw_spin_lock_irqsave(&quarantine_lock, flags);
322 } 322 }
323 spin_unlock_irqrestore(&quarantine_lock, flags); 323 raw_spin_unlock_irqrestore(&quarantine_lock, flags);
324 324
325 qlist_free_all(&to_free, cache); 325 qlist_free_all(&to_free, cache);
326 326
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 17dd883198ae..4f7e4b5a2f08 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -86,6 +86,7 @@
86#include <linux/seq_file.h> 86#include <linux/seq_file.h>
87#include <linux/cpumask.h> 87#include <linux/cpumask.h>
88#include <linux/spinlock.h> 88#include <linux/spinlock.h>
89#include <linux/module.h>
89#include <linux/mutex.h> 90#include <linux/mutex.h>
90#include <linux/rcupdate.h> 91#include <linux/rcupdate.h>
91#include <linux/stacktrace.h> 92#include <linux/stacktrace.h>
@@ -181,6 +182,7 @@ struct kmemleak_object {
181/* flag set to not scan the object */ 182/* flag set to not scan the object */
182#define OBJECT_NO_SCAN (1 << 2) 183#define OBJECT_NO_SCAN (1 << 2)
183 184
185#define HEX_PREFIX " "
184/* number of bytes to print per line; must be 16 or 32 */ 186/* number of bytes to print per line; must be 16 or 32 */
185#define HEX_ROW_SIZE 16 187#define HEX_ROW_SIZE 16
186/* number of bytes to print at a time (1, 2, 4, 8) */ 188/* number of bytes to print at a time (1, 2, 4, 8) */
@@ -235,6 +237,9 @@ static int kmemleak_skip_disable;
235/* If there are leaks that can be reported */ 237/* If there are leaks that can be reported */
236static bool kmemleak_found_leaks; 238static bool kmemleak_found_leaks;
237 239
240static bool kmemleak_verbose;
241module_param_named(verbose, kmemleak_verbose, bool, 0600);
242
238/* 243/*
239 * Early object allocation/freeing logging. Kmemleak is initialized after the 244 * Early object allocation/freeing logging. Kmemleak is initialized after the
240 * kernel allocator. However, both the kernel allocator and kmemleak may 245 * kernel allocator. However, both the kernel allocator and kmemleak may
@@ -299,6 +304,25 @@ static void kmemleak_disable(void);
299 kmemleak_disable(); \ 304 kmemleak_disable(); \
300} while (0) 305} while (0)
301 306
307#define warn_or_seq_printf(seq, fmt, ...) do { \
308 if (seq) \
309 seq_printf(seq, fmt, ##__VA_ARGS__); \
310 else \
311 pr_warn(fmt, ##__VA_ARGS__); \
312} while (0)
313
314static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type,
315 int rowsize, int groupsize, const void *buf,
316 size_t len, bool ascii)
317{
318 if (seq)
319 seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize,
320 buf, len, ascii);
321 else
322 print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type,
323 rowsize, groupsize, buf, len, ascii);
324}
325
302/* 326/*
303 * Printing of the objects hex dump to the seq file. The number of lines to be 327 * Printing of the objects hex dump to the seq file. The number of lines to be
304 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The 328 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
@@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq,
314 /* limit the number of lines to HEX_MAX_LINES */ 338 /* limit the number of lines to HEX_MAX_LINES */
315 len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); 339 len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
316 340
317 seq_printf(seq, " hex dump (first %zu bytes):\n", len); 341 warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
318 kasan_disable_current(); 342 kasan_disable_current();
319 seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, 343 warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
320 HEX_GROUP_SIZE, ptr, len, HEX_ASCII); 344 HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
321 kasan_enable_current(); 345 kasan_enable_current();
322} 346}
323 347
@@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq,
365 int i; 389 int i;
366 unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); 390 unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
367 391
368 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", 392 warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
369 object->pointer, object->size); 393 object->pointer, object->size);
370 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", 394 warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
371 object->comm, object->pid, object->jiffies, 395 object->comm, object->pid, object->jiffies,
372 msecs_age / 1000, msecs_age % 1000); 396 msecs_age / 1000, msecs_age % 1000);
373 hex_dump_object(seq, object); 397 hex_dump_object(seq, object);
374 seq_printf(seq, " backtrace:\n"); 398 warn_or_seq_printf(seq, " backtrace:\n");
375 399
376 for (i = 0; i < object->trace_len; i++) { 400 for (i = 0; i < object->trace_len; i++) {
377 void *ptr = (void *)object->trace[i]; 401 void *ptr = (void *)object->trace[i];
378 seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); 402 warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
379 } 403 }
380} 404}
381 405
@@ -1598,6 +1622,10 @@ static void kmemleak_scan(void)
1598 if (unreferenced_object(object) && 1622 if (unreferenced_object(object) &&
1599 !(object->flags & OBJECT_REPORTED)) { 1623 !(object->flags & OBJECT_REPORTED)) {
1600 object->flags |= OBJECT_REPORTED; 1624 object->flags |= OBJECT_REPORTED;
1625
1626 if (kmemleak_verbose)
1627 print_unreferenced(NULL, object);
1628
1601 new_leaks++; 1629 new_leaks++;
1602 } 1630 }
1603 spin_unlock_irqrestore(&object->lock, flags); 1631 spin_unlock_irqrestore(&object->lock, flags);
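
Since verbose is a plain module parameter (mode 0600), it can be turned on either at boot or at run time, assuming the standard parameter plumbing:

    kmemleak.verbose=1                                  on the kernel command line
    echo 1 > /sys/module/kmemleak/parameters/verbose    from a root shell at run time

With it set, each newly detected leak is printed to the kernel log via print_unreferenced(NULL, object) instead of only being available through /sys/kernel/debug/kmemleak.
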
diff --git a/mm/memblock.c b/mm/memblock.c
index 237944479d25..a85315083b5a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1444,10 +1444,9 @@ void * __init memblock_virt_alloc_try_nid_raw(
1444 1444
1445 ptr = memblock_virt_alloc_internal(size, align, 1445 ptr = memblock_virt_alloc_internal(size, align,
1446 min_addr, max_addr, nid); 1446 min_addr, max_addr, nid);
1447#ifdef CONFIG_DEBUG_VM
1448 if (ptr && size > 0) 1447 if (ptr && size > 0)
1449 memset(ptr, PAGE_POISON_PATTERN, size); 1448 page_init_poison(ptr, size);
1450#endif 1449
1451 return ptr; 1450 return ptr;
1452} 1451}
1453 1452
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e79cb59552d9..10a9b554d69f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1669,6 +1669,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
1669 if (order > PAGE_ALLOC_COSTLY_ORDER) 1669 if (order > PAGE_ALLOC_COSTLY_ORDER)
1670 return OOM_SKIPPED; 1670 return OOM_SKIPPED;
1671 1671
1672 memcg_memory_event(memcg, MEMCG_OOM);
1673
1672 /* 1674 /*
1673 * We are in the middle of the charge context here, so we 1675 * We are in the middle of the charge context here, so we
1674 * don't want to block when potentially sitting on a callstack 1676 * don't want to block when potentially sitting on a callstack
@@ -2250,8 +2252,6 @@ retry:
2250 if (fatal_signal_pending(current)) 2252 if (fatal_signal_pending(current))
2251 goto force; 2253 goto force;
2252 2254
2253 memcg_memory_event(mem_over_limit, MEMCG_OOM);
2254
2255 /* 2255 /*
2256 * keep retrying as long as the memcg oom killer is able to make 2256 * keep retrying as long as the memcg oom killer is able to make
2257 * a forward progress or bypass the charge if the oom killer 2257 * a forward progress or bypass the charge if the oom killer
@@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w)
2460/* 2460/*
2461 * Enqueue the creation of a per-memcg kmem_cache. 2461 * Enqueue the creation of a per-memcg kmem_cache.
2462 */ 2462 */
2463static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2463static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2464 struct kmem_cache *cachep) 2464 struct kmem_cache *cachep)
2465{ 2465{
2466 struct memcg_kmem_cache_create_work *cw; 2466 struct memcg_kmem_cache_create_work *cw;
@@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2478 queue_work(memcg_kmem_cache_wq, &cw->work); 2478 queue_work(memcg_kmem_cache_wq, &cw->work);
2479} 2479}
2480 2480
2481static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2482 struct kmem_cache *cachep)
2483{
2484 /*
2485 * We need to stop accounting when we kmalloc, because if the
2486 * corresponding kmalloc cache is not yet created, the first allocation
2487 * in __memcg_schedule_kmem_cache_create will recurse.
2488 *
2489 * However, it is better to enclose the whole function. Depending on
2490 * the debugging options enabled, INIT_WORK(), for instance, can
2491 * trigger an allocation. This too, will make us recurse. Because at
2492 * this point we can't allow ourselves back into memcg_kmem_get_cache,
2493 * the safest choice is to do it like this, wrapping the whole function.
2494 */
2495 current->memcg_kmem_skip_account = 1;
2496 __memcg_schedule_kmem_cache_create(memcg, cachep);
2497 current->memcg_kmem_skip_account = 0;
2498}
2499
2500static inline bool memcg_kmem_bypass(void) 2481static inline bool memcg_kmem_bypass(void)
2501{ 2482{
2502 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 2483 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
@@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2531 if (memcg_kmem_bypass()) 2512 if (memcg_kmem_bypass())
2532 return cachep; 2513 return cachep;
2533 2514
2534 if (current->memcg_kmem_skip_account)
2535 return cachep;
2536
2537 memcg = get_mem_cgroup_from_current(); 2515 memcg = get_mem_cgroup_from_current();
2538 kmemcg_id = READ_ONCE(memcg->kmemcg_id); 2516 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2539 if (kmemcg_id < 0) 2517 if (kmemcg_id < 0)
@@ -4321,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4321 4299
4322static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) 4300static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4323{ 4301{
4324 VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); 4302 refcount_add(n, &memcg->id.ref);
4325 atomic_add(n, &memcg->id.ref);
4326} 4303}
4327 4304
4328static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 4305static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4329{ 4306{
4330 VM_BUG_ON(atomic_read(&memcg->id.ref) < n); 4307 if (refcount_sub_and_test(n, &memcg->id.ref)) {
4331 if (atomic_sub_and_test(n, &memcg->id.ref)) {
4332 mem_cgroup_id_remove(memcg); 4308 mem_cgroup_id_remove(memcg);
4333 4309
4334 /* Memcg ID pins CSS */ 4310 /* Memcg ID pins CSS */
@@ -4545,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4545 } 4521 }
4546 4522
4547 /* Online state pins memcg ID, memcg ID pins CSS */ 4523 /* Online state pins memcg ID, memcg ID pins CSS */
4548 atomic_set(&memcg->id.ref, 1); 4524 refcount_set(&memcg->id.ref, 1);
4549 css_get(css); 4525 css_get(css);
4550 return 0; 4526 return 0;
4551} 4527}
@@ -4573,6 +4549,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4573 memcg_offline_kmem(memcg); 4549 memcg_offline_kmem(memcg);
4574 wb_memcg_offline(memcg); 4550 wb_memcg_offline(memcg);
4575 4551
4552 drain_all_stock(memcg);
4553
4576 mem_cgroup_id_put(memcg); 4554 mem_cgroup_id_put(memcg);
4577} 4555}
4578 4556
@@ -5595,6 +5573,13 @@ static int memory_stat_show(struct seq_file *m, void *v)
5595 seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); 5573 seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5596 seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); 5574 seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5597 5575
5576 seq_printf(m, "workingset_refault %lu\n",
5577 acc.stat[WORKINGSET_REFAULT]);
5578 seq_printf(m, "workingset_activate %lu\n",
5579 acc.stat[WORKINGSET_ACTIVATE]);
5580 seq_printf(m, "workingset_nodereclaim %lu\n",
5581 acc.stat[WORKINGSET_NODERECLAIM]);
5582
5598 seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); 5583 seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5599 seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + 5584 seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5600 acc.events[PGSCAN_DIRECT]); 5585 acc.events[PGSCAN_DIRECT]);
@@ -5605,13 +5590,6 @@ static int memory_stat_show(struct seq_file *m, void *v)
5605 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); 5590 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5606 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); 5591 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5607 5592
5608 seq_printf(m, "workingset_refault %lu\n",
5609 acc.stat[WORKINGSET_REFAULT]);
5610 seq_printf(m, "workingset_activate %lu\n",
5611 acc.stat[WORKINGSET_ACTIVATE]);
5612 seq_printf(m, "workingset_nodereclaim %lu\n",
5613 acc.stat[WORKINGSET_NODERECLAIM]);
5614
5615 return 0; 5593 return 0;
5616} 5594}
5617 5595
@@ -6377,7 +6355,7 @@ subsys_initcall(mem_cgroup_init);
6377#ifdef CONFIG_MEMCG_SWAP 6355#ifdef CONFIG_MEMCG_SWAP
6378static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 6356static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
6379{ 6357{
6380 while (!atomic_inc_not_zero(&memcg->id.ref)) { 6358 while (!refcount_inc_not_zero(&memcg->id.ref)) {
6381 /* 6359 /*
6382 * The root cgroup cannot be destroyed, so it's refcount must 6360 * The root cgroup cannot be destroyed, so it's refcount must
6383 * always be >= 1. 6361 * always be >= 1.
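
The memcg ID reference count moves from an open-coded atomic_t with VM_BUG_ON() sanity checks to refcount_t, whose API saturates instead of wrapping and so catches underflow/overflow on its own. A minimal sketch of the same lifecycle using the primitives that appear in this hunk (the struct and function names are illustrative):

    #include <linux/printk.h>
    #include <linux/refcount.h>

    struct sketch_obj {
            refcount_t ref;
    };

    static void sketch_lifecycle(struct sketch_obj *obj)
    {
            refcount_set(&obj->ref, 1);                     /* object goes live with one reference */

            if (refcount_inc_not_zero(&obj->ref)) {         /* lookup: take a ref only if still live */
                    /* ... use obj ... */
                    if (refcount_sub_and_test(1, &obj->ref))
                            pr_info("lookup dropped the last reference\n");
            }

            if (refcount_sub_and_test(1, &obj->ref))        /* drop the initial reference */
                    pr_info("last reference gone, object may be freed\n");
    }
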
diff --git a/mm/memory.c b/mm/memory.c
index 21a5e6e4758b..072139579d89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1520,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1520} 1520}
1521EXPORT_SYMBOL(vm_insert_page); 1521EXPORT_SYMBOL(vm_insert_page);
1522 1522
1523static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1523static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1524 pfn_t pfn, pgprot_t prot, bool mkwrite) 1524 pfn_t pfn, pgprot_t prot, bool mkwrite)
1525{ 1525{
1526 struct mm_struct *mm = vma->vm_mm; 1526 struct mm_struct *mm = vma->vm_mm;
1527 int retval;
1528 pte_t *pte, entry; 1527 pte_t *pte, entry;
1529 spinlock_t *ptl; 1528 spinlock_t *ptl;
1530 1529
1531 retval = -ENOMEM;
1532 pte = get_locked_pte(mm, addr, &ptl); 1530 pte = get_locked_pte(mm, addr, &ptl);
1533 if (!pte) 1531 if (!pte)
1534 goto out; 1532 return VM_FAULT_OOM;
1535 retval = -EBUSY;
1536 if (!pte_none(*pte)) { 1533 if (!pte_none(*pte)) {
1537 if (mkwrite) { 1534 if (mkwrite) {
1538 /* 1535 /*
@@ -1565,56 +1562,32 @@ out_mkwrite:
1565 set_pte_at(mm, addr, pte, entry); 1562 set_pte_at(mm, addr, pte, entry);
1566 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1563 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1567 1564
1568 retval = 0;
1569out_unlock: 1565out_unlock:
1570 pte_unmap_unlock(pte, ptl); 1566 pte_unmap_unlock(pte, ptl);
1571out: 1567 return VM_FAULT_NOPAGE;
1572 return retval;
1573}
1574
1575/**
1576 * vm_insert_pfn - insert single pfn into user vma
1577 * @vma: user vma to map to
1578 * @addr: target user address of this page
1579 * @pfn: source kernel pfn
1580 *
1581 * Similar to vm_insert_page, this allows drivers to insert individual pages
1582 * they've allocated into a user vma. Same comments apply.
1583 *
1584 * This function should only be called from a vm_ops->fault handler, and
1585 * in that case the handler should return NULL.
1586 *
1587 * vma cannot be a COW mapping.
1588 *
1589 * As this is called only for pages that do not currently exist, we
1590 * do not need to flush old virtual caches or the TLB.
1591 */
1592int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1593 unsigned long pfn)
1594{
1595 return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1596} 1568}
1597EXPORT_SYMBOL(vm_insert_pfn);
1598 1569
1599/** 1570/**
1600 * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot 1571 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
1601 * @vma: user vma to map to 1572 * @vma: user vma to map to
1602 * @addr: target user address of this page 1573 * @addr: target user address of this page
1603 * @pfn: source kernel pfn 1574 * @pfn: source kernel pfn
1604 * @pgprot: pgprot flags for the inserted page 1575 * @pgprot: pgprot flags for the inserted page
1605 * 1576 *
1606 * This is exactly like vm_insert_pfn, except that it allows drivers to 1577 * This is exactly like vmf_insert_pfn(), except that it allows drivers to
1607 * to override pgprot on a per-page basis. 1578 * to override pgprot on a per-page basis.
1608 * 1579 *
1609 * This only makes sense for IO mappings, and it makes no sense for 1580 * This only makes sense for IO mappings, and it makes no sense for
1610 * cow mappings. In general, using multiple vmas is preferable; 1581 * COW mappings. In general, using multiple vmas is preferable;
1611 * vm_insert_pfn_prot should only be used if using multiple VMAs is 1582 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
1612 * impractical. 1583 * impractical.
1584 *
1585 * Context: Process context. May allocate using %GFP_KERNEL.
1586 * Return: vm_fault_t value.
1613 */ 1587 */
1614int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, 1588vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1615 unsigned long pfn, pgprot_t pgprot) 1589 unsigned long pfn, pgprot_t pgprot)
1616{ 1590{
1617 int ret;
1618 /* 1591 /*
1619 * Technically, architectures with pte_special can avoid all these 1592 * Technically, architectures with pte_special can avoid all these
1620 * restrictions (same for remap_pfn_range). However we would like 1593 * restrictions (same for remap_pfn_range). However we would like
@@ -1628,19 +1601,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1628 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 1601 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1629 1602
1630 if (addr < vma->vm_start || addr >= vma->vm_end) 1603 if (addr < vma->vm_start || addr >= vma->vm_end)
1631 return -EFAULT; 1604 return VM_FAULT_SIGBUS;
1632 1605
1633 if (!pfn_modify_allowed(pfn, pgprot)) 1606 if (!pfn_modify_allowed(pfn, pgprot))
1634 return -EACCES; 1607 return VM_FAULT_SIGBUS;
1635 1608
1636 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); 1609 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1637 1610
1638 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, 1611 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1639 false); 1612 false);
1613}
1614EXPORT_SYMBOL(vmf_insert_pfn_prot);
1640 1615
1641 return ret; 1616/**
1617 * vmf_insert_pfn - insert single pfn into user vma
1618 * @vma: user vma to map to
1619 * @addr: target user address of this page
1620 * @pfn: source kernel pfn
1621 *
1622 * Similar to vm_insert_page, this allows drivers to insert individual pages
1623 * they've allocated into a user vma. Same comments apply.
1624 *
1625 * This function should only be called from a vm_ops->fault handler, and
1626 * in that case the handler should return the result of this function.
1627 *
1628 * vma cannot be a COW mapping.
1629 *
1630 * As this is called only for pages that do not currently exist, we
1631 * do not need to flush old virtual caches or the TLB.
1632 *
1633 * Context: Process context. May allocate using %GFP_KERNEL.
1634 * Return: vm_fault_t value.
1635 */
1636vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1637 unsigned long pfn)
1638{
1639 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1642} 1640}
1643EXPORT_SYMBOL(vm_insert_pfn_prot); 1641EXPORT_SYMBOL(vmf_insert_pfn);
1644 1642
1645static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) 1643static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1646{ 1644{
@@ -1656,20 +1654,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1656 return false; 1654 return false;
1657} 1655}
1658 1656
1659static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1657static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
1660 pfn_t pfn, bool mkwrite) 1658 unsigned long addr, pfn_t pfn, bool mkwrite)
1661{ 1659{
1662 pgprot_t pgprot = vma->vm_page_prot; 1660 pgprot_t pgprot = vma->vm_page_prot;
1661 int err;
1663 1662
1664 BUG_ON(!vm_mixed_ok(vma, pfn)); 1663 BUG_ON(!vm_mixed_ok(vma, pfn));
1665 1664
1666 if (addr < vma->vm_start || addr >= vma->vm_end) 1665 if (addr < vma->vm_start || addr >= vma->vm_end)
1667 return -EFAULT; 1666 return VM_FAULT_SIGBUS;
1668 1667
1669 track_pfn_insert(vma, &pgprot, pfn); 1668 track_pfn_insert(vma, &pgprot, pfn);
1670 1669
1671 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) 1670 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
1672 return -EACCES; 1671 return VM_FAULT_SIGBUS;
1673 1672
1674 /* 1673 /*
1675 * If we don't have pte special, then we have to use the pfn_valid() 1674 * If we don't have pte special, then we have to use the pfn_valid()
@@ -1688,36 +1687,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1688 * result in pfn_t_has_page() == false. 1687 * result in pfn_t_has_page() == false.
1689 */ 1688 */
1690 page = pfn_to_page(pfn_t_to_pfn(pfn)); 1689 page = pfn_to_page(pfn_t_to_pfn(pfn));
1691 return insert_page(vma, addr, page, pgprot); 1690 err = insert_page(vma, addr, page, pgprot);
1691 } else {
1692 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1692 } 1693 }
1693 return insert_pfn(vma, addr, pfn, pgprot, mkwrite); 1694
1695 if (err == -ENOMEM)
1696 return VM_FAULT_OOM;
1697 if (err < 0 && err != -EBUSY)
1698 return VM_FAULT_SIGBUS;
1699
1700 return VM_FAULT_NOPAGE;
1694} 1701}
1695 1702
1696int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1703vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1697 pfn_t pfn) 1704 pfn_t pfn)
1698{ 1705{
1699 return __vm_insert_mixed(vma, addr, pfn, false); 1706 return __vm_insert_mixed(vma, addr, pfn, false);
1700
1701} 1707}
1702EXPORT_SYMBOL(vm_insert_mixed); 1708EXPORT_SYMBOL(vmf_insert_mixed);
1703 1709
1704/* 1710/*
1705 * If the insertion of PTE failed because someone else already added a 1711 * If the insertion of PTE failed because someone else already added a
1706 * different entry in the mean time, we treat that as success as we assume 1712 * different entry in the mean time, we treat that as success as we assume
1707 * the same entry was actually inserted. 1713 * the same entry was actually inserted.
1708 */ 1714 */
1709
1710vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, 1715vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
1711 unsigned long addr, pfn_t pfn) 1716 unsigned long addr, pfn_t pfn)
1712{ 1717{
1713 int err; 1718 return __vm_insert_mixed(vma, addr, pfn, true);
1714
1715 err = __vm_insert_mixed(vma, addr, pfn, true);
1716 if (err == -ENOMEM)
1717 return VM_FAULT_OOM;
1718 if (err < 0 && err != -EBUSY)
1719 return VM_FAULT_SIGBUS;
1720 return VM_FAULT_NOPAGE;
1721} 1719}
1722EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); 1720EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
1723 1721
@@ -3498,10 +3496,36 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
3498 struct vm_area_struct *vma = vmf->vma; 3496 struct vm_area_struct *vma = vmf->vma;
3499 vm_fault_t ret; 3497 vm_fault_t ret;
3500 3498
3501 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3499 /*
3502 if (!vma->vm_ops->fault) 3500 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
3503 ret = VM_FAULT_SIGBUS; 3501 */
3504 else if (!(vmf->flags & FAULT_FLAG_WRITE)) 3502 if (!vma->vm_ops->fault) {
3503 /*
3504 * If we find a migration pmd entry or a none pmd entry, which
3505 * should never happen, return SIGBUS
3506 */
3507 if (unlikely(!pmd_present(*vmf->pmd)))
3508 ret = VM_FAULT_SIGBUS;
3509 else {
3510 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
3511 vmf->pmd,
3512 vmf->address,
3513 &vmf->ptl);
3514 /*
3515 * Make sure this is not a temporary clearing of pte
 3516			 * by holding ptl and checking again. An R/M/W update of
 3517			 * the pte involves taking the ptl, clearing the pte so
 3518			 * that we don't have concurrent modification by hardware,
 3519			 * and then writing the update.
3520 */
3521 if (unlikely(pte_none(*vmf->pte)))
3522 ret = VM_FAULT_SIGBUS;
3523 else
3524 ret = VM_FAULT_NOPAGE;
3525
3526 pte_unmap_unlock(vmf->pte, vmf->ptl);
3527 }
3528 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
3505 ret = do_read_fault(vmf); 3529 ret = do_read_fault(vmf);
3506 else if (!(vma->vm_flags & VM_SHARED)) 3530 else if (!(vma->vm_flags & VM_SHARED))
3507 ret = do_cow_fault(vmf); 3531 ret = do_cow_fault(vmf);
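As a reference point for the errno-to-vm_fault_t conversion in the hunks above, here is a minimal sketch of what a caller looks like once the helpers return vm_fault_t themselves. The "mydev" structure and fault handler are invented for illustration; only vmf_insert_pfn() and the VM_FAULT_* codes come from the change.

#include <linux/mm.h>

/* All "mydev" names are invented for this illustration. */
struct mydev {
        unsigned long base_pfn;
};

static vm_fault_t mydev_vm_fault(struct vm_fault *vmf)
{
        struct mydev *dev = vmf->vma->vm_private_data;

        /*
         * vmf_insert_pfn() now returns a vm_fault_t (VM_FAULT_NOPAGE on
         * success, VM_FAULT_SIGBUS or VM_FAULT_OOM on failure), so the
         * handler returns it directly instead of translating an errno
         * the way callers of vm_insert_pfn() had to.
         */
        return vmf_insert_pfn(vmf->vma, vmf->address,
                              dev->base_pfn + vmf->pgoff);
}

static const struct vm_operations_struct mydev_vm_ops = {
        .fault = mydev_vm_fault,
};

The errno translation now lives in one place, __vm_insert_mixed() above, rather than being repeated in every driver.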
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 38d94b703e9d..7e6509a53d79 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -687,62 +687,19 @@ static void node_states_check_changes_online(unsigned long nr_pages,
687 struct zone *zone, struct memory_notify *arg) 687 struct zone *zone, struct memory_notify *arg)
688{ 688{
689 int nid = zone_to_nid(zone); 689 int nid = zone_to_nid(zone);
690 enum zone_type zone_last = ZONE_NORMAL;
691 690
692 /* 691 arg->status_change_nid = -1;
693 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 692 arg->status_change_nid_normal = -1;
694 * contains nodes which have zones of 0...ZONE_NORMAL, 693 arg->status_change_nid_high = -1;
695 * set zone_last to ZONE_NORMAL.
696 *
697 * If we don't have HIGHMEM nor movable node,
698 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
699 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
700 */
701 if (N_MEMORY == N_NORMAL_MEMORY)
702 zone_last = ZONE_MOVABLE;
703 694
704 /* 695 if (!node_state(nid, N_MEMORY))
705 * if the memory to be online is in a zone of 0...zone_last, and 696 arg->status_change_nid = nid;
706 * the zones of 0...zone_last don't have memory before online, we will 697 if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
707 * need to set the node to node_states[N_NORMAL_MEMORY] after
708 * the memory is online.
709 */
710 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
711 arg->status_change_nid_normal = nid; 698 arg->status_change_nid_normal = nid;
712 else
713 arg->status_change_nid_normal = -1;
714
715#ifdef CONFIG_HIGHMEM 699#ifdef CONFIG_HIGHMEM
716 /* 700 if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
717 * If we have movable node, node_states[N_HIGH_MEMORY]
718 * contains nodes which have zones of 0...ZONE_HIGHMEM,
719 * set zone_last to ZONE_HIGHMEM.
720 *
721 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
722 * contains nodes which have zones of 0...ZONE_MOVABLE,
723 * set zone_last to ZONE_MOVABLE.
724 */
725 zone_last = ZONE_HIGHMEM;
726 if (N_MEMORY == N_HIGH_MEMORY)
727 zone_last = ZONE_MOVABLE;
728
729 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
730 arg->status_change_nid_high = nid; 701 arg->status_change_nid_high = nid;
731 else
732 arg->status_change_nid_high = -1;
733#else
734 arg->status_change_nid_high = arg->status_change_nid_normal;
735#endif 702#endif
736
737 /*
738 * if the node don't have memory befor online, we will need to
739 * set the node to node_states[N_MEMORY] after the memory
740 * is online.
741 */
742 if (!node_state(nid, N_MEMORY))
743 arg->status_change_nid = nid;
744 else
745 arg->status_change_nid = -1;
746} 703}
747 704
748static void node_states_set_node(int node, struct memory_notify *arg) 705static void node_states_set_node(int node, struct memory_notify *arg)
@@ -753,7 +710,8 @@ static void node_states_set_node(int node, struct memory_notify *arg)
753 if (arg->status_change_nid_high >= 0) 710 if (arg->status_change_nid_high >= 0)
754 node_set_state(node, N_HIGH_MEMORY); 711 node_set_state(node, N_HIGH_MEMORY);
755 712
756 node_set_state(node, N_MEMORY); 713 if (arg->status_change_nid >= 0)
714 node_set_state(node, N_MEMORY);
757} 715}
758 716
759static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, 717static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
@@ -1505,75 +1463,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
1505{ 1463{
1506 struct pglist_data *pgdat = zone->zone_pgdat; 1464 struct pglist_data *pgdat = zone->zone_pgdat;
1507 unsigned long present_pages = 0; 1465 unsigned long present_pages = 0;
1508 enum zone_type zt, zone_last = ZONE_NORMAL; 1466 enum zone_type zt;
1509 1467
1510 /* 1468 arg->status_change_nid = -1;
1511 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1469 arg->status_change_nid_normal = -1;
1512 * contains nodes which have zones of 0...ZONE_NORMAL, 1470 arg->status_change_nid_high = -1;
1513 * set zone_last to ZONE_NORMAL.
1514 *
1515 * If we don't have HIGHMEM nor movable node,
1516 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1517 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1518 */
1519 if (N_MEMORY == N_NORMAL_MEMORY)
1520 zone_last = ZONE_MOVABLE;
1521 1471
1522 /* 1472 /*
1523 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1473 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
1524 * If the memory to be offline is in a zone of 0...zone_last, 1474 * If the memory to be offline is within the range
1525 * and it is the last present memory, 0...zone_last will 1475 * [0..ZONE_NORMAL], and it is the last present memory there,
1526 * become empty after offline , thus we can determind we will 1476 * the zones in that range will become empty after the offlining,
1527 * need to clear the node from node_states[N_NORMAL_MEMORY]. 1477 * thus we can determine that we need to clear the node from
1478 * node_states[N_NORMAL_MEMORY].
1528 */ 1479 */
1529 for (zt = 0; zt <= zone_last; zt++) 1480 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1530 present_pages += pgdat->node_zones[zt].present_pages; 1481 present_pages += pgdat->node_zones[zt].present_pages;
1531 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1482 if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
1532 arg->status_change_nid_normal = zone_to_nid(zone); 1483 arg->status_change_nid_normal = zone_to_nid(zone);
1533 else
1534 arg->status_change_nid_normal = -1;
1535 1484
1536#ifdef CONFIG_HIGHMEM 1485#ifdef CONFIG_HIGHMEM
1537 /* 1486 /*
1538 * If we have movable node, node_states[N_HIGH_MEMORY] 1487 * node_states[N_HIGH_MEMORY] contains nodes which
1539 * contains nodes which have zones of 0...ZONE_HIGHMEM, 1488 * have normal memory or high memory.
1540 * set zone_last to ZONE_HIGHMEM. 1489 * Here we add the present_pages belonging to ZONE_HIGHMEM.
 1541	 *						 1490	 * If the zone is within the range of [0..ZONE_HIGHMEM], and
1542 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1491 * we determine that the zones in that range become empty,
1543 * contains nodes which have zones of 0...ZONE_MOVABLE, 1492 * we need to clear the node for N_HIGH_MEMORY.
1544 * set zone_last to ZONE_MOVABLE.
1545 */ 1493 */
1546 zone_last = ZONE_HIGHMEM; 1494 present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1547 if (N_MEMORY == N_HIGH_MEMORY) 1495 if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
1548 zone_last = ZONE_MOVABLE;
1549
1550 for (; zt <= zone_last; zt++)
1551 present_pages += pgdat->node_zones[zt].present_pages;
1552 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1553 arg->status_change_nid_high = zone_to_nid(zone); 1496 arg->status_change_nid_high = zone_to_nid(zone);
1554 else
1555 arg->status_change_nid_high = -1;
1556#else
1557 arg->status_change_nid_high = arg->status_change_nid_normal;
1558#endif 1497#endif
1559 1498
1560 /* 1499 /*
 1561	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE	 1500	 * We have accounted the pages from [0..ZONE_NORMAL], and
1501 * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
1502 * as well.
1503 * Here we count the possible pages from ZONE_MOVABLE.
1504 * If after having accounted all the pages, we see that the nr_pages
1505 * to be offlined is over or equal to the accounted pages,
1506 * we know that the node will become empty, and so, we can clear
1507 * it for N_MEMORY as well.
1562 */ 1508 */
1563 zone_last = ZONE_MOVABLE; 1509 present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
1564 1510
1565 /*
1566 * check whether node_states[N_HIGH_MEMORY] will be changed
1567 * If we try to offline the last present @nr_pages from the node,
1568 * we can determind we will need to clear the node from
1569 * node_states[N_HIGH_MEMORY].
1570 */
1571 for (; zt <= zone_last; zt++)
1572 present_pages += pgdat->node_zones[zt].present_pages;
1573 if (nr_pages >= present_pages) 1511 if (nr_pages >= present_pages)
1574 arg->status_change_nid = zone_to_nid(zone); 1512 arg->status_change_nid = zone_to_nid(zone);
1575 else
1576 arg->status_change_nid = -1;
1577} 1513}
1578 1514
1579static void node_states_clear_node(int node, struct memory_notify *arg) 1515static void node_states_clear_node(int node, struct memory_notify *arg)
@@ -1581,12 +1517,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
1581 if (arg->status_change_nid_normal >= 0) 1517 if (arg->status_change_nid_normal >= 0)
1582 node_clear_state(node, N_NORMAL_MEMORY); 1518 node_clear_state(node, N_NORMAL_MEMORY);
1583 1519
1584 if ((N_MEMORY != N_NORMAL_MEMORY) && 1520 if (arg->status_change_nid_high >= 0)
1585 (arg->status_change_nid_high >= 0))
1586 node_clear_state(node, N_HIGH_MEMORY); 1521 node_clear_state(node, N_HIGH_MEMORY);
1587 1522
1588 if ((N_MEMORY != N_HIGH_MEMORY) && 1523 if (arg->status_change_nid >= 0)
1589 (arg->status_change_nid >= 0))
1590 node_clear_state(node, N_MEMORY); 1524 node_clear_state(node, N_MEMORY);
1591} 1525}
1592 1526
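The status_change_nid* fields filled in by the simplified node_states_check_changes_online()/_offline() helpers above are what memory hotplug notifiers see. A hedged sketch of such a consumer follows; the notifier itself is invented, while register_memory_notifier(), MEM_ONLINE and struct memory_notify are existing kernel interfaces.

#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_mem_notifier(struct notifier_block *nb,
                                unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;

        /* -1 means "no node-level state change"; >= 0 names the node. */
        if (action == MEM_ONLINE && mn->status_change_nid >= 0)
                pr_info("node %d now has memory\n", mn->status_change_nid);

        return NOTIFY_OK;
}

static struct notifier_block example_mem_nb = {
        .notifier_call = example_mem_notifier,
};

/* Registered from __init or module code: register_memory_notifier(&example_mem_nb); */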
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index da858f794eb6..cfd26d7e61a1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
797 } 797 }
798} 798}
799 799
800static int lookup_node(unsigned long addr) 800static int lookup_node(struct mm_struct *mm, unsigned long addr)
801{ 801{
802 struct page *p; 802 struct page *p;
803 int err; 803 int err;
804 804
805 err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); 805 int locked = 1;
806 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
806 if (err >= 0) { 807 if (err >= 0) {
807 err = page_to_nid(p); 808 err = page_to_nid(p);
808 put_page(p); 809 put_page(p);
809 } 810 }
811 if (locked)
812 up_read(&mm->mmap_sem);
810 return err; 813 return err;
811} 814}
812 815
@@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
817 int err; 820 int err;
818 struct mm_struct *mm = current->mm; 821 struct mm_struct *mm = current->mm;
819 struct vm_area_struct *vma = NULL; 822 struct vm_area_struct *vma = NULL;
820 struct mempolicy *pol = current->mempolicy; 823 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
821 824
822 if (flags & 825 if (flags &
823 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 826 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
857 860
858 if (flags & MPOL_F_NODE) { 861 if (flags & MPOL_F_NODE) {
859 if (flags & MPOL_F_ADDR) { 862 if (flags & MPOL_F_ADDR) {
860 err = lookup_node(addr); 863 /*
864 * Take a refcount on the mpol, lookup_node()
  865		 * will drop the mmap_sem, so after calling
866 * lookup_node() only "pol" remains valid, "vma"
867 * is stale.
868 */
869 pol_refcount = pol;
870 vma = NULL;
871 mpol_get(pol);
872 err = lookup_node(mm, addr);
861 if (err < 0) 873 if (err < 0)
862 goto out; 874 goto out;
863 *policy = err; 875 *policy = err;
@@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
892 out: 904 out:
893 mpol_cond_put(pol); 905 mpol_cond_put(pol);
894 if (vma) 906 if (vma)
895 up_read(&current->mm->mmap_sem); 907 up_read(&mm->mmap_sem);
908 if (pol_refcount)
909 mpol_put(pol_refcount);
896 return err; 910 return err;
897} 911}
898 912
@@ -2697,12 +2711,11 @@ static const char * const policy_modes[] =
2697int mpol_parse_str(char *str, struct mempolicy **mpol) 2711int mpol_parse_str(char *str, struct mempolicy **mpol)
2698{ 2712{
2699 struct mempolicy *new = NULL; 2713 struct mempolicy *new = NULL;
2700 unsigned short mode;
2701 unsigned short mode_flags; 2714 unsigned short mode_flags;
2702 nodemask_t nodes; 2715 nodemask_t nodes;
2703 char *nodelist = strchr(str, ':'); 2716 char *nodelist = strchr(str, ':');
2704 char *flags = strchr(str, '='); 2717 char *flags = strchr(str, '=');
2705 int err = 1; 2718 int err = 1, mode;
2706 2719
2707 if (nodelist) { 2720 if (nodelist) {
2708 /* NUL-terminate mode or flags string */ 2721 /* NUL-terminate mode or flags string */
@@ -2717,12 +2730,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
2717 if (flags) 2730 if (flags)
2718 *flags++ = '\0'; /* terminate mode string */ 2731 *flags++ = '\0'; /* terminate mode string */
2719 2732
2720 for (mode = 0; mode < MPOL_MAX; mode++) { 2733 mode = match_string(policy_modes, MPOL_MAX, str);
2721 if (!strcmp(str, policy_modes[mode])) { 2734 if (mode < 0)
2722 break;
2723 }
2724 }
2725 if (mode >= MPOL_MAX)
2726 goto out; 2735 goto out;
2727 2736
2728 switch (mode) { 2737 switch (mode) {
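The mpol_parse_str() hunk above swaps an open-coded strcmp() loop for match_string(), which returns the index of the matching entry or -EINVAL when nothing matches, so the result feeds straight into the existing "mode < 0" check. A small sketch of the helper's contract, with an invented table:

#include <linux/kernel.h>
#include <linux/string.h>

static const char * const colours[] = { "red", "green", "blue" };

static int parse_colour(const char *str)
{
        /* Returns 0..2 on a match, -EINVAL otherwise. */
        return match_string(colours, ARRAY_SIZE(colours), str);
}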
diff --git a/mm/migrate.c b/mm/migrate.c
index 84381b55b2bd..b6700f2962f3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
685 SetPageActive(newpage); 685 SetPageActive(newpage);
686 } else if (TestClearPageUnevictable(page)) 686 } else if (TestClearPageUnevictable(page))
687 SetPageUnevictable(newpage); 687 SetPageUnevictable(newpage);
688 if (PageWorkingset(page))
689 SetPageWorkingset(newpage);
688 if (PageChecked(page)) 690 if (PageChecked(page))
689 SetPageChecked(newpage); 691 SetPageChecked(newpage);
690 if (PageMappedToDisk(page)) 692 if (PageMappedToDisk(page))
@@ -1973,8 +1975,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1973 int isolated = 0; 1975 int isolated = 0;
1974 struct page *new_page = NULL; 1976 struct page *new_page = NULL;
1975 int page_lru = page_is_file_cache(page); 1977 int page_lru = page_is_file_cache(page);
1976 unsigned long mmun_start = address & HPAGE_PMD_MASK; 1978 unsigned long start = address & HPAGE_PMD_MASK;
1977 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
1978 1979
1979 new_page = alloc_pages_node(node, 1980 new_page = alloc_pages_node(node,
1980 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), 1981 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
@@ -1997,15 +1998,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1997 /* anon mapping, we can simply copy page->mapping to the new page: */ 1998 /* anon mapping, we can simply copy page->mapping to the new page: */
1998 new_page->mapping = page->mapping; 1999 new_page->mapping = page->mapping;
1999 new_page->index = page->index; 2000 new_page->index = page->index;
2001 /* flush the cache before copying using the kernel virtual address */
2002 flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
2000 migrate_page_copy(new_page, page); 2003 migrate_page_copy(new_page, page);
2001 WARN_ON(PageLRU(new_page)); 2004 WARN_ON(PageLRU(new_page));
2002 2005
2003 /* Recheck the target PMD */ 2006 /* Recheck the target PMD */
2004 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2005 ptl = pmd_lock(mm, pmd); 2007 ptl = pmd_lock(mm, pmd);
2006 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { 2008 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
2007 spin_unlock(ptl); 2009 spin_unlock(ptl);
2008 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2009 2010
2010 /* Reverse changes made by migrate_page_copy() */ 2011 /* Reverse changes made by migrate_page_copy() */
2011 if (TestClearPageActive(new_page)) 2012 if (TestClearPageActive(new_page))
@@ -2029,16 +2030,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
2029 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 2030 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
2030 2031
2031 /* 2032 /*
2032 * Clear the old entry under pagetable lock and establish the new PTE. 2033 * Overwrite the old entry under pagetable lock and establish
2033 * Any parallel GUP will either observe the old page blocking on the 2034 * the new PTE. Any parallel GUP will either observe the old
2034 * page lock, block on the page table lock or observe the new page. 2035 * page blocking on the page lock, block on the page table
2035 * The SetPageUptodate on the new page and page_add_new_anon_rmap 2036 * lock or observe the new page. The SetPageUptodate on the
2036 * guarantee the copy is visible before the pagetable update. 2037 * new page and page_add_new_anon_rmap guarantee the copy is
2038 * visible before the pagetable update.
2037 */ 2039 */
2038 flush_cache_range(vma, mmun_start, mmun_end); 2040 page_add_anon_rmap(new_page, vma, start, true);
2039 page_add_anon_rmap(new_page, vma, mmun_start, true); 2041 /*
2040 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); 2042 * At this point the pmd is numa/protnone (i.e. non present) and the TLB
2041 set_pmd_at(mm, mmun_start, pmd, entry); 2043 * has already been flushed globally. So no TLB can be currently
2044 * caching this non present pmd mapping. There's no need to clear the
2045 * pmd before doing set_pmd_at(), nor to flush the TLB after
2046 * set_pmd_at(). Clearing the pmd here would introduce a race
2047 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
2048 * mmap_sem for reading. If the pmd is set to NULL at any given time,
2049 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
2050 * pmd.
2051 */
2052 set_pmd_at(mm, start, pmd, entry);
2042 update_mmu_cache_pmd(vma, address, &entry); 2053 update_mmu_cache_pmd(vma, address, &entry);
2043 2054
2044 page_ref_unfreeze(page, 2); 2055 page_ref_unfreeze(page, 2);
@@ -2047,11 +2058,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
2047 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 2058 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
2048 2059
2049 spin_unlock(ptl); 2060 spin_unlock(ptl);
2050 /*
2051 * No need to double call mmu_notifier->invalidate_range() callback as
2052 * the above pmdp_huge_clear_flush_notify() did already call it.
2053 */
2054 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
2055 2061
2056 /* Take an "isolate" reference and put new page on the LRU. */ 2062 /* Take an "isolate" reference and put new page on the LRU. */
2057 get_page(new_page); 2063 get_page(new_page);
@@ -2075,7 +2081,7 @@ out_fail:
2075 ptl = pmd_lock(mm, pmd); 2081 ptl = pmd_lock(mm, pmd);
2076 if (pmd_same(*pmd, entry)) { 2082 if (pmd_same(*pmd, entry)) {
2077 entry = pmd_modify(entry, vma->vm_page_prot); 2083 entry = pmd_modify(entry, vma->vm_page_prot);
2078 set_pmd_at(mm, mmun_start, pmd, entry); 2084 set_pmd_at(mm, start, pmd, entry);
2079 update_mmu_cache_pmd(vma, address, &entry); 2085 update_mmu_cache_pmd(vma, address, &entry);
2080 } 2086 }
2081 spin_unlock(ptl); 2087 spin_unlock(ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index f7cd9cb966c0..6c04292e16a7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -191,16 +191,19 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
191SYSCALL_DEFINE1(brk, unsigned long, brk) 191SYSCALL_DEFINE1(brk, unsigned long, brk)
192{ 192{
193 unsigned long retval; 193 unsigned long retval;
194 unsigned long newbrk, oldbrk; 194 unsigned long newbrk, oldbrk, origbrk;
195 struct mm_struct *mm = current->mm; 195 struct mm_struct *mm = current->mm;
196 struct vm_area_struct *next; 196 struct vm_area_struct *next;
197 unsigned long min_brk; 197 unsigned long min_brk;
198 bool populate; 198 bool populate;
199 bool downgraded = false;
199 LIST_HEAD(uf); 200 LIST_HEAD(uf);
200 201
201 if (down_write_killable(&mm->mmap_sem)) 202 if (down_write_killable(&mm->mmap_sem))
202 return -EINTR; 203 return -EINTR;
203 204
205 origbrk = mm->brk;
206
204#ifdef CONFIG_COMPAT_BRK 207#ifdef CONFIG_COMPAT_BRK
205 /* 208 /*
206 * CONFIG_COMPAT_BRK can still be overridden by setting 209 * CONFIG_COMPAT_BRK can still be overridden by setting
@@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
229 232
230 newbrk = PAGE_ALIGN(brk); 233 newbrk = PAGE_ALIGN(brk);
231 oldbrk = PAGE_ALIGN(mm->brk); 234 oldbrk = PAGE_ALIGN(mm->brk);
232 if (oldbrk == newbrk) 235 if (oldbrk == newbrk) {
233 goto set_brk; 236 mm->brk = brk;
237 goto success;
238 }
234 239
235 /* Always allow shrinking brk. */ 240 /*
241 * Always allow shrinking brk.
242 * __do_munmap() may downgrade mmap_sem to read.
243 */
236 if (brk <= mm->brk) { 244 if (brk <= mm->brk) {
237 if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) 245 int ret;
238 goto set_brk; 246
239 goto out; 247 /*
  248		 * mm->brk must be protected by write mmap_sem, so update it
249 * before downgrading mmap_sem. When __do_munmap() fails,
250 * mm->brk will be restored from origbrk.
251 */
252 mm->brk = brk;
253 ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
254 if (ret < 0) {
255 mm->brk = origbrk;
256 goto out;
257 } else if (ret == 1) {
258 downgraded = true;
259 }
260 goto success;
240 } 261 }
241 262
242 /* Check against existing mmap mappings. */ 263 /* Check against existing mmap mappings. */
@@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
247 /* Ok, looks good - let it rip. */ 268 /* Ok, looks good - let it rip. */
248 if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) 269 if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
249 goto out; 270 goto out;
250
251set_brk:
252 mm->brk = brk; 271 mm->brk = brk;
272
273success:
253 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; 274 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
254 up_write(&mm->mmap_sem); 275 if (downgraded)
276 up_read(&mm->mmap_sem);
277 else
278 up_write(&mm->mmap_sem);
255 userfaultfd_unmap_complete(mm, &uf); 279 userfaultfd_unmap_complete(mm, &uf);
256 if (populate) 280 if (populate)
257 mm_populate(oldbrk, newbrk - oldbrk); 281 mm_populate(oldbrk, newbrk - oldbrk);
258 return brk; 282 return brk;
259 283
260out: 284out:
261 retval = mm->brk; 285 retval = origbrk;
262 up_write(&mm->mmap_sem); 286 up_write(&mm->mmap_sem);
263 return retval; 287 return retval;
264} 288}
@@ -2687,8 +2711,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2687 * work. This now handles partial unmappings. 2711 * work. This now handles partial unmappings.
2688 * Jeremy Fitzhardinge <jeremy@goop.org> 2712 * Jeremy Fitzhardinge <jeremy@goop.org>
2689 */ 2713 */
2690int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, 2714int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2691 struct list_head *uf) 2715 struct list_head *uf, bool downgrade)
2692{ 2716{
2693 unsigned long end; 2717 unsigned long end;
2694 struct vm_area_struct *vma, *prev, *last; 2718 struct vm_area_struct *vma, *prev, *last;
@@ -2770,25 +2794,38 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2770 mm->locked_vm -= vma_pages(tmp); 2794 mm->locked_vm -= vma_pages(tmp);
2771 munlock_vma_pages_all(tmp); 2795 munlock_vma_pages_all(tmp);
2772 } 2796 }
2797
2773 tmp = tmp->vm_next; 2798 tmp = tmp->vm_next;
2774 } 2799 }
2775 } 2800 }
2776 2801
2777 /* 2802 /* Detach vmas from rbtree */
2778 * Remove the vma's, and unmap the actual pages
2779 */
2780 detach_vmas_to_be_unmapped(mm, vma, prev, end); 2803 detach_vmas_to_be_unmapped(mm, vma, prev, end);
2781 unmap_region(mm, vma, prev, start, end);
2782 2804
2805 /*
2806 * mpx unmap needs to be called with mmap_sem held for write.
2807 * It is safe to call it before unmap_region().
2808 */
2783 arch_unmap(mm, vma, start, end); 2809 arch_unmap(mm, vma, start, end);
2784 2810
2811 if (downgrade)
2812 downgrade_write(&mm->mmap_sem);
2813
2814 unmap_region(mm, vma, prev, start, end);
2815
2785 /* Fix up all other VM information */ 2816 /* Fix up all other VM information */
2786 remove_vma_list(mm, vma); 2817 remove_vma_list(mm, vma);
2787 2818
2788 return 0; 2819 return downgrade ? 1 : 0;
2789} 2820}
2790 2821
2791int vm_munmap(unsigned long start, size_t len) 2822int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2823 struct list_head *uf)
2824{
2825 return __do_munmap(mm, start, len, uf, false);
2826}
2827
2828static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
2792{ 2829{
2793 int ret; 2830 int ret;
2794 struct mm_struct *mm = current->mm; 2831 struct mm_struct *mm = current->mm;
@@ -2797,17 +2834,32 @@ int vm_munmap(unsigned long start, size_t len)
2797 if (down_write_killable(&mm->mmap_sem)) 2834 if (down_write_killable(&mm->mmap_sem))
2798 return -EINTR; 2835 return -EINTR;
2799 2836
2800 ret = do_munmap(mm, start, len, &uf); 2837 ret = __do_munmap(mm, start, len, &uf, downgrade);
2801 up_write(&mm->mmap_sem); 2838 /*
2839 * Returning 1 indicates mmap_sem is downgraded.
 2840	 * But 1 is not a legal return value of vm_munmap() and munmap(); reset
 2841	 * it to 0 before returning.
2842 */
2843 if (ret == 1) {
2844 up_read(&mm->mmap_sem);
2845 ret = 0;
2846 } else
2847 up_write(&mm->mmap_sem);
2848
2802 userfaultfd_unmap_complete(mm, &uf); 2849 userfaultfd_unmap_complete(mm, &uf);
2803 return ret; 2850 return ret;
2804} 2851}
2852
2853int vm_munmap(unsigned long start, size_t len)
2854{
2855 return __vm_munmap(start, len, false);
2856}
2805EXPORT_SYMBOL(vm_munmap); 2857EXPORT_SYMBOL(vm_munmap);
2806 2858
2807SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 2859SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2808{ 2860{
2809 profile_munmap(addr); 2861 profile_munmap(addr);
2810 return vm_munmap(addr, len); 2862 return __vm_munmap(addr, len, true);
2811} 2863}
2812 2864
2813 2865
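All three converted callers (brk above, munmap via __vm_munmap(), and the shrinking mremap case below) follow the same contract: __do_munmap() returns a negative errno on failure, 0 with mmap_sem still held for write, or 1 after it has already downgraded mmap_sem to read. A condensed sketch of that caller-side pattern; the wrapper function is invented, the calls inside it are the ones used in the patch.

#include <linux/mm.h>
#include <linux/list.h>
#include <linux/userfaultfd_k.h>

static int unmap_with_downgrade(struct mm_struct *mm,
                                unsigned long start, size_t len)
{
        LIST_HEAD(uf);
        int ret;

        if (down_write_killable(&mm->mmap_sem))
                return -EINTR;

        ret = __do_munmap(mm, start, len, &uf, true);
        if (ret == 1) {
                /* The expensive page zapping ran under the read lock. */
                up_read(&mm->mmap_sem);
                ret = 0;
        } else {
                up_write(&mm->mmap_sem);
        }

        userfaultfd_unmap_complete(mm, &uf);
        return ret;
}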
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 82bb1a939c0e..5119ff846769 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
247} 247}
248EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); 248EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
249 249
250/*
251 * Must be called while holding mm->mmap_sem for either read or write.
252 * The result is guaranteed to be valid until mm->mmap_sem is dropped.
253 */
254bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
255{
256 struct mmu_notifier *mn;
257 int id;
258 bool ret = false;
259
260 WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
261
262 if (!mm_has_notifiers(mm))
263 return ret;
264
265 id = srcu_read_lock(&srcu);
266 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
267 if (!mn->ops->invalidate_range &&
268 !mn->ops->invalidate_range_start &&
269 !mn->ops->invalidate_range_end)
270 continue;
271
272 if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
273 ret = true;
274 break;
275 }
276 }
277 srcu_read_unlock(&srcu, id);
278 return ret;
279}
280
281static int do_mmu_notifier_register(struct mmu_notifier *mn, 250static int do_mmu_notifier_register(struct mmu_notifier *mn,
282 struct mm_struct *mm, 251 struct mm_struct *mm,
283 int take_mmap_sem) 252 int take_mmap_sem)
diff --git a/mm/mremap.c b/mm/mremap.c
index a9617e72e6b7..7f9f9180e401 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -521,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
521 unsigned long ret = -EINVAL; 521 unsigned long ret = -EINVAL;
522 unsigned long charged = 0; 522 unsigned long charged = 0;
523 bool locked = false; 523 bool locked = false;
524 bool downgraded = false;
524 struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; 525 struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
525 LIST_HEAD(uf_unmap_early); 526 LIST_HEAD(uf_unmap_early);
526 LIST_HEAD(uf_unmap); 527 LIST_HEAD(uf_unmap);
@@ -557,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
557 /* 558 /*
558 * Always allow a shrinking remap: that just unmaps 559 * Always allow a shrinking remap: that just unmaps
559 * the unnecessary pages.. 560 * the unnecessary pages..
560 * do_munmap does all the needed commit accounting 561 * __do_munmap does all the needed commit accounting, and
562 * downgrades mmap_sem to read if so directed.
561 */ 563 */
562 if (old_len >= new_len) { 564 if (old_len >= new_len) {
563 ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); 565 int retval;
564 if (ret && old_len != new_len) 566
567 retval = __do_munmap(mm, addr+new_len, old_len - new_len,
568 &uf_unmap, true);
569 if (retval < 0 && old_len != new_len) {
570 ret = retval;
565 goto out; 571 goto out;
572 /* Returning 1 indicates mmap_sem is downgraded to read. */
573 } else if (retval == 1)
574 downgraded = true;
566 ret = addr; 575 ret = addr;
567 goto out; 576 goto out;
568 } 577 }
@@ -627,7 +636,10 @@ out:
627 vm_unacct_memory(charged); 636 vm_unacct_memory(charged);
628 locked = 0; 637 locked = 0;
629 } 638 }
630 up_write(&current->mm->mmap_sem); 639 if (downgraded)
640 up_read(&current->mm->mmap_sem);
641 else
642 up_write(&current->mm->mmap_sem);
631 if (locked && new_len > old_len) 643 if (locked && new_len > old_len)
632 mm_populate(new_addr + old_len, new_len - old_len); 644 mm_populate(new_addr + old_len, new_len - old_len);
633 userfaultfd_unmap_complete(mm, &uf_unmap_early); 645 userfaultfd_unmap_complete(mm, &uf_unmap_early);
diff --git a/mm/nommu.c b/mm/nommu.c
index e4aac33216ae..749276beb109 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1709 return ret; 1709 return ret;
1710} 1710}
1711 1711
1712struct page *follow_page_mask(struct vm_area_struct *vma, 1712struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1713 unsigned long address, unsigned int flags, 1713 unsigned int foll_flags)
1714 unsigned int *page_mask)
1715{ 1714{
1716 *page_mask = 0;
1717 return NULL; 1715 return NULL;
1718} 1716}
1719 1717
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 84ae9bf5858a..439a304a6c92 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2149,6 +2149,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
2149 * not miss some pages (e.g., because some other process has cleared TOWRITE 2149 * not miss some pages (e.g., because some other process has cleared TOWRITE
2150 * tag we set). The rule we follow is that TOWRITE tag can be cleared only 2150 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
2151 * by the process clearing the DIRTY tag (and submitting the page for IO). 2151 * by the process clearing the DIRTY tag (and submitting the page for IO).
2152 *
2153 * To avoid deadlocks between range_cyclic writeback and callers that hold
2154 * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
2155 * we do not loop back to the start of the file. Doing so causes a page
2156 * lock/page writeback access order inversion - we should only ever lock
2157 * multiple pages in ascending page->index order, and looping back to the start
2158 * of the file violates that rule and causes deadlocks.
2152 */ 2159 */
2153int write_cache_pages(struct address_space *mapping, 2160int write_cache_pages(struct address_space *mapping,
2154 struct writeback_control *wbc, writepage_t writepage, 2161 struct writeback_control *wbc, writepage_t writepage,
@@ -2162,7 +2169,6 @@ int write_cache_pages(struct address_space *mapping,
2162 pgoff_t index; 2169 pgoff_t index;
2163 pgoff_t end; /* Inclusive */ 2170 pgoff_t end; /* Inclusive */
2164 pgoff_t done_index; 2171 pgoff_t done_index;
2165 int cycled;
2166 int range_whole = 0; 2172 int range_whole = 0;
2167 int tag; 2173 int tag;
2168 2174
@@ -2170,23 +2176,17 @@ int write_cache_pages(struct address_space *mapping,
2170 if (wbc->range_cyclic) { 2176 if (wbc->range_cyclic) {
2171 writeback_index = mapping->writeback_index; /* prev offset */ 2177 writeback_index = mapping->writeback_index; /* prev offset */
2172 index = writeback_index; 2178 index = writeback_index;
2173 if (index == 0)
2174 cycled = 1;
2175 else
2176 cycled = 0;
2177 end = -1; 2179 end = -1;
2178 } else { 2180 } else {
2179 index = wbc->range_start >> PAGE_SHIFT; 2181 index = wbc->range_start >> PAGE_SHIFT;
2180 end = wbc->range_end >> PAGE_SHIFT; 2182 end = wbc->range_end >> PAGE_SHIFT;
2181 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2183 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2182 range_whole = 1; 2184 range_whole = 1;
2183 cycled = 1; /* ignore range_cyclic tests */
2184 } 2185 }
2185 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2186 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2186 tag = PAGECACHE_TAG_TOWRITE; 2187 tag = PAGECACHE_TAG_TOWRITE;
2187 else 2188 else
2188 tag = PAGECACHE_TAG_DIRTY; 2189 tag = PAGECACHE_TAG_DIRTY;
2189retry:
2190 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2190 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2191 tag_pages_for_writeback(mapping, index, end); 2191 tag_pages_for_writeback(mapping, index, end);
2192 done_index = index; 2192 done_index = index;
@@ -2272,17 +2272,14 @@ continue_unlock:
2272 pagevec_release(&pvec); 2272 pagevec_release(&pvec);
2273 cond_resched(); 2273 cond_resched();
2274 } 2274 }
2275 if (!cycled && !done) { 2275
2276 /* 2276 /*
2277 * range_cyclic: 2277 * If we hit the last page and there is more work to be done: wrap
 2278	 * We hit the last page and there is more work to be done: wrap	 2278	 * the index back to the start of the file for the next
2279 * back to the start of the file 2279 * time we are called.
2280 */ 2280 */
2281 cycled = 1; 2281 if (wbc->range_cyclic && !done)
2282 index = 0; 2282 done_index = 0;
2283 end = writeback_index - 1;
2284 goto retry;
2285 }
2286 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2283 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2287 mapping->writeback_index = done_index; 2284 mapping->writeback_index = done_index;
2288 2285
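The comment added in the first page-writeback.c hunk above documents why a single write_cache_pages() call no longer wraps around inside the file; the wrap now happens only through mapping->writeback_index on the next invocation. For context, a hedged sketch of the typical caller, a filesystem ->writepages() that simply delegates; the "examplefs" names are invented.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

/* Invented per-page callback; a real filesystem supplies its own. */
static int examplefs_writepage_cb(struct page *page,
                                  struct writeback_control *wbc, void *data)
{
        struct address_space *mapping = data;

        return mapping->a_ops->writepage(page, wbc);
}

static int examplefs_writepages(struct address_space *mapping,
                                struct writeback_control *wbc)
{
        /*
         * With wbc->range_cyclic set, writeback starts at
         * mapping->writeback_index and now stops at end of file; the
         * saved writeback_index makes the next call start from 0.
         */
        return write_cache_pages(mapping, wbc, examplefs_writepage_cb,
                                 mapping);
}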
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2ef1c17942f..863d46da6586 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
66#include <linux/ftrace.h> 66#include <linux/ftrace.h>
67#include <linux/lockdep.h> 67#include <linux/lockdep.h>
68#include <linux/nmi.h> 68#include <linux/nmi.h>
69#include <linux/psi.h>
69 70
70#include <asm/sections.h> 71#include <asm/sections.h>
71#include <asm/tlbflush.h> 72#include <asm/tlbflush.h>
@@ -306,24 +307,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
306} 307}
307 308
308/* 309/*
309 * Returns false when the remaining initialisation should be deferred until 310 * Returns true when the remaining initialisation should be deferred until
310 * later in the boot cycle when it can be parallelised. 311 * later in the boot cycle when it can be parallelised.
311 */ 312 */
312static inline bool update_defer_init(pg_data_t *pgdat, 313static bool __meminit
313 unsigned long pfn, unsigned long zone_end, 314defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
314 unsigned long *nr_initialised)
315{ 315{
316 static unsigned long prev_end_pfn, nr_initialised;
317
318 /*
  319	 * prev_end_pfn is static and contains the end of the previous zone.
320 * No need to protect because called very early in boot before smp_init.
321 */
322 if (prev_end_pfn != end_pfn) {
323 prev_end_pfn = end_pfn;
324 nr_initialised = 0;
325 }
326
316 /* Always populate low zones for address-constrained allocations */ 327 /* Always populate low zones for address-constrained allocations */
317 if (zone_end < pgdat_end_pfn(pgdat)) 328 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
318 return true;
319 (*nr_initialised)++;
320 if ((*nr_initialised > pgdat->static_init_pgcnt) &&
321 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
322 pgdat->first_deferred_pfn = pfn;
323 return false; 329 return false;
330 nr_initialised++;
331 if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
332 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
333 NODE_DATA(nid)->first_deferred_pfn = pfn;
334 return true;
324 } 335 }
325 336 return false;
326 return true;
327} 337}
328#else 338#else
329static inline bool early_page_uninitialised(unsigned long pfn) 339static inline bool early_page_uninitialised(unsigned long pfn)
@@ -331,11 +341,9 @@ static inline bool early_page_uninitialised(unsigned long pfn)
331 return false; 341 return false;
332} 342}
333 343
334static inline bool update_defer_init(pg_data_t *pgdat, 344static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
335 unsigned long pfn, unsigned long zone_end,
336 unsigned long *nr_initialised)
337{ 345{
338 return true; 346 return false;
339} 347}
340#endif 348#endif
341 349
@@ -1231,7 +1239,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1231 /* Avoid false-positive PageTail() */ 1239 /* Avoid false-positive PageTail() */
1232 INIT_LIST_HEAD(&page->lru); 1240 INIT_LIST_HEAD(&page->lru);
1233 1241
1234 SetPageReserved(page); 1242 /*
1243 * no need for atomic set_bit because the struct
1244 * page is not visible yet so nobody should
1245 * access it yet.
1246 */
1247 __SetPageReserved(page);
1235 } 1248 }
1236 } 1249 }
1237} 1250}
@@ -2015,10 +2028,6 @@ static int move_freepages(struct zone *zone,
2015 pfn_valid(page_to_pfn(end_page)) && 2028 pfn_valid(page_to_pfn(end_page)) &&
2016 page_zone(start_page) != page_zone(end_page)); 2029 page_zone(start_page) != page_zone(end_page));
2017#endif 2030#endif
2018
2019 if (num_movable)
2020 *num_movable = 0;
2021
2022 for (page = start_page; page <= end_page;) { 2031 for (page = start_page; page <= end_page;) {
2023 if (!pfn_valid_within(page_to_pfn(page))) { 2032 if (!pfn_valid_within(page_to_pfn(page))) {
2024 page++; 2033 page++;
@@ -2058,6 +2067,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
2058 unsigned long start_pfn, end_pfn; 2067 unsigned long start_pfn, end_pfn;
2059 struct page *start_page, *end_page; 2068 struct page *start_page, *end_page;
2060 2069
2070 if (num_movable)
2071 *num_movable = 0;
2072
2061 start_pfn = page_to_pfn(page); 2073 start_pfn = page_to_pfn(page);
2062 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 2074 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
2063 start_page = pfn_to_page(start_pfn); 2075 start_page = pfn_to_page(start_pfn);
@@ -3366,26 +3378,12 @@ try_this_zone:
3366 return NULL; 3378 return NULL;
3367} 3379}
3368 3380
3369/*
3370 * Large machines with many possible nodes should not always dump per-node
3371 * meminfo in irq context.
3372 */
3373static inline bool should_suppress_show_mem(void)
3374{
3375 bool ret = false;
3376
3377#if NODES_SHIFT > 8
3378 ret = in_interrupt();
3379#endif
3380 return ret;
3381}
3382
3383static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3381static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3384{ 3382{
3385 unsigned int filter = SHOW_MEM_FILTER_NODES; 3383 unsigned int filter = SHOW_MEM_FILTER_NODES;
3386 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); 3384 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3387 3385
3388 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) 3386 if (!__ratelimit(&show_mem_rs))
3389 return; 3387 return;
3390 3388
3391 /* 3389 /*
@@ -3549,15 +3547,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3549 enum compact_priority prio, enum compact_result *compact_result) 3547 enum compact_priority prio, enum compact_result *compact_result)
3550{ 3548{
3551 struct page *page; 3549 struct page *page;
3550 unsigned long pflags;
3552 unsigned int noreclaim_flag; 3551 unsigned int noreclaim_flag;
3553 3552
3554 if (!order) 3553 if (!order)
3555 return NULL; 3554 return NULL;
3556 3555
3556 psi_memstall_enter(&pflags);
3557 noreclaim_flag = memalloc_noreclaim_save(); 3557 noreclaim_flag = memalloc_noreclaim_save();
3558
3558 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3559 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3559 prio); 3560 prio);
3561
3560 memalloc_noreclaim_restore(noreclaim_flag); 3562 memalloc_noreclaim_restore(noreclaim_flag);
3563 psi_memstall_leave(&pflags);
3561 3564
3562 if (*compact_result <= COMPACT_INACTIVE) 3565 if (*compact_result <= COMPACT_INACTIVE)
3563 return NULL; 3566 return NULL;
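psi_memstall_enter()/psi_memstall_leave(), added by the PSI series in this merge, bracket the stretches where the allocating task is stalled on memory: compaction here and direct reclaim in the next hunk. The pairing pattern in isolation, around an invented helper:

#include <linux/psi.h>

static void costly_memory_work(void)
{
        unsigned long pflags;

        psi_memstall_enter(&pflags);    /* account this task as memstalled */

        /* ... compaction / reclaim style work would go here ... */

        psi_memstall_leave(&pflags);    /* pflags restores the previous state */
}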
@@ -3756,11 +3759,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3756 struct reclaim_state reclaim_state; 3759 struct reclaim_state reclaim_state;
3757 int progress; 3760 int progress;
3758 unsigned int noreclaim_flag; 3761 unsigned int noreclaim_flag;
3762 unsigned long pflags;
3759 3763
3760 cond_resched(); 3764 cond_resched();
3761 3765
3762 /* We now go into synchronous reclaim */ 3766 /* We now go into synchronous reclaim */
3763 cpuset_memory_pressure_bump(); 3767 cpuset_memory_pressure_bump();
3768 psi_memstall_enter(&pflags);
3764 fs_reclaim_acquire(gfp_mask); 3769 fs_reclaim_acquire(gfp_mask);
3765 noreclaim_flag = memalloc_noreclaim_save(); 3770 noreclaim_flag = memalloc_noreclaim_save();
3766 reclaim_state.reclaimed_slab = 0; 3771 reclaim_state.reclaimed_slab = 0;
@@ -3772,6 +3777,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3772 current->reclaim_state = NULL; 3777 current->reclaim_state = NULL;
3773 memalloc_noreclaim_restore(noreclaim_flag); 3778 memalloc_noreclaim_restore(noreclaim_flag);
3774 fs_reclaim_release(gfp_mask); 3779 fs_reclaim_release(gfp_mask);
3780 psi_memstall_leave(&pflags);
3775 3781
3776 cond_resched(); 3782 cond_resched();
3777 3783
@@ -3922,6 +3928,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3922{ 3928{
3923 struct zone *zone; 3929 struct zone *zone;
3924 struct zoneref *z; 3930 struct zoneref *z;
3931 bool ret = false;
3925 3932
3926 /* 3933 /*
3927 * Costly allocations might have made a progress but this doesn't mean 3934 * Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +3992,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3985 } 3992 }
3986 } 3993 }
3987 3994
3988 /* 3995 ret = true;
3989 * Memory allocation/reclaim might be called from a WQ 3996 goto out;
3990 * context and the current implementation of the WQ
3991 * concurrency control doesn't recognize that
3992 * a particular WQ is congested if the worker thread is
3993 * looping without ever sleeping. Therefore we have to
3994 * do a short sleep here rather than calling
3995 * cond_resched().
3996 */
3997 if (current->flags & PF_WQ_WORKER)
3998 schedule_timeout_uninterruptible(1);
3999 else
4000 cond_resched();
4001
4002 return true;
4003 } 3997 }
4004 } 3998 }
4005 3999
4006 return false; 4000out:
4001 /*
4002 * Memory allocation/reclaim might be called from a WQ context and the
4003 * current implementation of the WQ concurrency control doesn't
4004 * recognize that a particular WQ is congested if the worker thread is
4005 * looping without ever sleeping. Therefore we have to do a short sleep
4006 * here rather than calling cond_resched().
4007 */
4008 if (current->flags & PF_WQ_WORKER)
4009 schedule_timeout_uninterruptible(1);
4010 else
4011 cond_resched();
4012 return ret;
4007} 4013}
4008 4014
4009static inline bool 4015static inline bool
@@ -4701,6 +4707,7 @@ long si_mem_available(void)
4701 unsigned long pagecache; 4707 unsigned long pagecache;
4702 unsigned long wmark_low = 0; 4708 unsigned long wmark_low = 0;
4703 unsigned long pages[NR_LRU_LISTS]; 4709 unsigned long pages[NR_LRU_LISTS];
4710 unsigned long reclaimable;
4704 struct zone *zone; 4711 struct zone *zone;
4705 int lru; 4712 int lru;
4706 4713
@@ -4726,19 +4733,13 @@ long si_mem_available(void)
4726 available += pagecache; 4733 available += pagecache;
4727 4734
4728 /* 4735 /*
4729 * Part of the reclaimable slab consists of items that are in use, 4736 * Part of the reclaimable slab and other kernel memory consists of
4730 * and cannot be freed. Cap this estimate at the low watermark. 4737 * items that are in use, and cannot be freed. Cap this estimate at the
4738 * low watermark.
4731 */ 4739 */
4732 available += global_node_page_state(NR_SLAB_RECLAIMABLE) - 4740 reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
4733 min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, 4741 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
4734 wmark_low); 4742 available += reclaimable - min(reclaimable / 2, wmark_low);
4735
4736 /*
4737 * Part of the kernel memory, which can be released under memory
4738 * pressure.
4739 */
4740 available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
4741 PAGE_SHIFT;
4742 4743
4743 if (available < 0) 4744 if (available < 0)
4744 available = 0; 4745 available = 0;
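To make the new single cap concrete, a worked example with invented numbers: 1000 pages of reclaimable slab plus 200 pages of NR_KERNEL_MISC_RECLAIMABLE gives reclaimable = 1200; with a low watermark of 400 pages, the code adds 1200 - min(600, 400) = 800 pages to the estimate. The same arithmetic as a standalone sketch:

#include <linux/kernel.h>

/* Invented page counts, only to illustrate the cap. */
static unsigned long example_reclaimable_estimate(void)
{
        unsigned long slab = 1000, misc = 200, wmark_low = 400;
        unsigned long reclaimable = slab + misc;                /* 1200 */

        return reclaimable - min(reclaimable / 2, wmark_low);   /* 800 */
}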
@@ -5449,6 +5450,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
5449#endif 5450#endif
5450} 5451}
5451 5452
5453/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
5454static bool __meminit
5455overlap_memmap_init(unsigned long zone, unsigned long *pfn)
5456{
5457#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5458 static struct memblock_region *r;
5459
5460 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5461 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
5462 for_each_memblock(memory, r) {
5463 if (*pfn < memblock_region_memory_end_pfn(r))
5464 break;
5465 }
5466 }
5467 if (*pfn >= memblock_region_memory_base_pfn(r) &&
5468 memblock_is_mirror(r)) {
5469 *pfn = memblock_region_memory_end_pfn(r);
5470 return true;
5471 }
5472 }
5473#endif
5474 return false;
5475}
5476
5452/* 5477/*
5453 * Initially all pages are reserved - free ones are freed 5478 * Initially all pages are reserved - free ones are freed
5454 * up by free_all_bootmem() once the early boot process is 5479 * up by free_all_bootmem() once the early boot process is
@@ -5458,67 +5483,118 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5458 unsigned long start_pfn, enum memmap_context context, 5483 unsigned long start_pfn, enum memmap_context context,
5459 struct vmem_altmap *altmap) 5484 struct vmem_altmap *altmap)
5460{ 5485{
5461 unsigned long end_pfn = start_pfn + size; 5486 unsigned long pfn, end_pfn = start_pfn + size;
5462 pg_data_t *pgdat = NODE_DATA(nid);
5463 unsigned long pfn;
5464 unsigned long nr_initialised = 0;
5465 struct page *page; 5487 struct page *page;
5466#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5467 struct memblock_region *r = NULL, *tmp;
5468#endif
5469 5488
5470 if (highest_memmap_pfn < end_pfn - 1) 5489 if (highest_memmap_pfn < end_pfn - 1)
5471 highest_memmap_pfn = end_pfn - 1; 5490 highest_memmap_pfn = end_pfn - 1;
5472 5491
5492#ifdef CONFIG_ZONE_DEVICE
5473 /* 5493 /*
5474 * Honor reservation requested by the driver for this ZONE_DEVICE 5494 * Honor reservation requested by the driver for this ZONE_DEVICE
5475 * memory 5495 * memory. We limit the total number of pages to initialize to just
5496 * those that might contain the memory mapping. We will defer the
5497 * ZONE_DEVICE page initialization until after we have released
5498 * the hotplug lock.
5476 */ 5499 */
5477 if (altmap && start_pfn == altmap->base_pfn) 5500 if (zone == ZONE_DEVICE) {
5478 start_pfn += altmap->reserve; 5501 if (!altmap)
5502 return;
5503
5504 if (start_pfn == altmap->base_pfn)
5505 start_pfn += altmap->reserve;
5506 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
5507 }
5508#endif
5479 5509
5480 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 5510 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5481 /* 5511 /*
5482 * There can be holes in boot-time mem_map[]s handed to this 5512 * There can be holes in boot-time mem_map[]s handed to this
5483 * function. They do not exist on hotplugged memory. 5513 * function. They do not exist on hotplugged memory.
5484 */ 5514 */
5485 if (context != MEMMAP_EARLY) 5515 if (context == MEMMAP_EARLY) {
5486 goto not_early; 5516 if (!early_pfn_valid(pfn))
5487
5488 if (!early_pfn_valid(pfn))
5489 continue;
5490 if (!early_pfn_in_nid(pfn, nid))
5491 continue;
5492 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5493 break;
5494
5495#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5496 /*
5497 * Check given memblock attribute by firmware which can affect
5498 * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5499 * mirrored, it's an overlapped memmap init. skip it.
5500 */
5501 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5502 if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5503 for_each_memblock(memory, tmp)
5504 if (pfn < memblock_region_memory_end_pfn(tmp))
5505 break;
5506 r = tmp;
5507 }
5508 if (pfn >= memblock_region_memory_base_pfn(r) &&
5509 memblock_is_mirror(r)) {
5510 /* already initialized as NORMAL */
5511 pfn = memblock_region_memory_end_pfn(r);
5512 continue; 5517 continue;
5513 } 5518 if (!early_pfn_in_nid(pfn, nid))
5519 continue;
5520 if (overlap_memmap_init(zone, &pfn))
5521 continue;
5522 if (defer_init(nid, pfn, end_pfn))
5523 break;
5514 } 5524 }
5515#endif
5516 5525
5517not_early:
5518 page = pfn_to_page(pfn); 5526 page = pfn_to_page(pfn);
5519 __init_single_page(page, pfn, zone, nid); 5527 __init_single_page(page, pfn, zone, nid);
5520 if (context == MEMMAP_HOTPLUG) 5528 if (context == MEMMAP_HOTPLUG)
5521 SetPageReserved(page); 5529 __SetPageReserved(page);
5530
5531 /*
5532 * Mark the block movable so that blocks are reserved for
5533 * movable at startup. This will force kernel allocations
5534 * to reserve their blocks rather than leaking throughout
5535 * the address space during boot when many long-lived
5536 * kernel allocations are made.
5537 *
 5538		 * The bitmap is created for the zone's valid pfn range, but the
 5539		 * memmap can be created for invalid pages (for alignment), so
 5540		 * check here that we do not call set_pageblock_migratetype()
 5541		 * against a pfn outside the zone.
5542 */
5543 if (!(pfn & (pageblock_nr_pages - 1))) {
5544 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5545 cond_resched();
5546 }
5547 }
5548}
5549
5550#ifdef CONFIG_ZONE_DEVICE
5551void __ref memmap_init_zone_device(struct zone *zone,
5552 unsigned long start_pfn,
5553 unsigned long size,
5554 struct dev_pagemap *pgmap)
5555{
5556 unsigned long pfn, end_pfn = start_pfn + size;
5557 struct pglist_data *pgdat = zone->zone_pgdat;
5558 unsigned long zone_idx = zone_idx(zone);
5559 unsigned long start = jiffies;
5560 int nid = pgdat->node_id;
5561
5562 if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
5563 return;
5564
5565 /*
5566 * The call to memmap_init_zone should have already taken care
5567 * of the pages reserved for the memmap, so we can just jump to
5568 * the end of that region and start processing the device pages.
5569 */
5570 if (pgmap->altmap_valid) {
5571 struct vmem_altmap *altmap = &pgmap->altmap;
5572
5573 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
5574 size = end_pfn - start_pfn;
5575 }
5576
5577 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5578 struct page *page = pfn_to_page(pfn);
5579
5580 __init_single_page(page, pfn, zone_idx, nid);
5581
5582 /*
5583 * Mark page reserved as it will need to wait for onlining
5584 * phase for it to be fully associated with a zone.
5585 *
5586 * We can use the non-atomic __set_bit operation for setting
5587 * the flag as we are still initializing the pages.
5588 */
5589 __SetPageReserved(page);
5590
5591 /*
5592 * ZONE_DEVICE pages union ->lru with a ->pgmap back
5593 * pointer and hmm_data. It is a bug if a ZONE_DEVICE
5594 * page is ever freed or placed on a driver-private list.
5595 */
5596 page->pgmap = pgmap;
5597 page->hmm_data = 0;
5522 5598
5523 /* 5599 /*
5524 * Mark the block movable so that blocks are reserved for 5600 * Mark the block movable so that blocks are reserved for
@@ -5540,8 +5616,12 @@ not_early:
5540 cond_resched(); 5616 cond_resched();
5541 } 5617 }
5542 } 5618 }
5619
5620 pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
5621 size, jiffies_to_msecs(jiffies - start));
5543} 5622}
5544 5623
5624#endif
5545static void __meminit zone_init_free_lists(struct zone *zone) 5625static void __meminit zone_init_free_lists(struct zone *zone)
5546{ 5626{
5547 unsigned int order, t; 5627 unsigned int order, t;
@@ -5551,10 +5631,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
5551 } 5631 }
5552} 5632}
5553 5633
5554#ifndef __HAVE_ARCH_MEMMAP_INIT 5634void __meminit __weak memmap_init(unsigned long size, int nid,
5555#define memmap_init(size, nid, zone, start_pfn) \ 5635 unsigned long zone, unsigned long start_pfn)
5556 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) 5636{
5557#endif 5637 memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
5638}
5558 5639
5559static int zone_batchsize(struct zone *zone) 5640static int zone_batchsize(struct zone *zone)
5560{ 5641{
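Because memmap_init() is now a __weak function instead of an #ifdef'd macro, an architecture overrides it simply by providing a strong definition with the same signature (ia64 carries such an override in mainline). A hedged sketch of what that looks like; the body is a placeholder.

#include <linux/mm.h>
#include <linux/init.h>

/* In an architecture's mm/init.c, overriding the __weak default. */
void __meminit memmap_init(unsigned long size, int nid,
                           unsigned long zone, unsigned long start_pfn)
{
        /*
         * An arch could split the range into pieces that actually have
         * backing memory before handing each piece to the generic
         * initialiser.
         */
        memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
}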
@@ -6428,45 +6509,65 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
6428} 6509}
6429 6510
6430#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) 6511#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
6512
6513/*
6514 * Zero all valid struct pages in range [spfn, epfn), return number of struct
6515 * pages zeroed
6516 */
6517static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
6518{
6519 unsigned long pfn;
6520 u64 pgcnt = 0;
6521
6522 for (pfn = spfn; pfn < epfn; pfn++) {
6523 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6524 pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6525 + pageblock_nr_pages - 1;
6526 continue;
6527 }
6528 mm_zero_struct_page(pfn_to_page(pfn));
6529 pgcnt++;
6530 }
6531
6532 return pgcnt;
6533}
6534
6431/* 6535/*
6432 * Only struct pages that are backed by physical memory are zeroed and 6536 * Only struct pages that are backed by physical memory are zeroed and
6433 * initialized by going through __init_single_page(). But, there are some 6537 * initialized by going through __init_single_page(). But, there are some
6434 * struct pages which are reserved in memblock allocator and their fields 6538 * struct pages which are reserved in memblock allocator and their fields
6435 * may be accessed (for example page_to_pfn() on some configuration accesses 6539 * may be accessed (for example page_to_pfn() on some configuration accesses
6436 * flags). We must explicitly zero those struct pages. 6540 * flags). We must explicitly zero those struct pages.
6541 *
6542 * This function also addresses a similar issue where struct pages are left
6543 * uninitialized because the physical address range is not covered by
6544 * memblock.memory or memblock.reserved. That could happen when memblock
6545 * layout is manually configured via memmap=.
6437 */ 6546 */
6438void __init zero_resv_unavail(void) 6547void __init zero_resv_unavail(void)
6439{ 6548{
6440 phys_addr_t start, end; 6549 phys_addr_t start, end;
6441 unsigned long pfn;
6442 u64 i, pgcnt; 6550 u64 i, pgcnt;
6551 phys_addr_t next = 0;
6443 6552
6444 /* 6553 /*
6445 * Loop through ranges that are reserved, but do not have reported 6554 * Loop through unavailable ranges not covered by memblock.memory.
6446 * physical memory backing.
6447 */ 6555 */
6448 pgcnt = 0; 6556 pgcnt = 0;
6449 for_each_resv_unavail_range(i, &start, &end) { 6557 for_each_mem_range(i, &memblock.memory, NULL,
6450 for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { 6558 NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6451 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { 6559 if (next < start)
6452 pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) 6560 pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6453 + pageblock_nr_pages - 1; 6561 next = end;
6454 continue;
6455 }
6456 mm_zero_struct_page(pfn_to_page(pfn));
6457 pgcnt++;
6458 }
6459 } 6562 }
6563 pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
6460 6564
6461 /* 6565 /*
6462 * Struct pages that do not have backing memory. This could be because 6566 * Struct pages that do not have backing memory. This could be because
6463 * firmware is using some of this memory, or for some other reasons. 6567 * firmware is using some of this memory, or for some other reasons.
6464 * Once memblock is changed so such behaviour is not allowed: i.e.
6465 * list of "reserved" memory must be a subset of list of "memory", then
6466 * this code can be removed.
6467 */ 6568 */
6468 if (pgcnt) 6569 if (pgcnt)
6469 pr_info("Reserved but unavailable: %lld pages", pgcnt); 6570 pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6470} 6571}
6471#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ 6572#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
6472 6573
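
The reworked zero_resv_unavail() no longer walks a "reserved but unavailable" list; it iterates memblock.memory in order and zeroes the struct pages for every pfn gap between covered ranges, plus the tail up to max_pfn. A small userspace model of that gap walk, assuming sorted, non-overlapping ranges and counting pages instead of touching real struct pages:

/* zero_gaps.c - model of zeroing struct pages not covered by memblock.memory */
#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* Stand-in for mm_zero_struct_page(): here we only count the pages we would zero. */
static unsigned long long zero_pfn_range(unsigned long spfn, unsigned long epfn)
{
	return epfn > spfn ? epfn - spfn : 0;
}

int main(void)
{
	/* Hypothetical, sorted memory ranges (in pfns). */
	struct range memory[] = { { 16, 256 }, { 512, 1024 } };
	unsigned long max_pfn = 2048;
	unsigned long next = 0;
	unsigned long long pgcnt = 0;
	unsigned int i;

	/* Zero every pfn that falls in a gap between the covered ranges... */
	for (i = 0; i < sizeof(memory) / sizeof(memory[0]); i++) {
		if (next < memory[i].start_pfn)
			pgcnt += zero_pfn_range(next, memory[i].start_pfn);
		next = memory[i].end_pfn;
	}
	/* ...and the tail between the last range and max_pfn. */
	pgcnt += zero_pfn_range(next, max_pfn);

	printf("would zero %llu struct pages\n", pgcnt);
	return 0;
}
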
@@ -6803,15 +6904,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
6803{ 6904{
6804 enum zone_type zone_type; 6905 enum zone_type zone_type;
6805 6906
6806 if (N_MEMORY == N_NORMAL_MEMORY)
6807 return;
6808
6809 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 6907 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
6810 struct zone *zone = &pgdat->node_zones[zone_type]; 6908 struct zone *zone = &pgdat->node_zones[zone_type];
6811 if (populated_zone(zone)) { 6909 if (populated_zone(zone)) {
6812 node_set_state(nid, N_HIGH_MEMORY); 6910 if (IS_ENABLED(CONFIG_HIGHMEM))
6813 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 6911 node_set_state(nid, N_HIGH_MEMORY);
6814 zone_type <= ZONE_NORMAL) 6912 if (zone_type <= ZONE_NORMAL)
6815 node_set_state(nid, N_NORMAL_MEMORY); 6913 node_set_state(nid, N_NORMAL_MEMORY);
6816 break; 6914 break;
6817 } 6915 }
diff --git a/mm/page_io.c b/mm/page_io.c
index 573d3663d846..a451ffa9491c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
283 struct swap_info_struct *sis = page_swap_info(page); 283 struct swap_info_struct *sis = page_swap_info(page);
284 284
285 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 285 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
286 if (sis->flags & SWP_FILE) { 286 if (sis->flags & SWP_FS) {
287 struct kiocb kiocb; 287 struct kiocb kiocb;
288 struct file *swap_file = sis->swap_file; 288 struct file *swap_file = sis->swap_file;
289 struct address_space *mapping = swap_file->f_mapping; 289 struct address_space *mapping = swap_file->f_mapping;
@@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous)
365 goto out; 365 goto out;
366 } 366 }
367 367
368 if (sis->flags & SWP_FILE) { 368 if (sis->flags & SWP_FS) {
369 struct file *swap_file = sis->swap_file; 369 struct file *swap_file = sis->swap_file;
370 struct address_space *mapping = swap_file->f_mapping; 370 struct address_space *mapping = swap_file->f_mapping;
371 371
@@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page)
423{ 423{
424 struct swap_info_struct *sis = page_swap_info(page); 424 struct swap_info_struct *sis = page_swap_info(page);
425 425
426 if (sis->flags & SWP_FILE) { 426 if (sis->flags & SWP_FS) {
427 struct address_space *mapping = sis->swap_file->f_mapping; 427 struct address_space *mapping = sis->swap_file->f_mapping;
428 428
429 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 429 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/slab.c b/mm/slab.c
index aa76a70e087e..2a5654bb3b3f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void)
1288 * Initialize the caches that provide memory for the kmem_cache_node 1288 * Initialize the caches that provide memory for the kmem_cache_node
1289 * structures first. Without this, further allocations will bug. 1289 * structures first. Without this, further allocations will bug.
1290 */ 1290 */
1291 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( 1291 kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
1292 kmalloc_info[INDEX_NODE].name, 1292 kmalloc_info[INDEX_NODE].name,
1293 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, 1293 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
1294 0, kmalloc_size(INDEX_NODE)); 1294 0, kmalloc_size(INDEX_NODE));
@@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void)
1304 for_each_online_node(nid) { 1304 for_each_online_node(nid) {
1305 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1305 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1306 1306
1307 init_list(kmalloc_caches[INDEX_NODE], 1307 init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
1308 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1308 &init_kmem_cache_node[SIZE_NODE + nid], nid);
1309 } 1309 }
1310 } 1310 }
@@ -3675,6 +3675,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3675 struct kmem_cache *cachep; 3675 struct kmem_cache *cachep;
3676 void *ret; 3676 void *ret;
3677 3677
3678 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3679 return NULL;
3678 cachep = kmalloc_slab(size, flags); 3680 cachep = kmalloc_slab(size, flags);
3679 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3681 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3680 return cachep; 3682 return cachep;
@@ -3710,6 +3712,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3710 struct kmem_cache *cachep; 3712 struct kmem_cache *cachep;
3711 void *ret; 3713 void *ret;
3712 3714
3715 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3716 return NULL;
3713 cachep = kmalloc_slab(size, flags); 3717 cachep = kmalloc_slab(size, flags);
3714 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3718 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3715 return cachep; 3719 return cachep;
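
__do_kmalloc() and __do_kmalloc_node() now bail out for requests above KMALLOC_MAX_CACHE_SIZE before consulting kmalloc_slab(), since the size check moved out of the lookup itself (see the mm/slab_common.c hunks below). A standalone sketch of the guarded size-to-index mapping; the bound used here and the omission of the <= 192 lookup table are simplifications, not the kernel's exact values.

/* kmalloc_index_demo.c - map an allocation size to a power-of-two kmalloc index.
 * The kernel uses a lookup table for sizes <= 192; that part is omitted here. */
#include <stdio.h>

/* Illustrative bound only; the real KMALLOC_MAX_CACHE_SIZE is config-dependent. */
#define MAX_CACHE_SIZE_DEMO	(1UL << 22)

/* Portable stand-in for the kernel's fls(): 1-based index of the highest set bit. */
static int fls_demo(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* Mirrors the guarded lookup: oversized requests return -1 instead of indexing the array. */
static int size_to_index(unsigned long size)
{
	if (!size)
		return -1;
	if (size > MAX_CACHE_SIZE_DEMO)
		return -1;
	return fls_demo(size - 1);	/* 2^(index-1) < size <= 2^index */
}

int main(void)
{
	unsigned long sizes[] = { 256, 1024, 4096, 4097, 1UL << 22, 1UL << 23 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("size %8lu -> index %d\n", sizes[i], size_to_index(sizes[i]));
	return 0;
}
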
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fea3376f9816..7eb8dc136c1c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
973 return s; 973 return s;
974} 974}
975 975
976struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; 976struct kmem_cache *
977kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
977EXPORT_SYMBOL(kmalloc_caches); 978EXPORT_SYMBOL(kmalloc_caches);
978 979
979#ifdef CONFIG_ZONE_DMA
980struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
981EXPORT_SYMBOL(kmalloc_dma_caches);
982#endif
983
984/* 980/*
985 * Conversion table for small slabs sizes / 8 to the index in the 981 * Conversion table for small slabs sizes / 8 to the index in the
986 * kmalloc array. This is necessary for slabs < 192 since we have non power 982 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -1027,25 +1023,20 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
1027{ 1023{
1028 unsigned int index; 1024 unsigned int index;
1029 1025
1030 if (unlikely(size > KMALLOC_MAX_SIZE)) {
1031 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
1032 return NULL;
1033 }
1034
1035 if (size <= 192) { 1026 if (size <= 192) {
1036 if (!size) 1027 if (!size)
1037 return ZERO_SIZE_PTR; 1028 return ZERO_SIZE_PTR;
1038 1029
1039 index = size_index[size_index_elem(size)]; 1030 index = size_index[size_index_elem(size)];
1040 } else 1031 } else {
1032 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
1033 WARN_ON(1);
1034 return NULL;
1035 }
1041 index = fls(size - 1); 1036 index = fls(size - 1);
1037 }
1042 1038
1043#ifdef CONFIG_ZONE_DMA 1039 return kmalloc_caches[kmalloc_type(flags)][index];
1044 if (unlikely((flags & GFP_DMA)))
1045 return kmalloc_dma_caches[index];
1046
1047#endif
1048 return kmalloc_caches[index];
1049} 1040}
1050 1041
1051/* 1042/*
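
kmalloc_slab() now selects a row of the two-dimensional kmalloc_caches array via kmalloc_type(flags) instead of the CONFIG_ZONE_DMA special case. kmalloc_type() itself is not part of this hunk; the selector below is only a plausible sketch of the idea (the flag values and precedence rule are assumptions), showing how GFP bits can pick a cache row before the size index picks the column.

/* kmalloc_type_demo.c - GFP bits select the cache row, size selects the column */
#include <stdio.h>

enum kmalloc_cache_type_demo {
	KMALLOC_NORMAL_DEMO,
	KMALLOC_RECLAIM_DEMO,
	KMALLOC_DMA_DEMO,
	NR_KMALLOC_TYPES_DEMO,
};

#define GFP_DMA_DEMO		0x01u	/* illustrative bit values, not the kernel's */
#define GFP_RECLAIMABLE_DEMO	0x02u

static enum kmalloc_cache_type_demo kmalloc_type_demo(unsigned int flags)
{
	if (flags & GFP_DMA_DEMO)
		return KMALLOC_DMA_DEMO;	/* DMA wins if both bits are set */
	if (flags & GFP_RECLAIMABLE_DEMO)
		return KMALLOC_RECLAIM_DEMO;
	return KMALLOC_NORMAL_DEMO;
}

int main(void)
{
	/* Stand-in for kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]. */
	const char *caches[NR_KMALLOC_TYPES_DEMO][3] = {
		{ "kmalloc-16",     "kmalloc-32",     "kmalloc-64"     },
		{ "kmalloc-rcl-16", "kmalloc-rcl-32", "kmalloc-rcl-64" },
		{ "dma-kmalloc-16", "dma-kmalloc-32", "dma-kmalloc-64" },
	};

	printf("%s\n", caches[kmalloc_type_demo(0)][1]);
	printf("%s\n", caches[kmalloc_type_demo(GFP_RECLAIMABLE_DEMO)][1]);
	printf("%s\n", caches[kmalloc_type_demo(GFP_DMA_DEMO)][1]);
	return 0;
}
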
@@ -1059,15 +1050,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
1059 {"kmalloc-16", 16}, {"kmalloc-32", 32}, 1050 {"kmalloc-16", 16}, {"kmalloc-32", 32},
1060 {"kmalloc-64", 64}, {"kmalloc-128", 128}, 1051 {"kmalloc-64", 64}, {"kmalloc-128", 128},
1061 {"kmalloc-256", 256}, {"kmalloc-512", 512}, 1052 {"kmalloc-256", 256}, {"kmalloc-512", 512},
1062 {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, 1053 {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048},
1063 {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, 1054 {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192},
1064 {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, 1055 {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768},
1065 {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, 1056 {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072},
1066 {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, 1057 {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288},
1067 {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, 1058 {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152},
1068 {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, 1059 {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608},
1069 {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, 1060 {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432},
1070 {"kmalloc-67108864", 67108864} 1061 {"kmalloc-64M", 67108864}
1071}; 1062};
1072 1063
1073/* 1064/*
@@ -1117,9 +1108,36 @@ void __init setup_kmalloc_cache_index_table(void)
1117 } 1108 }
1118} 1109}
1119 1110
1120static void __init new_kmalloc_cache(int idx, slab_flags_t flags) 1111static const char *
1112kmalloc_cache_name(const char *prefix, unsigned int size)
1113{
1114
1115 static const char units[3] = "\0kM";
1116 int idx = 0;
1117
1118 while (size >= 1024 && (size % 1024 == 0)) {
1119 size /= 1024;
1120 idx++;
1121 }
1122
1123 return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
1124}
1125
1126static void __init
1127new_kmalloc_cache(int idx, int type, slab_flags_t flags)
1121{ 1128{
1122 kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, 1129 const char *name;
1130
1131 if (type == KMALLOC_RECLAIM) {
1132 flags |= SLAB_RECLAIM_ACCOUNT;
1133 name = kmalloc_cache_name("kmalloc-rcl",
1134 kmalloc_info[idx].size);
1135 BUG_ON(!name);
1136 } else {
1137 name = kmalloc_info[idx].name;
1138 }
1139
1140 kmalloc_caches[type][idx] = create_kmalloc_cache(name,
1123 kmalloc_info[idx].size, flags, 0, 1141 kmalloc_info[idx].size, flags, 0,
1124 kmalloc_info[idx].size); 1142 kmalloc_info[idx].size);
1125} 1143}
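
kmalloc_cache_name() builds the new human-readable names ("kmalloc-rcl-1k", "dma-kmalloc-4M", ...) by dividing the size down while it remains an exact multiple of 1024 and appending the matching unit character. A userspace model of that loop, using snprintf() in place of kasprintf():

/* cache_name_demo.c - derive "kmalloc-1k" / "kmalloc-8M" style names from a byte size */
#include <stdio.h>

static void cache_name(char *buf, size_t len, const char *prefix, unsigned int size)
{
	static const char units[3] = "\0kM";	/* no suffix, kilobytes, megabytes */
	int idx = 0;

	/* Divide down while the size stays an exact multiple of 1024. */
	while (size >= 1024 && (size % 1024 == 0)) {
		size /= 1024;
		idx++;
	}
	snprintf(buf, len, "%s-%u%c", prefix, size, units[idx]);
}

int main(void)
{
	unsigned int sizes[] = { 96, 512, 1024, 8192, 1048576, 8388608 };
	char name[32];
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		cache_name(name, sizeof(name), "kmalloc", sizes[i]);
		printf("%u -> %s\n", sizes[i], name);
	}
	return 0;
}
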
@@ -1131,21 +1149,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
1131 */ 1149 */
1132void __init create_kmalloc_caches(slab_flags_t flags) 1150void __init create_kmalloc_caches(slab_flags_t flags)
1133{ 1151{
1134 int i; 1152 int i, type;
1135 1153
1136 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 1154 for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
1137 if (!kmalloc_caches[i]) 1155 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
1138 new_kmalloc_cache(i, flags); 1156 if (!kmalloc_caches[type][i])
1157 new_kmalloc_cache(i, type, flags);
1139 1158
1140 /* 1159 /*
1141 * Caches that are not of the two-to-the-power-of size. 1160 * Caches that are not of the two-to-the-power-of size.
1142 * These have to be created immediately after the 1161 * These have to be created immediately after the
1143 * earlier power of two caches 1162 * earlier power of two caches
1144 */ 1163 */
1145 if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) 1164 if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
1146 new_kmalloc_cache(1, flags); 1165 !kmalloc_caches[type][1])
1147 if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) 1166 new_kmalloc_cache(1, type, flags);
1148 new_kmalloc_cache(2, flags); 1167 if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
1168 !kmalloc_caches[type][2])
1169 new_kmalloc_cache(2, type, flags);
1170 }
1149 } 1171 }
1150 1172
1151 /* Kmalloc array is now usable */ 1173 /* Kmalloc array is now usable */
@@ -1153,16 +1175,15 @@ void __init create_kmalloc_caches(slab_flags_t flags)
1153 1175
1154#ifdef CONFIG_ZONE_DMA 1176#ifdef CONFIG_ZONE_DMA
1155 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { 1177 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
1156 struct kmem_cache *s = kmalloc_caches[i]; 1178 struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
1157 1179
1158 if (s) { 1180 if (s) {
1159 unsigned int size = kmalloc_size(i); 1181 unsigned int size = kmalloc_size(i);
1160 char *n = kasprintf(GFP_NOWAIT, 1182 const char *n = kmalloc_cache_name("dma-kmalloc", size);
1161 "dma-kmalloc-%u", size);
1162 1183
1163 BUG_ON(!n); 1184 BUG_ON(!n);
1164 kmalloc_dma_caches[i] = create_kmalloc_cache(n, 1185 kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
1165 size, SLAB_CACHE_DMA | flags, 0, 0); 1186 n, size, SLAB_CACHE_DMA | flags, 0, 0);
1166 } 1187 }
1167 } 1188 }
1168#endif 1189#endif
diff --git a/mm/slub.c b/mm/slub.c
index 8da34a8af53d..e3629cd7aff1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1276,16 +1276,54 @@ out:
1276 1276
1277__setup("slub_debug", setup_slub_debug); 1277__setup("slub_debug", setup_slub_debug);
1278 1278
1279/*
1280 * kmem_cache_flags - apply debugging options to the cache
1281 * @object_size: the size of an object without meta data
1282 * @flags: flags to set
1283 * @name: name of the cache
1284 * @ctor: constructor function
1285 *
1286 * Debug option(s) are applied to @flags. In addition to the debug
1287 * option(s), if a slab name (or multiple) is specified i.e.
1288 * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1289 * then only the select slabs will receive the debug option(s).
1290 */
1279slab_flags_t kmem_cache_flags(unsigned int object_size, 1291slab_flags_t kmem_cache_flags(unsigned int object_size,
1280 slab_flags_t flags, const char *name, 1292 slab_flags_t flags, const char *name,
1281 void (*ctor)(void *)) 1293 void (*ctor)(void *))
1282{ 1294{
1283 /* 1295 char *iter;
1284 * Enable debugging if selected on the kernel commandline. 1296 size_t len;
1285 */ 1297
1286 if (slub_debug && (!slub_debug_slabs || (name && 1298 /* If slub_debug = 0, it folds into the if conditional. */
1287 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) 1299 if (!slub_debug_slabs)
1288 flags |= slub_debug; 1300 return flags | slub_debug;
1301
1302 len = strlen(name);
1303 iter = slub_debug_slabs;
1304 while (*iter) {
1305 char *end, *glob;
1306 size_t cmplen;
1307
1308 end = strchr(iter, ',');
1309 if (!end)
1310 end = iter + strlen(iter);
1311
1312 glob = strnchr(iter, end - iter, '*');
1313 if (glob)
1314 cmplen = glob - iter;
1315 else
1316 cmplen = max_t(size_t, len, (end - iter));
1317
1318 if (!strncmp(name, iter, cmplen)) {
1319 flags |= slub_debug;
1320 break;
1321 }
1322
1323 if (!*end)
1324 break;
1325 iter = end + 1;
1326 }
1289 1327
1290 return flags; 1328 return flags;
1291} 1329}
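
kmem_cache_flags() now matches the cache name against the comma-separated slub_debug slab list, with a trailing '*' acting as a prefix glob (so slub_debug=F,kmalloc-* covers every kmalloc cache). A standalone model of that matcher; memchr() stands in for the kernel's strnchr().

/* slub_debug_match.c - model of matching a cache name against slub_debug=<opts>,<name1>,<name2>... */
#include <stdio.h>
#include <string.h>

static size_t max_sz(size_t a, size_t b) { return a > b ? a : b; }

/* Returns 1 if @name matches one of the comma-separated patterns in @list.
 * A '*' in a pattern turns it into a prefix match, e.g. "kmalloc-*". */
static int slab_list_match(const char *name, const char *list)
{
	size_t len = strlen(name);
	const char *iter = list;

	while (*iter) {
		const char *end = strchr(iter, ',');
		const char *glob;
		size_t cmplen;

		if (!end)
			end = iter + strlen(iter);

		glob = memchr(iter, '*', end - iter);
		if (glob)
			cmplen = glob - iter;				/* prefix match up to the '*' */
		else
			cmplen = max_sz(len, (size_t)(end - iter));	/* exact match */

		if (!strncmp(name, iter, cmplen))
			return 1;

		if (!*end)
			break;
		iter = end + 1;
	}
	return 0;
}

int main(void)
{
	const char *list = "dentry,kmalloc-*";

	printf("dentry      -> %d\n", slab_list_match("dentry", list));
	printf("kmalloc-1k  -> %d\n", slab_list_match("kmalloc-1k", list));
	printf("inode_cache -> %d\n", slab_list_match("inode_cache", list));
	return 0;
}
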
@@ -3621,9 +3659,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3621#ifdef CONFIG_SLUB_DEBUG 3659#ifdef CONFIG_SLUB_DEBUG
3622 void *addr = page_address(page); 3660 void *addr = page_address(page);
3623 void *p; 3661 void *p;
3624 unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), 3662 unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
3625 sizeof(long),
3626 GFP_ATOMIC);
3627 if (!map) 3663 if (!map)
3628 return; 3664 return;
3629 slab_err(s, page, text, s->name); 3665 slab_err(s, page, text, s->name);
@@ -3638,7 +3674,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3638 } 3674 }
3639 } 3675 }
3640 slab_unlock(page); 3676 slab_unlock(page);
3641 kfree(map); 3677 bitmap_free(map);
3642#endif 3678#endif
3643} 3679}
3644 3680
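
Here and in the validate_slab_cache()/list_locations() hunks below, the open-coded kcalloc(BITS_TO_LONGS(n), sizeof(long), ...) pattern gives way to bitmap_zalloc()/bitmap_alloc() and bitmap_free(). A userspace approximation of what those helpers wrap, ignoring gfp flags:

/* bitmap_alloc_demo.c - model of bitmap_zalloc()/bitmap_free() over calloc() */
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#define BITS_PER_LONG_DEMO	(sizeof(unsigned long) * CHAR_BIT)
#define BITS_TO_LONGS_DEMO(nr)	(((nr) + BITS_PER_LONG_DEMO - 1) / BITS_PER_LONG_DEMO)

static unsigned long *bitmap_zalloc_demo(unsigned int nbits)
{
	/* One zeroed allocation sized in longs; the kernel helper also takes gfp flags. */
	return calloc(BITS_TO_LONGS_DEMO(nbits), sizeof(unsigned long));
}

static void bitmap_free_demo(unsigned long *map)
{
	free(map);
}

int main(void)
{
	unsigned int objects = 100;	/* e.g. page->objects in the hunk above */
	unsigned long *map = bitmap_zalloc_demo(objects);

	if (!map)
		return 1;
	map[0] |= 1UL << 3;		/* mark object 3 as in use */
	printf("%u bits stored in %zu longs\n", objects, BITS_TO_LONGS_DEMO(objects));
	bitmap_free_demo(map);
	return 0;
}
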
@@ -4411,10 +4447,8 @@ static long validate_slab_cache(struct kmem_cache *s)
4411{ 4447{
4412 int node; 4448 int node;
4413 unsigned long count = 0; 4449 unsigned long count = 0;
4414 unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
4415 sizeof(unsigned long),
4416 GFP_KERNEL);
4417 struct kmem_cache_node *n; 4450 struct kmem_cache_node *n;
4451 unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
4418 4452
4419 if (!map) 4453 if (!map)
4420 return -ENOMEM; 4454 return -ENOMEM;
@@ -4422,7 +4456,7 @@ static long validate_slab_cache(struct kmem_cache *s)
4422 flush_all(s); 4456 flush_all(s);
4423 for_each_kmem_cache_node(s, node, n) 4457 for_each_kmem_cache_node(s, node, n)
4424 count += validate_slab_node(s, n, map); 4458 count += validate_slab_node(s, n, map);
4425 kfree(map); 4459 bitmap_free(map);
4426 return count; 4460 return count;
4427} 4461}
4428/* 4462/*
@@ -4573,14 +4607,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
4573 unsigned long i; 4607 unsigned long i;
4574 struct loc_track t = { 0, 0, NULL }; 4608 struct loc_track t = { 0, 0, NULL };
4575 int node; 4609 int node;
4576 unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
4577 sizeof(unsigned long),
4578 GFP_KERNEL);
4579 struct kmem_cache_node *n; 4610 struct kmem_cache_node *n;
4611 unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
4580 4612
4581 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4613 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4582 GFP_KERNEL)) { 4614 GFP_KERNEL)) {
4583 kfree(map); 4615 bitmap_free(map);
4584 return sprintf(buf, "Out of memory\n"); 4616 return sprintf(buf, "Out of memory\n");
4585 } 4617 }
4586 /* Push back cpu slabs */ 4618 /* Push back cpu slabs */
@@ -4646,7 +4678,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4646 } 4678 }
4647 4679
4648 free_loc_track(&t); 4680 free_loc_track(&t);
4649 kfree(map); 4681 bitmap_free(map);
4650 if (!t.count) 4682 if (!t.count)
4651 len += sprintf(buf, "No data\n"); 4683 len += sprintf(buf, "No data\n");
4652 return len; 4684 return len;
@@ -4657,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4657static void __init resiliency_test(void) 4689static void __init resiliency_test(void)
4658{ 4690{
4659 u8 *p; 4691 u8 *p;
4692 int type = KMALLOC_NORMAL;
4660 4693
4661 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); 4694 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4662 4695
@@ -4669,7 +4702,7 @@ static void __init resiliency_test(void)
4669 pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", 4702 pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
4670 p + 16); 4703 p + 16);
4671 4704
4672 validate_slab_cache(kmalloc_caches[4]); 4705 validate_slab_cache(kmalloc_caches[type][4]);
4673 4706
4674 /* Hmmm... The next two are dangerous */ 4707 /* Hmmm... The next two are dangerous */
4675 p = kzalloc(32, GFP_KERNEL); 4708 p = kzalloc(32, GFP_KERNEL);
@@ -4678,33 +4711,33 @@ static void __init resiliency_test(void)
4678 p); 4711 p);
4679 pr_err("If allocated object is overwritten then not detectable\n\n"); 4712 pr_err("If allocated object is overwritten then not detectable\n\n");
4680 4713
4681 validate_slab_cache(kmalloc_caches[5]); 4714 validate_slab_cache(kmalloc_caches[type][5]);
4682 p = kzalloc(64, GFP_KERNEL); 4715 p = kzalloc(64, GFP_KERNEL);
4683 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4716 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4684 *p = 0x56; 4717 *p = 0x56;
4685 pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4718 pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4686 p); 4719 p);
4687 pr_err("If allocated object is overwritten then not detectable\n\n"); 4720 pr_err("If allocated object is overwritten then not detectable\n\n");
4688 validate_slab_cache(kmalloc_caches[6]); 4721 validate_slab_cache(kmalloc_caches[type][6]);
4689 4722
4690 pr_err("\nB. Corruption after free\n"); 4723 pr_err("\nB. Corruption after free\n");
4691 p = kzalloc(128, GFP_KERNEL); 4724 p = kzalloc(128, GFP_KERNEL);
4692 kfree(p); 4725 kfree(p);
4693 *p = 0x78; 4726 *p = 0x78;
4694 pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4727 pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4695 validate_slab_cache(kmalloc_caches[7]); 4728 validate_slab_cache(kmalloc_caches[type][7]);
4696 4729
4697 p = kzalloc(256, GFP_KERNEL); 4730 p = kzalloc(256, GFP_KERNEL);
4698 kfree(p); 4731 kfree(p);
4699 p[50] = 0x9a; 4732 p[50] = 0x9a;
4700 pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); 4733 pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
4701 validate_slab_cache(kmalloc_caches[8]); 4734 validate_slab_cache(kmalloc_caches[type][8]);
4702 4735
4703 p = kzalloc(512, GFP_KERNEL); 4736 p = kzalloc(512, GFP_KERNEL);
4704 kfree(p); 4737 kfree(p);
4705 p[512] = 0xab; 4738 p[512] = 0xab;
4706 pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4739 pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4707 validate_slab_cache(kmalloc_caches[9]); 4740 validate_slab_cache(kmalloc_caches[type][9]);
4708} 4741}
4709#else 4742#else
4710#ifdef CONFIG_SYSFS 4743#ifdef CONFIG_SYSFS
diff --git a/mm/sparse.c b/mm/sparse.c
index 10b07eea9a6e..67ad061f7fb8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -696,13 +696,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
696 goto out; 696 goto out;
697 } 697 }
698 698
699#ifdef CONFIG_DEBUG_VM
700 /* 699 /*
701 * Poison uninitialized struct pages in order to catch invalid flags 700 * Poison uninitialized struct pages in order to catch invalid flags
702 * combinations. 701 * combinations.
703 */ 702 */
704 memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); 703 page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
705#endif
706 704
707 section_mark_present(ms); 705 section_mark_present(ms);
708 sparse_init_one_section(ms, section_nr, memmap, usemap); 706 sparse_init_one_section(ms, section_nr, memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 26fc9b5f1b6c..87a54c8dee34 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,7 +29,6 @@
29#include <linux/cpu.h> 29#include <linux/cpu.h>
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memremap.h>
33#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
34#include <linux/gfp.h> 33#include <linux/gfp.h>
35#include <linux/uio.h> 34#include <linux/uio.h>
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ecee9c6c4cc1..0d6a7f268d2e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
448 /* 448 /*
449 * Initiate read into locked page and return. 449 * Initiate read into locked page and return.
450 */ 450 */
451 SetPageWorkingset(new_page);
451 lru_cache_add_anon(new_page); 452 lru_cache_add_anon(new_page);
452 *new_page_allocated = true; 453 *new_page_allocated = true;
453 return new_page; 454 return new_page;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d954b71c4f9c..644f746e167a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent)
103 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ 103 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
104} 104}
105 105
106/* Reclaim the swap entry anyway if possible */
107#define TTRS_ANYWAY 0x1
108/*
109 * Reclaim the swap entry if there are no more mappings of the
110 * corresponding page
111 */
112#define TTRS_UNMAPPED 0x2
113/* Reclaim the swap entry if swap is getting full*/
114#define TTRS_FULL 0x4
115
106/* returns 1 if swap entry is freed */ 116/* returns 1 if swap entry is freed */
107static int 117static int __try_to_reclaim_swap(struct swap_info_struct *si,
108__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) 118 unsigned long offset, unsigned long flags)
109{ 119{
110 swp_entry_t entry = swp_entry(si->type, offset); 120 swp_entry_t entry = swp_entry(si->type, offset);
111 struct page *page; 121 struct page *page;
112 int ret = 0; 122 int ret = 0;
113 123
114 page = find_get_page(swap_address_space(entry), swp_offset(entry)); 124 page = find_get_page(swap_address_space(entry), offset);
115 if (!page) 125 if (!page)
116 return 0; 126 return 0;
117 /* 127 /*
118 * This function is called from scan_swap_map() and it's called 128 * When this function is called from scan_swap_map_slots() and it's
119 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. 129 * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
120 * We have to use trylock for avoiding deadlock. This is a special 130 * here. We have to use trylock for avoiding deadlock. This is a special
121 * case and you should use try_to_free_swap() with explicit lock_page() 131 * case and you should use try_to_free_swap() with explicit lock_page()
122 * in usual operations. 132 * in usual operations.
123 */ 133 */
124 if (trylock_page(page)) { 134 if (trylock_page(page)) {
125 ret = try_to_free_swap(page); 135 if ((flags & TTRS_ANYWAY) ||
136 ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
137 ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
138 ret = try_to_free_swap(page);
126 unlock_page(page); 139 unlock_page(page);
127 } 140 }
128 put_page(page); 141 put_page(page);
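
__try_to_reclaim_swap() now takes TTRS_* flags so each caller states its own reclaim condition instead of open-coding the page checks (free_swap_and_cache() below passes TTRS_UNMAPPED | TTRS_FULL). The gate reduces to one boolean expression; a tiny model:

/* ttrs_demo.c - model of gating a reclaim decision on caller-selected conditions */
#include <stdio.h>

#define TTRS_ANYWAY	0x1	/* reclaim unconditionally */
#define TTRS_UNMAPPED	0x2	/* reclaim only if no mappings remain */
#define TTRS_FULL	0x4	/* reclaim only if swap is getting full */

static int should_reclaim(unsigned long flags, int page_mapped, int swap_full)
{
	return (flags & TTRS_ANYWAY) ||
	       ((flags & TTRS_UNMAPPED) && !page_mapped) ||
	       ((flags & TTRS_FULL) && swap_full);
}

int main(void)
{
	printf("%d\n", should_reclaim(TTRS_ANYWAY, 1, 0));			/* 1 */
	printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, 1, 0));	/* 0 */
	printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, 0, 0));	/* 1 */
	return 0;
}
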
@@ -780,7 +793,7 @@ checks:
780 int swap_was_freed; 793 int swap_was_freed;
781 unlock_cluster(ci); 794 unlock_cluster(ci);
782 spin_unlock(&si->lock); 795 spin_unlock(&si->lock);
783 swap_was_freed = __try_to_reclaim_swap(si, offset); 796 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
784 spin_lock(&si->lock); 797 spin_lock(&si->lock);
785 /* entry was freed successfully, try to use this again */ 798 /* entry was freed successfully, try to use this again */
786 if (swap_was_freed) 799 if (swap_was_freed)
@@ -919,6 +932,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
919 struct swap_cluster_info *ci; 932 struct swap_cluster_info *ci;
920 933
921 ci = lock_cluster(si, offset); 934 ci = lock_cluster(si, offset);
935 memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
922 cluster_set_count_flag(ci, 0, 0); 936 cluster_set_count_flag(ci, 0, 0);
923 free_cluster(si, idx); 937 free_cluster(si, idx);
924 unlock_cluster(ci); 938 unlock_cluster(ci);
@@ -989,7 +1003,7 @@ start_over:
989 goto nextsi; 1003 goto nextsi;
990 } 1004 }
991 if (size == SWAPFILE_CLUSTER) { 1005 if (size == SWAPFILE_CLUSTER) {
992 if (!(si->flags & SWP_FILE)) 1006 if (!(si->flags & SWP_FS))
993 n_ret = swap_alloc_cluster(si, swp_entries); 1007 n_ret = swap_alloc_cluster(si, swp_entries);
994 } else 1008 } else
995 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, 1009 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
@@ -1169,6 +1183,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
1169 ci = lock_cluster_or_swap_info(p, offset); 1183 ci = lock_cluster_or_swap_info(p, offset);
1170 usage = __swap_entry_free_locked(p, offset, usage); 1184 usage = __swap_entry_free_locked(p, offset, usage);
1171 unlock_cluster_or_swap_info(p, ci); 1185 unlock_cluster_or_swap_info(p, ci);
1186 if (!usage)
1187 free_swap_slot(entry);
1172 1188
1173 return usage; 1189 return usage;
1174} 1190}
@@ -1199,10 +1215,8 @@ void swap_free(swp_entry_t entry)
1199 struct swap_info_struct *p; 1215 struct swap_info_struct *p;
1200 1216
1201 p = _swap_info_get(entry); 1217 p = _swap_info_get(entry);
1202 if (p) { 1218 if (p)
1203 if (!__swap_entry_free(p, entry, 1)) 1219 __swap_entry_free(p, entry, 1);
1204 free_swap_slot(entry);
1205 }
1206} 1220}
1207 1221
1208/* 1222/*
@@ -1237,9 +1251,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
1237 if (free_entries == SWAPFILE_CLUSTER) { 1251 if (free_entries == SWAPFILE_CLUSTER) {
1238 unlock_cluster_or_swap_info(si, ci); 1252 unlock_cluster_or_swap_info(si, ci);
1239 spin_lock(&si->lock); 1253 spin_lock(&si->lock);
1240 ci = lock_cluster(si, offset);
1241 memset(map, 0, SWAPFILE_CLUSTER);
1242 unlock_cluster(ci);
1243 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); 1254 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1244 swap_free_cluster(si, idx); 1255 swap_free_cluster(si, idx);
1245 spin_unlock(&si->lock); 1256 spin_unlock(&si->lock);
@@ -1612,7 +1623,6 @@ int try_to_free_swap(struct page *page)
1612int free_swap_and_cache(swp_entry_t entry) 1623int free_swap_and_cache(swp_entry_t entry)
1613{ 1624{
1614 struct swap_info_struct *p; 1625 struct swap_info_struct *p;
1615 struct page *page = NULL;
1616 unsigned char count; 1626 unsigned char count;
1617 1627
1618 if (non_swap_entry(entry)) 1628 if (non_swap_entry(entry))
@@ -1622,30 +1632,9 @@ int free_swap_and_cache(swp_entry_t entry)
1622 if (p) { 1632 if (p) {
1623 count = __swap_entry_free(p, entry, 1); 1633 count = __swap_entry_free(p, entry, 1);
1624 if (count == SWAP_HAS_CACHE && 1634 if (count == SWAP_HAS_CACHE &&
1625 !swap_page_trans_huge_swapped(p, entry)) { 1635 !swap_page_trans_huge_swapped(p, entry))
1626 page = find_get_page(swap_address_space(entry), 1636 __try_to_reclaim_swap(p, swp_offset(entry),
1627 swp_offset(entry)); 1637 TTRS_UNMAPPED | TTRS_FULL);
1628 if (page && !trylock_page(page)) {
1629 put_page(page);
1630 page = NULL;
1631 }
1632 } else if (!count)
1633 free_swap_slot(entry);
1634 }
1635 if (page) {
1636 /*
1637 * Not mapped elsewhere, or swap space full? Free it!
1638 * Also recheck PageSwapCache now page is locked (above).
1639 */
1640 if (PageSwapCache(page) && !PageWriteback(page) &&
1641 (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1642 !swap_page_trans_huge_swapped(p, entry)) {
1643 page = compound_head(page);
1644 delete_from_swap_cache(page);
1645 SetPageDirty(page);
1646 }
1647 unlock_page(page);
1648 put_page(page);
1649 } 1638 }
1650 return p != NULL; 1639 return p != NULL;
1651} 1640}
@@ -2310,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
2310 kfree(se); 2299 kfree(se);
2311 } 2300 }
2312 2301
2313 if (sis->flags & SWP_FILE) { 2302 if (sis->flags & SWP_ACTIVATED) {
2314 struct file *swap_file = sis->swap_file; 2303 struct file *swap_file = sis->swap_file;
2315 struct address_space *mapping = swap_file->f_mapping; 2304 struct address_space *mapping = swap_file->f_mapping;
2316 2305
2317 sis->flags &= ~SWP_FILE; 2306 sis->flags &= ~SWP_ACTIVATED;
2318 mapping->a_ops->swap_deactivate(swap_file); 2307 if (mapping->a_ops->swap_deactivate)
2308 mapping->a_ops->swap_deactivate(swap_file);
2319 } 2309 }
2320} 2310}
2321 2311
@@ -2364,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2364 list_add_tail(&new_se->list, &sis->first_swap_extent.list); 2354 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2365 return 1; 2355 return 1;
2366} 2356}
2357EXPORT_SYMBOL_GPL(add_swap_extent);
2367 2358
2368/* 2359/*
2369 * A `swap extent' is a simple thing which maps a contiguous range of pages 2360 * A `swap extent' is a simple thing which maps a contiguous range of pages
@@ -2411,8 +2402,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2411 2402
2412 if (mapping->a_ops->swap_activate) { 2403 if (mapping->a_ops->swap_activate) {
2413 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2404 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2405 if (ret >= 0)
2406 sis->flags |= SWP_ACTIVATED;
2414 if (!ret) { 2407 if (!ret) {
2415 sis->flags |= SWP_FILE; 2408 sis->flags |= SWP_FS;
2416 ret = add_swap_extent(sis, 0, sis->max, 0); 2409 ret = add_swap_extent(sis, 0, sis->max, 0);
2417 *span = sis->pages; 2410 *span = sis->pages;
2418 } 2411 }
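
Across this file and mm/page_io.c, SWP_FILE is split in two: SWP_ACTIVATED records that ->swap_activate() ran (so teardown knows to call ->swap_deactivate()), while SWP_FS marks swapfiles whose I/O must go through the filesystem rather than the block layer, which is the activate-returns-0 case above. A userspace model of that pairing; the struct and helpers follow the hunks above but are otherwise illustrative.

/* swp_flags_demo.c - model of splitting one flag into "activated" and "fs does the I/O" */
#include <stdio.h>

#define SWP_ACTIVATED	0x1	/* ->swap_activate() succeeded; pair with ->swap_deactivate() */
#define SWP_FS		0x2	/* reads/writes go through the filesystem, not the block layer */

struct swap_info_demo { unsigned long flags; };

/* ret < 0: activation failed; ret == 0: fs wants page-level I/O; ret > 0: extents were set up. */
static void setup(struct swap_info_demo *sis, int activate_ret)
{
	if (activate_ret >= 0)
		sis->flags |= SWP_ACTIVATED;
	if (activate_ret == 0)
		sis->flags |= SWP_FS;
}

static void teardown(struct swap_info_demo *sis)
{
	if (sis->flags & SWP_ACTIVATED) {
		sis->flags &= ~SWP_ACTIVATED;
		printf("calling swap_deactivate()\n");
	}
}

int main(void)
{
	struct swap_info_demo a = { 0 }, b = { 0 };

	setup(&a, 0);	/* fs handles the I/O itself */
	setup(&b, 1);	/* fs only built extents: block I/O, but still activated */
	printf("a: FS=%d ACTIVATED=%d\n", !!(a.flags & SWP_FS), !!(a.flags & SWP_ACTIVATED));
	printf("b: FS=%d ACTIVATED=%d\n", !!(b.flags & SWP_FS), !!(b.flags & SWP_ACTIVATED));
	teardown(&a);
	teardown(&b);
	return 0;
}
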
diff --git a/mm/util.c b/mm/util.c
index 470f5cd80b64..8bf08b5b5760 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -435,7 +435,7 @@ EXPORT_SYMBOL(kvmalloc_node);
435 * It is slightly more efficient to use kfree() or vfree() if you are certain 435 * It is slightly more efficient to use kfree() or vfree() if you are certain
436 * that you know which one to use. 436 * that you know which one to use.
437 * 437 *
438 * Context: Any context except NMI. 438 * Context: Either preemptible task context or not-NMI interrupt.
439 */ 439 */
440void kvfree(const void *addr) 440void kvfree(const void *addr)
441{ 441{
@@ -678,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
678 * Part of the kernel memory, which can be released 678 * Part of the kernel memory, which can be released
679 * under memory pressure. 679 * under memory pressure.
680 */ 680 */
681 free += global_node_page_state( 681 free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
682 NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
683 682
684 /* 683 /*
685 * Leave reserved pages. The pages are not for anonymous pages. 684 * Leave reserved pages. The pages are not for anonymous pages.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a728fc492557..97d4b25d0373 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1577,6 +1577,8 @@ void vfree_atomic(const void *addr)
1577 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 1577 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
1578 * conventions for vfree() arch-depenedent would be a really bad idea) 1578 * conventions for vfree() arch-depenedent would be a really bad idea)
1579 * 1579 *
1580 * May sleep if called *not* from interrupt context.
1581 *
1580 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) 1582 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
1581 */ 1583 */
1582void vfree(const void *addr) 1584void vfree(const void *addr)
@@ -1585,6 +1587,8 @@ void vfree(const void *addr)
1585 1587
1586 kmemleak_free(addr); 1588 kmemleak_free(addr);
1587 1589
1590 might_sleep_if(!in_interrupt());
1591
1588 if (!addr) 1592 if (!addr)
1589 return; 1593 return;
1590 if (unlikely(in_interrupt())) 1594 if (unlikely(in_interrupt()))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5ef7240cbcb..28c9ae5633b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -49,6 +49,7 @@
49#include <linux/prefetch.h> 49#include <linux/prefetch.h>
50#include <linux/printk.h> 50#include <linux/printk.h>
51#include <linux/dax.h> 51#include <linux/dax.h>
52#include <linux/psi.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54#include <asm/div64.h> 55#include <asm/div64.h>
@@ -473,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
473 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 474 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
474 475
475 total_scan = nr; 476 total_scan = nr;
476 delta = freeable >> priority; 477 if (shrinker->seeks) {
477 delta *= 4; 478 delta = freeable >> priority;
478 do_div(delta, shrinker->seeks); 479 delta *= 4;
480 do_div(delta, shrinker->seeks);
481 } else {
482 /*
483 * These objects don't require any IO to create. Trim
484 * them aggressively under memory pressure to keep
485 * them from causing refetches in the IO caches.
486 */
487 delta = freeable / 2;
488 }
479 489
480 /* 490 /*
481 * Make sure we apply some minimal pressure on default priority 491 * Make sure we apply some minimal pressure on default priority
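
do_shrink_slab() now special-cases shrinker->seeks == 0: such objects cost no I/O to recreate, so half of the freeable count is scanned regardless of priority (the shadow-node shrinker in mm/workingset.c below switches to seeks = 0). A small model of the two delta formulas:

/* shrink_delta_demo.c - model of how the scan "delta" is derived from freeable objects */
#include <stdio.h>

static unsigned long long scan_delta(unsigned long long freeable, int priority, int seeks)
{
	unsigned long long delta;

	if (seeks) {
		/* Classic behaviour: a priority-scaled slice, adjusted by recreation cost. */
		delta = freeable >> priority;
		delta *= 4;
		delta /= seeks;
	} else {
		/* seeks == 0: objects are cheap to recreate, so trim them aggressively. */
		delta = freeable / 2;
	}
	return delta;
}

int main(void)
{
	printf("seeks=2, prio=12: %llu\n", scan_delta(1ULL << 20, 12, 2));
	printf("seeks=0, prio=12: %llu\n", scan_delta(1ULL << 20, 12, 0));
	return 0;
}
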
@@ -2145,6 +2155,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
2145 } 2155 }
2146 2156
2147 ClearPageActive(page); /* we are de-activating */ 2157 ClearPageActive(page); /* we are de-activating */
2158 SetPageWorkingset(page);
2148 list_add(&page->lru, &l_inactive); 2159 list_add(&page->lru, &l_inactive);
2149 } 2160 }
2150 2161
@@ -2456,9 +2467,11 @@ out:
2456 /* 2467 /*
2457 * Scan types proportional to swappiness and 2468 * Scan types proportional to swappiness and
2458 * their relative recent reclaim efficiency. 2469 * their relative recent reclaim efficiency.
2470 * Make sure we don't miss the last page
2471 * because of a round-off error.
2459 */ 2472 */
2460 scan = div64_u64(scan * fraction[file], 2473 scan = DIV64_U64_ROUND_UP(scan * fraction[file],
2461 denominator); 2474 denominator);
2462 break; 2475 break;
2463 case SCAN_FILE: 2476 case SCAN_FILE:
2464 case SCAN_ANON: 2477 case SCAN_ANON:
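
The switch to DIV64_U64_ROUND_UP means a small LRU list whose proportional share rounds down to zero still gets its last page scanned. The arithmetic in miniature:

/* div_round_up_demo.c - why round-up division matters for the last page */
#include <stdio.h>

#define DIV_ROUND_UP_ULL(a, b)	(((a) + (b) - 1) / (b))

int main(void)
{
	unsigned long long scan = 1, fraction = 3, denominator = 7;

	/* Truncating division can drop the final page of a small list entirely. */
	printf("truncated:  %llu\n", scan * fraction / denominator);			/* 0 */
	printf("rounded up: %llu\n", DIV_ROUND_UP_ULL(scan * fraction, denominator));	/* 1 */
	return 0;
}
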
@@ -3302,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3302{ 3315{
3303 struct zonelist *zonelist; 3316 struct zonelist *zonelist;
3304 unsigned long nr_reclaimed; 3317 unsigned long nr_reclaimed;
3318 unsigned long pflags;
3305 int nid; 3319 int nid;
3306 unsigned int noreclaim_flag; 3320 unsigned int noreclaim_flag;
3307 struct scan_control sc = { 3321 struct scan_control sc = {
@@ -3330,9 +3344,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3330 sc.gfp_mask, 3344 sc.gfp_mask,
3331 sc.reclaim_idx); 3345 sc.reclaim_idx);
3332 3346
3347 psi_memstall_enter(&pflags);
3333 noreclaim_flag = memalloc_noreclaim_save(); 3348 noreclaim_flag = memalloc_noreclaim_save();
3349
3334 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 3350 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3351
3335 memalloc_noreclaim_restore(noreclaim_flag); 3352 memalloc_noreclaim_restore(noreclaim_flag);
3353 psi_memstall_leave(&pflags);
3336 3354
3337 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 3355 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3338 3356
@@ -3497,6 +3515,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3497 int i; 3515 int i;
3498 unsigned long nr_soft_reclaimed; 3516 unsigned long nr_soft_reclaimed;
3499 unsigned long nr_soft_scanned; 3517 unsigned long nr_soft_scanned;
3518 unsigned long pflags;
3500 struct zone *zone; 3519 struct zone *zone;
3501 struct scan_control sc = { 3520 struct scan_control sc = {
3502 .gfp_mask = GFP_KERNEL, 3521 .gfp_mask = GFP_KERNEL,
@@ -3507,6 +3526,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3507 .may_swap = 1, 3526 .may_swap = 1,
3508 }; 3527 };
3509 3528
3529 psi_memstall_enter(&pflags);
3510 __fs_reclaim_acquire(); 3530 __fs_reclaim_acquire();
3511 3531
3512 count_vm_event(PAGEOUTRUN); 3532 count_vm_event(PAGEOUTRUN);
@@ -3608,6 +3628,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3608out: 3628out:
3609 snapshot_refaults(NULL, pgdat); 3629 snapshot_refaults(NULL, pgdat);
3610 __fs_reclaim_release(); 3630 __fs_reclaim_release();
3631 psi_memstall_leave(&pflags);
3611 /* 3632 /*
3612 * Return the order kswapd stopped reclaiming at as 3633 * Return the order kswapd stopped reclaiming at as
3613 * prepare_kswapd_sleep() takes it into account. If another caller 3634 * prepare_kswapd_sleep() takes it into account. If another caller
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7878da76abf2..6038ce593ce3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1143,8 +1143,10 @@ const char * const vmstat_text[] = {
1143 "nr_slab_unreclaimable", 1143 "nr_slab_unreclaimable",
1144 "nr_isolated_anon", 1144 "nr_isolated_anon",
1145 "nr_isolated_file", 1145 "nr_isolated_file",
1146 "workingset_nodes",
1146 "workingset_refault", 1147 "workingset_refault",
1147 "workingset_activate", 1148 "workingset_activate",
1149 "workingset_restore",
1148 "workingset_nodereclaim", 1150 "workingset_nodereclaim",
1149 "nr_anon_pages", 1151 "nr_anon_pages",
1150 "nr_mapped", 1152 "nr_mapped",
@@ -1161,7 +1163,7 @@ const char * const vmstat_text[] = {
1161 "nr_vmscan_immediate_reclaim", 1163 "nr_vmscan_immediate_reclaim",
1162 "nr_dirtied", 1164 "nr_dirtied",
1163 "nr_written", 1165 "nr_written",
1164 "", /* nr_indirectly_reclaimable */ 1166 "nr_kernel_misc_reclaimable",
1165 1167
1166 /* enum writeback_stat_item counters */ 1168 /* enum writeback_stat_item counters */
1167 "nr_dirty_threshold", 1169 "nr_dirty_threshold",
@@ -1663,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
1663 stat_items_size += sizeof(struct vm_event_state); 1665 stat_items_size += sizeof(struct vm_event_state);
1664#endif 1666#endif
1665 1667
1668 BUILD_BUG_ON(stat_items_size !=
1669 ARRAY_SIZE(vmstat_text) * sizeof(unsigned long));
1666 v = kmalloc(stat_items_size, GFP_KERNEL); 1670 v = kmalloc(stat_items_size, GFP_KERNEL);
1667 m->private = v; 1671 m->private = v;
1668 if (!v) 1672 if (!v)
@@ -1706,10 +1710,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
1706 unsigned long *l = arg; 1710 unsigned long *l = arg;
1707 unsigned long off = l - (unsigned long *)m->private; 1711 unsigned long off = l - (unsigned long *)m->private;
1708 1712
1709 /* Skip hidden vmstat items. */
1710 if (*vmstat_text[off] == '\0')
1711 return 0;
1712
1713 seq_puts(m, vmstat_text[off]); 1713 seq_puts(m, vmstat_text[off]);
1714 seq_put_decimal_ull(m, " ", *l); 1714 seq_put_decimal_ull(m, " ", *l);
1715 seq_putc(m, '\n'); 1715 seq_putc(m, '\n');
diff --git a/mm/workingset.c b/mm/workingset.c
index 4516dd790129..cbc13d4dfa79 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -121,7 +121,7 @@
121 * the only thing eating into inactive list space is active pages. 121 * the only thing eating into inactive list space is active pages.
122 * 122 *
123 * 123 *
124 * Activating refaulting pages 124 * Refaulting inactive pages
125 * 125 *
126 * All that is known about the active list is that the pages have been 126 * All that is known about the active list is that the pages have been
127 * accessed more than once in the past. This means that at any given 127 * accessed more than once in the past. This means that at any given
@@ -134,6 +134,10 @@
134 * used less frequently than the refaulting page - or even not used at 134 * used less frequently than the refaulting page - or even not used at
135 * all anymore. 135 * all anymore.
136 * 136 *
137 * That means if inactive cache is refaulting with a suitable refault
138 * distance, we assume the cache workingset is transitioning and put
139 * pressure on the current active list.
140 *
137 * If this is wrong and demotion kicks in, the pages which are truly 141 * If this is wrong and demotion kicks in, the pages which are truly
138 * used more frequently will be reactivated while the less frequently 142 * used more frequently will be reactivated while the less frequently
139 * used once will be evicted from memory. 143 * used once will be evicted from memory.
@@ -141,6 +145,14 @@
141 * But if this is right, the stale pages will be pushed out of memory 145 * But if this is right, the stale pages will be pushed out of memory
142 * and the used pages get to stay in cache. 146 * and the used pages get to stay in cache.
143 * 147 *
148 * Refaulting active pages
149 *
150 * If on the other hand the refaulting pages have recently been
151 * deactivated, it means that the active list is no longer protecting
152 * actively used cache from reclaim. The cache is NOT transitioning to
153 * a different workingset; the existing workingset is thrashing in the
154 * space allocated to the page cache.
155 *
144 * 156 *
145 * Implementation 157 * Implementation
146 * 158 *
@@ -156,8 +168,7 @@
156 */ 168 */
157 169
158#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ 170#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
159 NODES_SHIFT + \ 171 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
160 MEM_CGROUP_ID_SHIFT)
161#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) 172#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
162 173
163/* 174/*
@@ -170,23 +181,28 @@
170 */ 181 */
171static unsigned int bucket_order __read_mostly; 182static unsigned int bucket_order __read_mostly;
172 183
173static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) 184static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
185 bool workingset)
174{ 186{
175 eviction >>= bucket_order; 187 eviction >>= bucket_order;
176 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; 188 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
177 eviction = (eviction << NODES_SHIFT) | pgdat->node_id; 189 eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
190 eviction = (eviction << 1) | workingset;
178 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); 191 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
179 192
180 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); 193 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
181} 194}
182 195
183static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, 196static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
184 unsigned long *evictionp) 197 unsigned long *evictionp, bool *workingsetp)
185{ 198{
186 unsigned long entry = (unsigned long)shadow; 199 unsigned long entry = (unsigned long)shadow;
187 int memcgid, nid; 200 int memcgid, nid;
201 bool workingset;
188 202
189 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; 203 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
204 workingset = entry & 1;
205 entry >>= 1;
190 nid = entry & ((1UL << NODES_SHIFT) - 1); 206 nid = entry & ((1UL << NODES_SHIFT) - 1);
191 entry >>= NODES_SHIFT; 207 entry >>= NODES_SHIFT;
192 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); 208 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
195 *memcgidp = memcgid; 211 *memcgidp = memcgid;
196 *pgdat = NODE_DATA(nid); 212 *pgdat = NODE_DATA(nid);
197 *evictionp = entry << bucket_order; 213 *evictionp = entry << bucket_order;
214 *workingsetp = workingset;
198} 215}
199 216
200/** 217/**
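
pack_shadow()/unpack_shadow() now carry an extra PageWorkingset bit alongside the memcg id, node id and eviction counter in a single pointer-sized value (hence the extra "1 +" in EVICTION_SHIFT above). A standalone model of that bit layout; the field widths are illustrative, the bucket_order scaling is omitted, and a 64-bit unsigned long is assumed.

/* shadow_pack_demo.c - pack memcg id, node, eviction counter and a "was workingset"
 * bit into one pointer-sized shadow entry (assumes 64-bit unsigned long) */
#include <stdio.h>

#define MEMCG_ID_BITS		16
#define NODE_BITS		6
#define EXCEPTIONAL_SHIFT	2	/* low bits reserved for radix-tree entry tagging */

static unsigned long pack_shadow(int memcgid, int nid,
				 unsigned long eviction, int workingset)
{
	unsigned long entry = eviction;

	entry = (entry << MEMCG_ID_BITS) | memcgid;
	entry = (entry << NODE_BITS) | nid;
	entry = (entry << 1) | (workingset & 1);
	return (entry << EXCEPTIONAL_SHIFT) | 1;	/* low bit marks "exceptional" */
}

static void unpack_shadow(unsigned long entry, int *memcgid, int *nid,
			  unsigned long *eviction, int *workingset)
{
	entry >>= EXCEPTIONAL_SHIFT;
	*workingset = entry & 1;
	entry >>= 1;
	*nid = entry & ((1UL << NODE_BITS) - 1);
	entry >>= NODE_BITS;
	*memcgid = entry & ((1UL << MEMCG_ID_BITS) - 1);
	entry >>= MEMCG_ID_BITS;
	*eviction = entry;
}

int main(void)
{
	unsigned long entry = pack_shadow(42, 3, 123456, 1);
	int memcgid, nid, workingset;
	unsigned long eviction;

	unpack_shadow(entry, &memcgid, &nid, &eviction, &workingset);
	printf("memcg=%d node=%d eviction=%lu workingset=%d\n",
	       memcgid, nid, eviction, workingset);
	return 0;
}
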
@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
207 */ 224 */
208void *workingset_eviction(struct address_space *mapping, struct page *page) 225void *workingset_eviction(struct address_space *mapping, struct page *page)
209{ 226{
210 struct mem_cgroup *memcg = page_memcg(page);
211 struct pglist_data *pgdat = page_pgdat(page); 227 struct pglist_data *pgdat = page_pgdat(page);
228 struct mem_cgroup *memcg = page_memcg(page);
212 int memcgid = mem_cgroup_id(memcg); 229 int memcgid = mem_cgroup_id(memcg);
213 unsigned long eviction; 230 unsigned long eviction;
214 struct lruvec *lruvec; 231 struct lruvec *lruvec;
@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
220 237
221 lruvec = mem_cgroup_lruvec(pgdat, memcg); 238 lruvec = mem_cgroup_lruvec(pgdat, memcg);
222 eviction = atomic_long_inc_return(&lruvec->inactive_age); 239 eviction = atomic_long_inc_return(&lruvec->inactive_age);
223 return pack_shadow(memcgid, pgdat, eviction); 240 return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
224} 241}
225 242
226/** 243/**
227 * workingset_refault - evaluate the refault of a previously evicted page 244 * workingset_refault - evaluate the refault of a previously evicted page
245 * @page: the freshly allocated replacement page
228 * @shadow: shadow entry of the evicted page 246 * @shadow: shadow entry of the evicted page
229 * 247 *
230 * Calculates and evaluates the refault distance of the previously 248 * Calculates and evaluates the refault distance of the previously
231 * evicted page in the context of the node it was allocated in. 249 * evicted page in the context of the node it was allocated in.
232 *
233 * Returns %true if the page should be activated, %false otherwise.
234 */ 250 */
235bool workingset_refault(void *shadow) 251void workingset_refault(struct page *page, void *shadow)
236{ 252{
237 unsigned long refault_distance; 253 unsigned long refault_distance;
254 struct pglist_data *pgdat;
238 unsigned long active_file; 255 unsigned long active_file;
239 struct mem_cgroup *memcg; 256 struct mem_cgroup *memcg;
240 unsigned long eviction; 257 unsigned long eviction;
241 struct lruvec *lruvec; 258 struct lruvec *lruvec;
242 unsigned long refault; 259 unsigned long refault;
243 struct pglist_data *pgdat; 260 bool workingset;
244 int memcgid; 261 int memcgid;
245 262
246 unpack_shadow(shadow, &memcgid, &pgdat, &eviction); 263 unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
247 264
248 rcu_read_lock(); 265 rcu_read_lock();
249 /* 266 /*
@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow)
263 * configurations instead. 280 * configurations instead.
264 */ 281 */
265 memcg = mem_cgroup_from_id(memcgid); 282 memcg = mem_cgroup_from_id(memcgid);
266 if (!mem_cgroup_disabled() && !memcg) { 283 if (!mem_cgroup_disabled() && !memcg)
267 rcu_read_unlock(); 284 goto out;
268 return false;
269 }
270 lruvec = mem_cgroup_lruvec(pgdat, memcg); 285 lruvec = mem_cgroup_lruvec(pgdat, memcg);
271 refault = atomic_long_read(&lruvec->inactive_age); 286 refault = atomic_long_read(&lruvec->inactive_age);
272 active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); 287 active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
273 288
274 /* 289 /*
275 * The unsigned subtraction here gives an accurate distance 290 * Calculate the refault distance
276 * across inactive_age overflows in most cases.
277 * 291 *
278 * There is a special case: usually, shadow entries have a 292 * The unsigned subtraction here gives an accurate distance
279 * short lifetime and are either refaulted or reclaimed along 293 * across inactive_age overflows in most cases. There is a
280 * with the inode before they get too old. But it is not 294 * special case: usually, shadow entries have a short lifetime
281 * impossible for the inactive_age to lap a shadow entry in 295 * and are either refaulted or reclaimed along with the inode
282 * the field, which can then can result in a false small 296 * before they get too old. But it is not impossible for the
283 * refault distance, leading to a false activation should this 297 * inactive_age to lap a shadow entry in the field, which can
284 * old entry actually refault again. However, earlier kernels 298 * then result in a false small refault distance, leading to a
285 * used to deactivate unconditionally with *every* reclaim 299 * false activation should this old entry actually refault
286 * invocation for the longest time, so the occasional 300 * again. However, earlier kernels used to deactivate
287 * inappropriate activation leading to pressure on the active 301 * unconditionally with *every* reclaim invocation for the
288 * list is not a problem. 302 * longest time, so the occasional inappropriate activation
303 * leading to pressure on the active list is not a problem.
289 */ 304 */
290 refault_distance = (refault - eviction) & EVICTION_MASK; 305 refault_distance = (refault - eviction) & EVICTION_MASK;
291 306
292 inc_lruvec_state(lruvec, WORKINGSET_REFAULT); 307 inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
293 308
294 if (refault_distance <= active_file) { 309 /*
295 inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); 310 * Compare the distance to the existing workingset size. We
296 rcu_read_unlock(); 311 * don't act on pages that couldn't stay resident even if all
297 return true; 312 * the memory was available to the page cache.
313 */
314 if (refault_distance > active_file)
315 goto out;
316
317 SetPageActive(page);
318 atomic_long_inc(&lruvec->inactive_age);
319 inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
320
321 /* Page was active prior to eviction */
322 if (workingset) {
323 SetPageWorkingset(page);
324 inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
298 } 325 }
326out:
299 rcu_read_unlock(); 327 rcu_read_unlock();
300 return false;
301} 328}
302 329
303/** 330/**
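
workingset_refault() now takes the replacement page and, instead of returning a verdict, activates it directly when the refault distance fits within the active file list, additionally counting a WORKINGSET_RESTORE when the page was part of the workingset before eviction. A compact model of that decision; the EVICTION_MASK and bucket_order handling are omitted.

/* refault_demo.c - model of evaluating a refault distance against the active list */
#include <stdio.h>

struct refault_decision {
	int activate;	/* distance fits in memory: promote straight to the active list */
	int restore;	/* page was active before eviction: the workingset is thrashing */
};

static struct refault_decision evaluate_refault(unsigned long inactive_age_now,
						unsigned long eviction,
						unsigned long active_file,
						int was_workingset)
{
	struct refault_decision d = { 0, 0 };
	/* Unsigned subtraction stays meaningful across counter wrap-around. */
	unsigned long refault_distance = inactive_age_now - eviction;

	if (refault_distance > active_file)
		return d;	/* would not have stayed resident anyway */

	d.activate = 1;
	d.restore = was_workingset;	/* counts toward WORKINGSET_RESTORE */
	return d;
}

int main(void)
{
	struct refault_decision d;

	d = evaluate_refault(10000, 9800, 500, 1);
	printf("distance 200 vs active 500:  activate=%d restore=%d\n", d.activate, d.restore);
	d = evaluate_refault(10000, 9000, 500, 0);
	printf("distance 1000 vs active 500: activate=%d restore=%d\n", d.activate, d.restore);
	return 0;
}
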
@@ -350,12 +377,20 @@ void workingset_update_node(struct radix_tree_node *node)
350 * already where they should be. The list_empty() test is safe 377 * already where they should be. The list_empty() test is safe
351 * as node->private_list is protected by the i_pages lock. 378 * as node->private_list is protected by the i_pages lock.
352 */ 379 */
380 VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
381
353 if (node->count && node->count == node->exceptional) { 382 if (node->count && node->count == node->exceptional) {
354 if (list_empty(&node->private_list)) 383 if (list_empty(&node->private_list)) {
355 list_lru_add(&shadow_nodes, &node->private_list); 384 list_lru_add(&shadow_nodes, &node->private_list);
385 __inc_lruvec_page_state(virt_to_page(node),
386 WORKINGSET_NODES);
387 }
356 } else { 388 } else {
357 if (!list_empty(&node->private_list)) 389 if (!list_empty(&node->private_list)) {
358 list_lru_del(&shadow_nodes, &node->private_list); 390 list_lru_del(&shadow_nodes, &node->private_list);
391 __dec_lruvec_page_state(virt_to_page(node),
392 WORKINGSET_NODES);
393 }
359 } 394 }
360} 395}
361 396
@@ -364,7 +399,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
364{ 399{
365 unsigned long max_nodes; 400 unsigned long max_nodes;
366 unsigned long nodes; 401 unsigned long nodes;
367 unsigned long cache; 402 unsigned long pages;
368 403
369 nodes = list_lru_shrink_count(&shadow_nodes, sc); 404 nodes = list_lru_shrink_count(&shadow_nodes, sc);
370 405
@@ -390,14 +425,20 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
390 * 425 *
391 * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE 426 * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
392 */ 427 */
428#ifdef CONFIG_MEMCG
393 if (sc->memcg) { 429 if (sc->memcg) {
394 cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, 430 struct lruvec *lruvec;
395 LRU_ALL_FILE); 431
396 } else { 432 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
397 cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + 433 LRU_ALL);
398 node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); 434 lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
399 } 435 pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
400 max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); 436 pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
437 } else
438#endif
439 pages = node_present_pages(sc->nid);
440
441 max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3);
401 442
402 if (!nodes) 443 if (!nodes)
403 return SHRINK_EMPTY; 444 return SHRINK_EMPTY;
@@ -440,6 +481,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
440 } 481 }
441 482
442 list_lru_isolate(lru, item); 483 list_lru_isolate(lru, item);
484 __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES);
485
443 spin_unlock(lru_lock); 486 spin_unlock(lru_lock);
444 487
445 /* 488 /*
@@ -467,7 +510,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
467 } 510 }
468 if (WARN_ON_ONCE(node->exceptional)) 511 if (WARN_ON_ONCE(node->exceptional))
469 goto out_invalid; 512 goto out_invalid;
470 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); 513 __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
471 __radix_tree_delete_node(&mapping->i_pages, node, 514 __radix_tree_delete_node(&mapping->i_pages, node,
472 workingset_lookup_update(mapping)); 515 workingset_lookup_update(mapping));
473 516
@@ -491,7 +534,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
491static struct shrinker workingset_shadow_shrinker = { 534static struct shrinker workingset_shadow_shrinker = {
492 .count_objects = count_shadow_nodes, 535 .count_objects = count_shadow_nodes,
493 .scan_objects = scan_shadow_nodes, 536 .scan_objects = scan_shadow_nodes,
494 .seeks = DEFAULT_SEEKS, 537 .seeks = 0, /* ->count reports only fully expendable nodes */
495 .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, 538 .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
496}; 539};
497 540
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9da65552e7ca..0787d33b80d8 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle,
418 case ZPOOL_MM_WO: 418 case ZPOOL_MM_WO:
419 zs_mm = ZS_MM_WO; 419 zs_mm = ZS_MM_WO;
420 break; 420 break;
421 case ZPOOL_MM_RW: /* fallthru */ 421 case ZPOOL_MM_RW: /* fall through */
422 default: 422 default:
423 zs_mm = ZS_MM_RW; 423 zs_mm = ZS_MM_RW;
424 break; 424 break;
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 26de7d5aa5c8..4fa070f9231a 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -203,7 +203,7 @@ regex_c=(
203 '/\<DECLARE_\(TASKLET\|WORK\|DELAYED_WORK\)(\([[:alnum:]_]*\)/\2/v/' 203 '/\<DECLARE_\(TASKLET\|WORK\|DELAYED_WORK\)(\([[:alnum:]_]*\)/\2/v/'
204 '/\(^\s\)OFFSET(\([[:alnum:]_]*\)/\2/v/' 204 '/\(^\s\)OFFSET(\([[:alnum:]_]*\)/\2/v/'
205 '/\(^\s\)DEFINE(\([[:alnum:]_]*\)/\2/v/' 205 '/\(^\s\)DEFINE(\([[:alnum:]_]*\)/\2/v/'
206 '/\<DEFINE_HASHTABLE(\([[:alnum:]_]*\)/\1/v/' 206 '/\<\(DEFINE\|DECLARE\)_HASHTABLE(\([[:alnum:]_]*\)/\2/v/'
207) 207)
208regex_kconfig=( 208regex_kconfig=(
209 '/^[[:blank:]]*\(menu\|\)config[[:blank:]]\+\([[:alnum:]_]\+\)/\2/' 209 '/^[[:blank:]]*\(menu\|\)config[[:blank:]]\+\([[:alnum:]_]\+\)/\2/'
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 9f420d98b5fb..8cb504d30384 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t)
203 "SWAP %15s%15s%15s\n" 203 "SWAP %15s%15s%15s\n"
204 " %15llu%15llu%15llums\n" 204 " %15llu%15llu%15llums\n"
205 "RECLAIM %12s%15s%15s\n" 205 "RECLAIM %12s%15s%15s\n"
206 " %15llu%15llu%15llums\n"
207 "THRASHING%12s%15s%15s\n"
206 " %15llu%15llu%15llums\n", 208 " %15llu%15llu%15llums\n",
207 "count", "real total", "virtual total", 209 "count", "real total", "virtual total",
208 "delay total", "delay average", 210 "delay total", "delay average",
@@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t)
222 "count", "delay total", "delay average", 224 "count", "delay total", "delay average",
223 (unsigned long long)t->freepages_count, 225 (unsigned long long)t->freepages_count,
224 (unsigned long long)t->freepages_delay_total, 226 (unsigned long long)t->freepages_delay_total,
225 average_ms(t->freepages_delay_total, t->freepages_count)); 227 average_ms(t->freepages_delay_total, t->freepages_count),
228 "count", "delay total", "delay average",
229 (unsigned long long)t->thrashing_count,
230 (unsigned long long)t->thrashing_delay_total,
231 average_ms(t->thrashing_delay_total, t->thrashing_count));
226} 232}
227 233
228static void task_context_switch_counts(struct taskstats *t) 234static void task_context_switch_counts(struct taskstats *t)
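Purely illustrative, not in the patch: the new THRASHING rows rendered as a standalone helper, using the thrashing_count and thrashing_delay_total taskstats fields that the hunk prints, averaged the same way the tool treats its other delay classes (nanosecond totals divided down to milliseconds). The helper name and the field widths here are approximate.

#include <stdio.h>
#include <linux/taskstats.h>

/* Illustrative only: mirrors the new THRASHING block added above. */
static void print_thrashing(const struct taskstats *t)
{
	unsigned long long count = t->thrashing_count;
	unsigned long long total = t->thrashing_delay_total;	/* nanoseconds */

	printf("THRASHING%12s%15s%15s\n"
	       " %15llu%15llu%15llums\n",
	       "count", "delay total", "delay average",
	       count, total,
	       count ? total / 1000000ULL / count : 0ULL);
}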
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index af5ff83f6d7f..31b3c98b6d34 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -13,3 +13,4 @@ mlock-random-test
13virtual_address_range 13virtual_address_range
14gup_benchmark 14gup_benchmark
15va_128TBswitch 15va_128TBswitch
16map_fixed_noreplace
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index e94b7b14bcb2..6e67e726e5a5 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark
12TEST_GEN_FILES += hugepage-mmap 12TEST_GEN_FILES += hugepage-mmap
13TEST_GEN_FILES += hugepage-shm 13TEST_GEN_FILES += hugepage-shm
14TEST_GEN_FILES += map_hugetlb 14TEST_GEN_FILES += map_hugetlb
15TEST_GEN_FILES += map_fixed_noreplace
15TEST_GEN_FILES += map_populate 16TEST_GEN_FILES += map_populate
16TEST_GEN_FILES += mlock-random-test 17TEST_GEN_FILES += mlock-random-test
17TEST_GEN_FILES += mlock2-tests 18TEST_GEN_FILES += mlock2-tests
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
index 36df55132036..880b96fc80d4 100644
--- a/tools/testing/selftests/vm/gup_benchmark.c
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -15,9 +15,12 @@
15#define PAGE_SIZE sysconf(_SC_PAGESIZE) 15#define PAGE_SIZE sysconf(_SC_PAGESIZE)
16 16
17#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) 17#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
18#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
19#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
18 20
19struct gup_benchmark { 21struct gup_benchmark {
20 __u64 delta_usec; 22 __u64 get_delta_usec;
23 __u64 put_delta_usec;
21 __u64 addr; 24 __u64 addr;
22 __u64 size; 25 __u64 size;
23 __u32 nr_pages_per_call; 26 __u32 nr_pages_per_call;
@@ -28,10 +31,12 @@ int main(int argc, char **argv)
28{ 31{
29 struct gup_benchmark gup; 32 struct gup_benchmark gup;
30 unsigned long size = 128 * MB; 33 unsigned long size = 128 * MB;
31 int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; 34 int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
35 int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE;
36 char *file = "/dev/zero";
32 char *p; 37 char *p;
33 38
34 while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) { 39 while ((opt = getopt(argc, argv, "m:r:n:f:tTLUSH")) != -1) {
35 switch (opt) { 40 switch (opt) {
36 case 'm': 41 case 'm':
37 size = atoi(optarg) * MB; 42 size = atoi(optarg) * MB;
@@ -48,13 +53,36 @@ int main(int argc, char **argv)
48 case 'T': 53 case 'T':
49 thp = 0; 54 thp = 0;
50 break; 55 break;
56 case 'L':
57 cmd = GUP_LONGTERM_BENCHMARK;
58 break;
59 case 'U':
60 cmd = GUP_BENCHMARK;
61 break;
51 case 'w': 62 case 'w':
52 write = 1; 63 write = 1;
64 break;
65 case 'f':
66 file = optarg;
67 break;
68 case 'S':
69 flags &= ~MAP_PRIVATE;
70 flags |= MAP_SHARED;
71 break;
72 case 'H':
73 flags |= MAP_HUGETLB;
74 break;
53 default: 75 default:
54 return -1; 76 return -1;
55 } 77 }
56 } 78 }
57 79
80 filed = open(file, O_RDWR|O_CREAT);
81 if (filed < 0) {
82 perror("open");
83 exit(filed);
84 }
85
58 gup.nr_pages_per_call = nr_pages; 86 gup.nr_pages_per_call = nr_pages;
59 gup.flags = write; 87 gup.flags = write;
60 88
@@ -62,8 +90,7 @@ int main(int argc, char **argv)
62 if (fd == -1) 90 if (fd == -1)
63 perror("open"), exit(1); 91 perror("open"), exit(1);
64 92
65 p = mmap(NULL, size, PROT_READ | PROT_WRITE, 93 p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
66 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
67 if (p == MAP_FAILED) 94 if (p == MAP_FAILED)
68 perror("mmap"), exit(1); 95 perror("mmap"), exit(1);
69 gup.addr = (unsigned long)p; 96 gup.addr = (unsigned long)p;
@@ -78,10 +105,11 @@ int main(int argc, char **argv)
78 105
79 for (i = 0; i < repeats; i++) { 106 for (i = 0; i < repeats; i++) {
80 gup.size = size; 107 gup.size = size;
81 if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) 108 if (ioctl(fd, cmd, &gup))
82 perror("ioctl"), exit(1); 109 perror("ioctl"), exit(1);
83 110
84 printf("Time: %lld us", gup.delta_usec); 111 printf("Time: get:%lld put:%lld us", gup.get_delta_usec,
112 gup.put_delta_usec);
85 if (gup.size != size) 113 if (gup.size != size)
86 printf(", truncated (size: %lld)", gup.size); 114 printf(", truncated (size: %lld)", gup.size);
87 printf("\n"); 115 printf("\n");
diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c
new file mode 100644
index 000000000000..d91bde511268
--- /dev/null
+++ b/tools/testing/selftests/vm/map_fixed_noreplace.c
@@ -0,0 +1,206 @@
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * Test that MAP_FIXED_NOREPLACE works.
5 *
6 * Copyright 2018, Jann Horn <jannh@google.com>
7 * Copyright 2018, Michael Ellerman, IBM Corporation.
8 */
9
10#include <sys/mman.h>
11#include <errno.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <unistd.h>
15
16#ifndef MAP_FIXED_NOREPLACE
17#define MAP_FIXED_NOREPLACE 0x100000
18#endif
19
20#define BASE_ADDRESS (256ul * 1024 * 1024)
21
22
23static void dump_maps(void)
24{
25 char cmd[32];
26
27 snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
28 system(cmd);
29}
30
31int main(void)
32{
33 unsigned long flags, addr, size, page_size;
34 char *p;
35
36 page_size = sysconf(_SC_PAGE_SIZE);
37
38 flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
39
40 // Check we can map all the areas we need below
41 errno = 0;
42 addr = BASE_ADDRESS;
43 size = 5 * page_size;
44 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
45
46 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
47
48 if (p == MAP_FAILED) {
49 dump_maps();
50 printf("Error: couldn't map the space we need for the test\n");
51 return 1;
52 }
53
54 errno = 0;
55 if (munmap((void *)addr, 5 * page_size) != 0) {
56 dump_maps();
57 printf("Error: munmap failed!?\n");
58 return 1;
59 }
60 printf("unmap() successful\n");
61
62 errno = 0;
63 addr = BASE_ADDRESS + page_size;
64 size = 3 * page_size;
65 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
66 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
67
68 if (p == MAP_FAILED) {
69 dump_maps();
70 printf("Error: first mmap() failed unexpectedly\n");
71 return 1;
72 }
73
74 /*
75 * Exact same mapping again:
76 * base | free | new
77 * +1 | mapped | new
78 * +2 | mapped | new
79 * +3 | mapped | new
80 * +4 | free | new
81 */
82 errno = 0;
83 addr = BASE_ADDRESS;
84 size = 5 * page_size;
85 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
86 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
87
88 if (p != MAP_FAILED) {
89 dump_maps();
90 printf("Error:1: mmap() succeeded when it shouldn't have\n");
91 return 1;
92 }
93
94 /*
95 * Second mapping contained within first:
96 *
97 * base | free |
98 * +1 | mapped |
99 * +2 | mapped | new
100 * +3 | mapped |
101 * +4 | free |
102 */
103 errno = 0;
104 addr = BASE_ADDRESS + (2 * page_size);
105 size = page_size;
106 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
107 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
108
109 if (p != MAP_FAILED) {
110 dump_maps();
111 printf("Error:2: mmap() succeeded when it shouldn't have\n");
112 return 1;
113 }
114
115 /*
116 * Overlap end of existing mapping:
117 * base | free |
118 * +1 | mapped |
119 * +2 | mapped |
120 * +3 | mapped | new
121 * +4 | free | new
122 */
123 errno = 0;
124 addr = BASE_ADDRESS + (3 * page_size);
125 size = 2 * page_size;
126 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
127 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
128
129 if (p != MAP_FAILED) {
130 dump_maps();
131 printf("Error:3: mmap() succeeded when it shouldn't have\n");
132 return 1;
133 }
134
135 /*
136 * Overlap start of existing mapping:
137 * base | free | new
138 * +1 | mapped | new
139 * +2 | mapped |
140 * +3 | mapped |
141 * +4 | free |
142 */
143 errno = 0;
144 addr = BASE_ADDRESS;
145 size = 2 * page_size;
146 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
147 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
148
149 if (p != MAP_FAILED) {
150 dump_maps();
151 printf("Error:4: mmap() succeeded when it shouldn't have\n");
152 return 1;
153 }
154
155 /*
156 * Adjacent to start of existing mapping:
157 * base | free | new
158 * +1 | mapped |
159 * +2 | mapped |
160 * +3 | mapped |
161 * +4 | free |
162 */
163 errno = 0;
164 addr = BASE_ADDRESS;
165 size = page_size;
166 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
167 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
168
169 if (p == MAP_FAILED) {
170 dump_maps();
171 printf("Error:5: mmap() failed when it shouldn't have\n");
172 return 1;
173 }
174
175 /*
176 * Adjacent to end of existing mapping:
177 * base | free |
178 * +1 | mapped |
179 * +2 | mapped |
180 * +3 | mapped |
181 * +4 | free | new
182 */
183 errno = 0;
184 addr = BASE_ADDRESS + (4 * page_size);
185 size = page_size;
186 p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
187 printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
188
189 if (p == MAP_FAILED) {
190 dump_maps();
191 printf("Error:6: mmap() failed when it shouldn't have\n");
192 return 1;
193 }
194
195 addr = BASE_ADDRESS;
196 size = 5 * page_size;
197 if (munmap((void *)addr, size) != 0) {
198 dump_maps();
199 printf("Error: munmap failed!?\n");
200 return 1;
201 }
202 printf("unmap() successful\n");
203
204 printf("OK\n");
205 return 0;
206}
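The contract this new test exercises: with MAP_FIXED_NOREPLACE the kernel either places the mapping exactly at the requested address or fails with EEXIST, instead of clobbering whatever is already mapped there, while kernels that do not know the flag silently treat the address as a plain hint. A minimal sketch of that check, not taken from the selftest, reusing the same 0x100000 fallback definition:

/* Minimal sketch, not the selftest: probe MAP_FIXED_NOREPLACE behaviour. */
#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>

#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0x100000	/* same fallback as the test above */
#endif

int main(void)
{
	void *hint = (void *)(256ul * 1024 * 1024);
	void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);

	if (p == MAP_FAILED) {
		if (errno == EEXIST) {
			/* something already lives there; nothing was clobbered */
			printf("range busy, existing mapping left intact\n");
			return 0;
		}
		perror("mmap");
		return 1;
	}
	if (p != hint) {
		/* kernel without the flag: address was treated as a plain hint */
		printf("MAP_FIXED_NOREPLACE not honoured, got %p\n", p);
		munmap(p, 4096);
		return 1;
	}
	printf("mapped exactly at %p\n", p);
	munmap(p, 4096);
	return 0;
}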
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 7b8171e3128a..5d1db824f73a 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -34,18 +34,6 @@
34 * per-CPU threads 1 by triggering userfaults inside 34 * per-CPU threads 1 by triggering userfaults inside
35 * pthread_mutex_lock will also verify the atomicity of the memory 35 * pthread_mutex_lock will also verify the atomicity of the memory
36 * transfer (UFFDIO_COPY). 36 * transfer (UFFDIO_COPY).
37 *
38 * The program takes two parameters: the amounts of physical memory in
39 * megabytes (MiB) of the area and the number of bounces to execute.
40 *
41 * # 100MiB 99999 bounces
42 * ./userfaultfd 100 99999
43 *
44 * # 1GiB 99 bounces
45 * ./userfaultfd 1000 99
46 *
47 * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
48 * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
49 */ 37 */
50 38
51#define _GNU_SOURCE 39#define _GNU_SOURCE
@@ -115,6 +103,30 @@ pthread_attr_t attr;
115 ~(unsigned long)(sizeof(unsigned long long) \ 103 ~(unsigned long)(sizeof(unsigned long long) \
116 - 1))) 104 - 1)))
117 105
106const char *examples =
107 "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
108 "./userfaultfd anon 100 99999\n\n"
109 "# Run share memory test on 1GiB region with 99 bounces:\n"
110 "./userfaultfd shmem 1000 99\n\n"
111 "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
112 "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
113 "# Run the same hugetlb test but using shmem:\n"
114 "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
115 "# 10MiB-~6GiB 999 bounces anonymous test, "
116 "continue forever unless an error triggers\n"
117 "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
118
119static void usage(void)
120{
121 fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
122 "[hugetlbfs_file]\n\n");
123 fprintf(stderr, "Supported <test type>: anon, hugetlb, "
124 "hugetlb_shared, shmem\n\n");
125 fprintf(stderr, "Examples:\n\n");
126 fprintf(stderr, examples);
127 exit(1);
128}
129
118static int anon_release_pages(char *rel_area) 130static int anon_release_pages(char *rel_area)
119{ 131{
120 int ret = 0; 132 int ret = 0;
@@ -439,6 +451,43 @@ static int copy_page(int ufd, unsigned long offset)
439 return __copy_page(ufd, offset, false); 451 return __copy_page(ufd, offset, false);
440} 452}
441 453
454static int uffd_read_msg(int ufd, struct uffd_msg *msg)
455{
456 int ret = read(uffd, msg, sizeof(*msg));
457
458 if (ret != sizeof(*msg)) {
459 if (ret < 0) {
460 if (errno == EAGAIN)
461 return 1;
462 else
463 perror("blocking read error"), exit(1);
464 } else {
465 fprintf(stderr, "short read\n"), exit(1);
466 }
467 }
468
469 return 0;
470}
471
472/* Return 1 if page fault handled by us; otherwise 0 */
473static int uffd_handle_page_fault(struct uffd_msg *msg)
474{
475 unsigned long offset;
476
477 if (msg->event != UFFD_EVENT_PAGEFAULT)
478 fprintf(stderr, "unexpected msg event %u\n",
479 msg->event), exit(1);
480
481 if (bounces & BOUNCE_VERIFY &&
482 msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
483 fprintf(stderr, "unexpected write fault\n"), exit(1);
484
485 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
486 offset &= ~(page_size-1);
487
488 return copy_page(uffd, offset);
489}
490
442static void *uffd_poll_thread(void *arg) 491static void *uffd_poll_thread(void *arg)
443{ 492{
444 unsigned long cpu = (unsigned long) arg; 493 unsigned long cpu = (unsigned long) arg;
@@ -446,7 +495,6 @@ static void *uffd_poll_thread(void *arg)
446 struct uffd_msg msg; 495 struct uffd_msg msg;
447 struct uffdio_register uffd_reg; 496 struct uffdio_register uffd_reg;
448 int ret; 497 int ret;
449 unsigned long offset;
450 char tmp_chr; 498 char tmp_chr;
451 unsigned long userfaults = 0; 499 unsigned long userfaults = 0;
452 500
@@ -470,25 +518,15 @@ static void *uffd_poll_thread(void *arg)
470 if (!(pollfd[0].revents & POLLIN)) 518 if (!(pollfd[0].revents & POLLIN))
471 fprintf(stderr, "pollfd[0].revents %d\n", 519 fprintf(stderr, "pollfd[0].revents %d\n",
472 pollfd[0].revents), exit(1); 520 pollfd[0].revents), exit(1);
473 ret = read(uffd, &msg, sizeof(msg)); 521 if (uffd_read_msg(uffd, &msg))
474 if (ret < 0) { 522 continue;
475 if (errno == EAGAIN)
476 continue;
477 perror("nonblocking read error"), exit(1);
478 }
479 switch (msg.event) { 523 switch (msg.event) {
480 default: 524 default:
481 fprintf(stderr, "unexpected msg event %u\n", 525 fprintf(stderr, "unexpected msg event %u\n",
482 msg.event), exit(1); 526 msg.event), exit(1);
483 break; 527 break;
484 case UFFD_EVENT_PAGEFAULT: 528 case UFFD_EVENT_PAGEFAULT:
485 if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) 529 userfaults += uffd_handle_page_fault(&msg);
486 fprintf(stderr, "unexpected write fault\n"), exit(1);
487 offset = (char *)(unsigned long)msg.arg.pagefault.address -
488 area_dst;
489 offset &= ~(page_size-1);
490 if (copy_page(uffd, offset))
491 userfaults++;
492 break; 530 break;
493 case UFFD_EVENT_FORK: 531 case UFFD_EVENT_FORK:
494 close(uffd); 532 close(uffd);
@@ -516,8 +554,6 @@ static void *uffd_read_thread(void *arg)
516{ 554{
517 unsigned long *this_cpu_userfaults; 555 unsigned long *this_cpu_userfaults;
518 struct uffd_msg msg; 556 struct uffd_msg msg;
519 unsigned long offset;
520 int ret;
521 557
522 this_cpu_userfaults = (unsigned long *) arg; 558 this_cpu_userfaults = (unsigned long *) arg;
523 *this_cpu_userfaults = 0; 559 *this_cpu_userfaults = 0;
@@ -526,24 +562,9 @@ static void *uffd_read_thread(void *arg)
526 /* from here cancellation is ok */ 562 /* from here cancellation is ok */
527 563
528 for (;;) { 564 for (;;) {
529 ret = read(uffd, &msg, sizeof(msg)); 565 if (uffd_read_msg(uffd, &msg))
530 if (ret != sizeof(msg)) { 566 continue;
531 if (ret < 0) 567 (*this_cpu_userfaults) += uffd_handle_page_fault(&msg);
532 perror("blocking read error"), exit(1);
533 else
534 fprintf(stderr, "short read\n"), exit(1);
535 }
536 if (msg.event != UFFD_EVENT_PAGEFAULT)
537 fprintf(stderr, "unexpected msg event %u\n",
538 msg.event), exit(1);
539 if (bounces & BOUNCE_VERIFY &&
540 msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
541 fprintf(stderr, "unexpected write fault\n"), exit(1);
542 offset = (char *)(unsigned long)msg.arg.pagefault.address -
543 area_dst;
544 offset &= ~(page_size-1);
545 if (copy_page(uffd, offset))
546 (*this_cpu_userfaults)++;
547 } 568 }
548 return (void *)NULL; 569 return (void *)NULL;
549} 570}
@@ -605,6 +626,12 @@ static int stress(unsigned long *userfaults)
605 if (uffd_test_ops->release_pages(area_src)) 626 if (uffd_test_ops->release_pages(area_src))
606 return 1; 627 return 1;
607 628
629
630 finished = 1;
631 for (cpu = 0; cpu < nr_cpus; cpu++)
632 if (pthread_join(locking_threads[cpu], NULL))
633 return 1;
634
608 for (cpu = 0; cpu < nr_cpus; cpu++) { 635 for (cpu = 0; cpu < nr_cpus; cpu++) {
609 char c; 636 char c;
610 if (bounces & BOUNCE_POLL) { 637 if (bounces & BOUNCE_POLL) {
@@ -622,11 +649,6 @@ static int stress(unsigned long *userfaults)
622 } 649 }
623 } 650 }
624 651
625 finished = 1;
626 for (cpu = 0; cpu < nr_cpus; cpu++)
627 if (pthread_join(locking_threads[cpu], NULL))
628 return 1;
629
630 return 0; 652 return 0;
631} 653}
632 654
@@ -1272,8 +1294,7 @@ static void sigalrm(int sig)
1272int main(int argc, char **argv) 1294int main(int argc, char **argv)
1273{ 1295{
1274 if (argc < 4) 1296 if (argc < 4)
1275 fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), 1297 usage();
1276 exit(1);
1277 1298
1278 if (signal(SIGALRM, sigalrm) == SIG_ERR) 1299 if (signal(SIGALRM, sigalrm) == SIG_ERR)
1279 fprintf(stderr, "failed to arm SIGALRM"), exit(1); 1300 fprintf(stderr, "failed to arm SIGALRM"), exit(1);
@@ -1286,20 +1307,19 @@ int main(int argc, char **argv)
1286 nr_cpus; 1307 nr_cpus;
1287 if (!nr_pages_per_cpu) { 1308 if (!nr_pages_per_cpu) {
1288 fprintf(stderr, "invalid MiB\n"); 1309 fprintf(stderr, "invalid MiB\n");
1289 fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); 1310 usage();
1290 } 1311 }
1291 1312
1292 bounces = atoi(argv[3]); 1313 bounces = atoi(argv[3]);
1293 if (bounces <= 0) { 1314 if (bounces <= 0) {
1294 fprintf(stderr, "invalid bounces\n"); 1315 fprintf(stderr, "invalid bounces\n");
1295 fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); 1316 usage();
1296 } 1317 }
1297 nr_pages = nr_pages_per_cpu * nr_cpus; 1318 nr_pages = nr_pages_per_cpu * nr_cpus;
1298 1319
1299 if (test_type == TEST_HUGETLB) { 1320 if (test_type == TEST_HUGETLB) {
1300 if (argc < 5) 1321 if (argc < 5)
1301 fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"), 1322 usage();
1302 exit(1);
1303 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); 1323 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1304 if (huge_fd < 0) { 1324 if (huge_fd < 0) {
1305 fprintf(stderr, "Open of %s failed", argv[3]); 1325 fprintf(stderr, "Open of %s failed", argv[3]);
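The refactor above folds the duplicated read/validate/copy code into uffd_read_msg() and uffd_handle_page_fault(). The pattern those helpers encapsulate is the usual userfaultfd service loop; a compressed sketch, with a hypothetical handle_one_fault() helper and the error handling trimmed:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>

/* Sketch: resolve one missing-page fault by copying from a source buffer. */
static int handle_one_fault(int uffd, char *area_src, char *area_dst,
			    unsigned long page_size)
{
	struct uffd_msg msg;
	struct uffdio_copy copy;
	unsigned long offset;

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;		/* EAGAIN or short read */
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return -1;		/* fork/remap events not handled in this sketch */

	offset = (unsigned long)msg.arg.pagefault.address - (unsigned long)area_dst;
	offset &= ~(page_size - 1);

	memset(&copy, 0, sizeof(copy));
	copy.dst = (unsigned long)(area_dst + offset);
	copy.src = (unsigned long)(area_src + offset);
	copy.len = page_size;
	return ioctl(uffd, UFFDIO_COPY, &copy);
}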
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 786ade1843a2..2679e476b6c3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -497,7 +497,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
497} 497}
498 498
499static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 499static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
500 .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
501 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 500 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
502 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 501 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
503 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 502 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,