authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-15 22:42:40 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-15 22:42:40 -0500
commit7c225c69f86c934e3be9be63ecde754e286838d7 (patch)
treeff2df419b0c4886b37407235f7d21215e4cf45e4
parent6363b3f3ac5be096d08c8c504128befa0c033529 (diff)
parent1b7176aea0a924ac59c6a283129d3e8eb00aa915 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - a few misc bits
 - ocfs2 updates
 - almost all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (131 commits)
  memory hotplug: fix comments when adding section
  mm: make alloc_node_mem_map a void call if we don't have CONFIG_FLAT_NODE_MEM_MAP
  mm: simplify nodemask printing
  mm,oom_reaper: remove pointless kthread_run() error check
  mm/page_ext.c: check if page_ext is not prepared
  writeback: remove unused function parameter
  mm: do not rely on preempt_count in print_vma_addr
  mm, sparse: do not swamp log with huge vmemmap allocation failures
  mm/hmm: remove redundant variable align_end
  mm/list_lru.c: mark expected switch fall-through
  mm/shmem.c: mark expected switch fall-through
  mm/page_alloc.c: broken deferred calculation
  mm: don't warn about allocations which stall for too long
  fs: fuse: account fuse_inode slab memory as reclaimable
  mm, page_alloc: fix potential false positive in __zone_watermark_ok
  mm: mlock: remove lru_add_drain_all()
  mm, sysctl: make NUMA stats configurable
  shmem: convert shmem_init_inodecache() to void
  Unify migrate_pages and move_pages access checks
  mm, pagevec: rename pagevec drained field
  ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt7
-rw-r--r--Documentation/dev-tools/index.rst1
-rw-r--r--Documentation/dev-tools/kmemcheck.rst733
-rw-r--r--Documentation/filesystems/proc.txt1
-rw-r--r--Documentation/sysctl/vm.txt25
-rw-r--r--Documentation/vm/mmu_notifier.txt93
-rw-r--r--MAINTAINERS10
-rw-r--r--arch/arm/include/asm/dma-iommu.h1
-rw-r--r--arch/arm/include/asm/pgalloc.h2
-rw-r--r--arch/arm/mm/pgd.c2
-rw-r--r--arch/arm64/Kconfig2
-rw-r--r--arch/arm64/include/asm/pgalloc.h2
-rw-r--r--arch/arm64/mm/kasan_init.c130
-rw-r--r--arch/frv/mm/init.c14
-rw-r--r--arch/h8300/mm/init.c13
-rw-r--r--arch/m32r/Kconfig4
-rw-r--r--arch/mips/include/asm/pgtable-64.h8
-rw-r--r--arch/mn10300/kernel/head.S8
-rw-r--r--arch/openrisc/include/asm/dma-mapping.h1
-rw-r--r--arch/powerpc/include/asm/pgalloc.h2
-rw-r--r--arch/powerpc/mm/hugetlbpage.c1
-rw-r--r--arch/powerpc/mm/mmu_context_book3s64.c2
-rw-r--r--arch/powerpc/mm/pgtable_64.c2
-rw-r--r--arch/s390/include/asm/mmu_context.h4
-rw-r--r--arch/sh/kernel/dwarf.c4
-rw-r--r--arch/sh/kernel/head_64.S8
-rw-r--r--arch/sh/kernel/process.c2
-rw-r--r--arch/sparc/include/asm/pgtable_64.h30
-rw-r--r--arch/sparc/mm/hugetlbpage.c3
-rw-r--r--arch/sparc/mm/init_64.c38
-rw-r--r--arch/tile/mm/homecache.c2
-rw-r--r--arch/um/kernel/mem.c3
-rw-r--r--arch/unicore32/include/asm/pgalloc.h2
-rw-r--r--arch/unicore32/mm/pgd.c2
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/include/asm/dma-mapping.h1
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/pgtable.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h13
-rw-r--r--arch/x86/include/asm/string_32.h9
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c15
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/traps.c5
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c6
-rw-r--r--arch/x86/mm/init.c8
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/kasan_init_64.c143
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c227
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c658
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c70
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c173
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h18
-rw-r--r--arch/x86/mm/pageattr.c10
-rw-r--r--arch/x86/mm/pgtable.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c2
-rw-r--r--block/blk-mq.c2
-rw-r--r--crypto/xor.c7
-rw-r--r--drivers/block/brd.c2
-rw-r--r--drivers/block/zram/zcomp.c6
-rw-r--r--drivers/block/zram/zram_drv.c18
-rw-r--r--drivers/char/random.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c2
-rw-r--r--drivers/gpu/drm/etnaviv/etnaviv_gem.c6
-rw-r--r--drivers/gpu/drm/i915/i915_gem_gtt.c2
-rw-r--r--drivers/gpu/drm/i915/i915_gem_userptr.c4
-rw-r--r--drivers/gpu/drm/radeon/radeon_ttm.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_init.c5
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c2
-rw-r--r--drivers/misc/c2port/core.c2
-rw-r--r--drivers/net/ethernet/amazon/ena/ena_netdev.c2
-rw-r--r--drivers/net/ethernet/amd/xgbe/xgbe-desc.c2
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_ring.c3
-rw-r--r--drivers/net/ethernet/cavium/liquidio/octeon_network.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_rx.c5
-rw-r--r--drivers/net/ethernet/netronome/nfp/nfp_net_common.c2
-rw-r--r--drivers/net/ethernet/qlogic/qlge/qlge_main.c3
-rw-r--r--drivers/net/ethernet/sfc/falcon/rx.c2
-rw-r--r--drivers/net/ethernet/sfc/rx.c2
-rw-r--r--drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c2
-rw-r--r--drivers/net/ethernet/ti/netcp_core.c2
-rw-r--r--drivers/net/virtio_net.c1
-rw-r--r--drivers/nvdimm/btt.c3
-rw-r--r--drivers/nvdimm/pmem.c2
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_request.c2
-rw-r--r--fs/afs/write.c15
-rw-r--r--fs/btrfs/extent_io.c23
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/cachefiles/rdwr.c10
-rw-r--r--fs/ceph/addr.c30
-rw-r--r--fs/cifs/file.c21
-rw-r--r--fs/dax.c13
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/ext4/inode.c20
-rw-r--r--fs/f2fs/checkpoint.c15
-rw-r--r--fs/f2fs/data.c11
-rw-r--r--fs/f2fs/file.c13
-rw-r--r--fs/f2fs/node.c73
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fscache/page.c2
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/fuse/inode.c6
-rw-r--r--fs/gfs2/aops.c22
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/nilfs2/btree.c6
-rw-r--r--fs/nilfs2/page.c15
-rw-r--r--fs/nilfs2/segment.c16
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c26
-rw-r--r--fs/ocfs2/buffer_head_io.h3
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/nodemanager.c63
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c1
-rw-r--r--fs/ocfs2/file.c9
-rw-r--r--fs/ocfs2/suballoc.c5
-rw-r--r--fs/ocfs2/super.c4
-rw-r--r--fs/ocfs2/super.h3
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/userfaultfd.c2
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--include/linux/backing-dev.h10
-rw-r--r--include/linux/bootmem.h27
-rw-r--r--include/linux/c2port.h4
-rw-r--r--include/linux/dma-mapping.h8
-rw-r--r--include/linux/filter.h2
-rw-r--r--include/linux/gfp.h18
-rw-r--r--include/linux/hmm.h4
-rw-r--r--include/linux/interrupt.h15
-rw-r--r--include/linux/kasan.h4
-rw-r--r--include/linux/kmemcheck.h171
-rw-r--r--include/linux/kmemleak.h8
-rw-r--r--include/linux/memblock.h24
-rw-r--r--include/linux/mm.h83
-rw-r--r--include/linux/mm_types.h19
-rw-r--r--include/linux/mmu_notifier.h20
-rw-r--r--include/linux/mmzone.h9
-rw-r--r--include/linux/net.h3
-rw-r--r--include/linux/nodemask.h4
-rw-r--r--include/linux/page-flags.h2
-rw-r--r--include/linux/page-isolation.h2
-rw-r--r--include/linux/pagemap.h26
-rw-r--r--include/linux/pagevec.h20
-rw-r--r--include/linux/radix-tree.h7
-rw-r--r--include/linux/ring_buffer.h3
-rw-r--r--include/linux/skbuff.h5
-rw-r--r--include/linux/slab.h81
-rw-r--r--include/linux/slab_def.h2
-rw-r--r--include/linux/slub_def.h2
-rw-r--r--include/linux/swap.h35
-rw-r--r--include/linux/thread_info.h5
-rw-r--r--include/linux/types.h1
-rw-r--r--include/linux/vmstat.h10
-rw-r--r--include/net/inet_sock.h3
-rw-r--r--include/net/inet_timewait_sock.h4
-rw-r--r--include/net/sock.h5
-rw-r--r--include/trace/events/kmem.h11
-rw-r--r--include/trace/events/mmflags.h2
-rw-r--r--init/Kconfig6
-rw-r--r--init/do_mounts.c3
-rw-r--r--init/main.c1
-rw-r--r--kernel/bpf/core.c6
-rw-r--r--kernel/fork.c24
-rw-r--r--kernel/locking/lockdep.c3
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/softirq.c10
-rw-r--r--kernel/sysctl.c19
-rw-r--r--kernel/trace/ring_buffer.c3
-rw-r--r--lib/Kconfig.debug6
-rw-r--r--lib/Kconfig.kmemcheck94
-rw-r--r--lib/idr.c2
-rw-r--r--lib/radix-tree.c30
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/Makefile2
-rw-r--r--mm/cma.c2
-rw-r--r--mm/debug.c5
-rw-r--r--mm/filemap.c221
-rw-r--r--mm/hmm.c3
-rw-r--r--mm/huge_memory.c78
-rw-r--r--mm/hugetlb.c16
-rw-r--r--mm/kasan/kasan.c2
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/kmemcheck.c125
-rw-r--r--mm/kmemleak.c11
-rw-r--r--mm/ksm.c15
-rw-r--r--mm/list_lru.c1
-rw-r--r--mm/memblock.c68
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c90
-rw-r--r--mm/memory_hotplug.c50
-rw-r--r--mm/mempolicy.c16
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/migrate.c15
-rw-r--r--mm/mlock.c9
-rw-r--r--mm/mmu_notifier.c11
-rw-r--r--mm/oom_kill.c60
-rw-r--r--mm/page-writeback.c47
-rw-r--r--mm/page_alloc.c465
-rw-r--r--mm/page_ext.c4
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/percpu-vm.c2
-rw-r--r--mm/rmap.c65
-rw-r--r--mm/shmem.c17
-rw-r--r--mm/slab.c45
-rw-r--r--mm/slab.h41
-rw-r--r--mm/slab_common.c59
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c67
-rw-r--r--mm/sparse-vmemmap.c34
-rw-r--r--mm/sparse.c6
-rw-r--r--mm/swap.c35
-rw-r--r--mm/swap_slots.c11
-rw-r--r--mm/swap_state.c11
-rw-r--r--mm/swapfile.c21
-rw-r--r--mm/truncate.c149
-rw-r--r--mm/vmscan.c8
-rw-r--r--mm/vmstat.c77
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/zsmalloc.c2
-rw-r--r--net/core/skbuff.c9
-rw-r--r--net/core/sock.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c3
-rw-r--r--net/ipv4/tcp_input.c1
-rw-r--r--net/rds/ib_fmr.c4
-rw-r--r--net/socket.c1
-rwxr-xr-xscripts/bloat-o-meter89
-rwxr-xr-xscripts/kernel-doc2
-rw-r--r--tools/include/linux/kmemcheck.h8
-rw-r--r--tools/perf/builtin-kmem.c2
-rw-r--r--tools/testing/radix-tree/multiorder.c2
-rw-r--r--tools/vm/slabinfo.c11
250 files changed, 2276 insertions, 4084 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b74e13312fdc..00bb04972612 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1864,13 +1864,6 @@
1864 Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, 1864 Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
1865 the default is off. 1865 the default is off.
1866 1866
1867 kmemcheck= [X86] Boot-time kmemcheck enable/disable/one-shot mode
1868 Valid arguments: 0, 1, 2
1869 kmemcheck=0 (disabled)
1870 kmemcheck=1 (enabled)
1871 kmemcheck=2 (one-shot mode)
1872 Default: 2 (one-shot mode)
1873
1874 kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. 1867 kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
1875 Default is 0 (don't ignore, but inject #GP) 1868 Default is 0 (don't ignore, but inject #GP)
1876 1869
diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst
index a81787cd47d7..e313925fb0fa 100644
--- a/Documentation/dev-tools/index.rst
+++ b/Documentation/dev-tools/index.rst
@@ -21,7 +21,6 @@ whole; patches welcome!
21 kasan 21 kasan
22 ubsan 22 ubsan
23 kmemleak 23 kmemleak
24 kmemcheck
25 gdb-kernel-debugging 24 gdb-kernel-debugging
26 kgdb 25 kgdb
27 kselftest 26 kselftest
diff --git a/Documentation/dev-tools/kmemcheck.rst b/Documentation/dev-tools/kmemcheck.rst
deleted file mode 100644
index 7f3d1985de74..000000000000
--- a/Documentation/dev-tools/kmemcheck.rst
+++ /dev/null
@@ -1,733 +0,0 @@
1Getting started with kmemcheck
2==============================
3
4Vegard Nossum <vegardno@ifi.uio.no>
5
6
7Introduction
8------------
9
10kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
11is a dynamic checker that detects and warns about some uses of uninitialized
12memory.
13
14Userspace programmers might be familiar with Valgrind's memcheck. The main
15difference between memcheck and kmemcheck is that memcheck works for userspace
16programs only, and kmemcheck works for the kernel only. The implementations
17are of course vastly different. Because of this, kmemcheck is not as accurate
18as memcheck, but it turns out to be good enough in practice to discover real
19programmer errors that the compiler is not able to find through static
20analysis.
21
22Enabling kmemcheck on a kernel will probably slow it down to the extent that
23the machine will not be usable for normal workloads such as an
24interactive desktop. kmemcheck will also cause the kernel to use about twice
25as much memory as normal. For this reason, kmemcheck is strictly a debugging
26feature.
27
28
29Downloading
30-----------
31
32As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel.
33
34
35Configuring and compiling
36-------------------------
37
38kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
39configuration variables must have specific settings in order for the kmemcheck
40menu to even appear in "menuconfig". These are:
41
42- ``CONFIG_CC_OPTIMIZE_FOR_SIZE=n``
43 This option is located under "General setup" / "Optimize for size".
44
45 Without this, gcc will use certain optimizations that usually lead to
46 false positive warnings from kmemcheck. An example of this is a 16-bit
47 field in a struct, where gcc may load 32 bits, then discard the upper
48 16 bits. kmemcheck sees only the 32-bit load, and may trigger a
49 warning for the upper 16 bits (if they're uninitialized).
50
51- ``CONFIG_SLAB=y`` or ``CONFIG_SLUB=y``
52 This option is located under "General setup" / "Choose SLAB
53 allocator".
54
55- ``CONFIG_FUNCTION_TRACER=n``
56 This option is located under "Kernel hacking" / "Tracers" / "Kernel
57 Function Tracer"
58
59 When function tracing is compiled in, gcc emits a call to another
60 function at the beginning of every function. This means that when the
61 page fault handler is called, the ftrace framework will be called
62 before kmemcheck has had a chance to handle the fault. If ftrace then
63 modifies memory that was tracked by kmemcheck, the result is an
64 endless recursive page fault.
65
66- ``CONFIG_DEBUG_PAGEALLOC=n``
67 This option is located under "Kernel hacking" / "Memory Debugging"
68 / "Debug page memory allocations".
69
70In addition, I highly recommend turning on ``CONFIG_DEBUG_INFO=y``. This is also
71located under "Kernel hacking". With this, you will be able to get line number
72information from the kmemcheck warnings, which is extremely valuable in
73debugging a problem. This option is not mandatory, however, because it slows
74down the compilation process and produces a much bigger kernel image.
75
76Now the kmemcheck menu should be visible (under "Kernel hacking" / "Memory
77Debugging" / "kmemcheck: trap use of uninitialized memory"). Here follows
78a description of the kmemcheck configuration variables:
79
80- ``CONFIG_KMEMCHECK``
81 This must be enabled in order to use kmemcheck at all...
82
83- ``CONFIG_KMEMCHECK_``[``DISABLED`` | ``ENABLED`` | ``ONESHOT``]``_BY_DEFAULT``
84 This option controls the status of kmemcheck at boot-time. "Enabled"
85 will enable kmemcheck right from the start, "disabled" will boot the
86 kernel as normal (but with the kmemcheck code compiled in, so it can
87 be enabled at run-time after the kernel has booted), and "one-shot" is
88 a special mode which will turn kmemcheck off automatically after
89 detecting the first use of uninitialized memory.
90
91 If you are using kmemcheck to actively debug a problem, then you
92 probably want to choose "enabled" here.
93
94 The one-shot mode is mostly useful in automated test setups because it
95 can prevent floods of warnings and increase the chances of the machine
96 surviving in case something is really wrong. In other cases, the one-
97 shot mode could actually be counter-productive because it would turn
98 itself off at the very first error -- in the case of a false positive
99 too -- and this would get in the way of debugging the specific
100 problem you were interested in.
101
102 If you would like to use your kernel as normal, but with a chance to
103 enable kmemcheck in case of some problem, it might be a good idea to
104 choose "disabled" here. When kmemcheck is disabled, most of the run-
105 time overhead is not incurred, and the kernel will be almost as fast
106 as normal.
107
108- ``CONFIG_KMEMCHECK_QUEUE_SIZE``
109 Select the maximum number of error reports to store in an internal
110 (fixed-size) buffer. Since errors can occur virtually anywhere and in
111 any context, we need a temporary storage area which is guaranteed not
112 to generate any other page faults when accessed. The queue will be
113 emptied as soon as a tasklet may be scheduled. If the queue is full,
114 new error reports will be lost.
115
116 The default value of 64 is probably fine. If some code produces more
117 than 64 errors within an irqs-off section, then the code is likely to
118 produce many, many more, too, and these additional reports seldom give
119 any more information (the first report is usually the most valuable
120 anyway).
121
122 This number might have to be adjusted if you are not using serial
123 console or similar to capture the kernel log. If you are using the
124 "dmesg" command to save the log, then getting a lot of kmemcheck
125 warnings might overflow the kernel log itself, and the earlier reports
126 will get lost in that way instead. Try setting this to 10 or so on
127 such a setup.
128
129- ``CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT``
130 Select the number of shadow bytes to save along with each entry of the
131 error-report queue. These bytes indicate what parts of an allocation
132 are initialized, uninitialized, etc. and will be displayed when an
133 error is detected to help the debugging of a particular problem.
134
135 The number entered here is actually the logarithm of the number of
136 bytes that will be saved. So if you pick for example 5 here, kmemcheck
137 will save 2^5 = 32 bytes.
138
139 The default value should be fine for debugging most problems. It also
140 fits nicely within 80 columns.
141
142- ``CONFIG_KMEMCHECK_PARTIAL_OK``
143 This option (when enabled) works around certain GCC optimizations that
144 produce 32-bit reads from 16-bit variables where the upper 16 bits are
145 thrown away afterwards.
146
147 The default value (enabled) is recommended. This may of course hide
148 some real errors, but disabling it would probably produce a lot of
149 false positives.
150
151- ``CONFIG_KMEMCHECK_BITOPS_OK``
152 This option silences warnings that would be generated for bit-field
153 accesses where not all the bits are initialized at the same time. This
154 may also hide some real bugs.
155
156 This option is probably obsolete, or it should be replaced with
157 the kmemcheck-/bitfield-annotations for the code in question. The
158 default value is therefore fine.
159
160Now compile the kernel as usual.
161
162
163How to use
164----------
165
166Booting
167~~~~~~~
168
169First some information about the command-line options. There is only one
170option specific to kmemcheck, and this is called "kmemcheck". It can be used
171to override the default mode as chosen by the ``CONFIG_KMEMCHECK_*_BY_DEFAULT``
172option. Its possible settings are:
173
174- ``kmemcheck=0`` (disabled)
175- ``kmemcheck=1`` (enabled)
176- ``kmemcheck=2`` (one-shot mode)
177
178If SLUB debugging has been enabled in the kernel, it may take precedence over
179kmemcheck in such a way that the slab caches which are under SLUB debugging
180will not be tracked by kmemcheck. In order to ensure that this doesn't happen
181(even though it shouldn't by default), use SLUB's boot option ``slub_debug``,
182like this: ``slub_debug=-``
183
184In fact, this option may also be used for fine-grained control over SLUB vs.
185kmemcheck. For example, if the command line includes
186``kmemcheck=1 slub_debug=,dentry``, then SLUB debugging will be used only
187for the "dentry" slab cache, and with kmemcheck tracking all the other
188caches. This is advanced usage, however, and is not generally recommended.
189
190
191Run-time enable/disable
192~~~~~~~~~~~~~~~~~~~~~~~
193
194When the kernel has booted, it is possible to enable or disable kmemcheck at
195run-time. WARNING: This feature is still experimental and may cause false
196positive warnings to appear. Therefore, try not to use this. If you find that
197it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
198will be happy to take bug reports.
199
200Use the file ``/proc/sys/kernel/kmemcheck`` for this purpose, e.g.::
201
202 $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
203
204The numbers are the same as for the ``kmemcheck=`` command-line option.
205
206
207Debugging
208~~~~~~~~~
209
210A typical report will look something like this::
211
212 WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
213 80000000000000000000000000000000000000000088ffff0000000000000000
214 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
215 ^
216
217 Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
218 RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
219 RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002
220 RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
221 RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
222 RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
223 R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
224 R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
225 FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
226 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
227 CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
228 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
229 DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
230 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
231 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
232 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
233 [<ffffffff8100c7b5>] int_signal+0x12/0x17
234 [<ffffffffffffffff>] 0xffffffffffffffff
235
236The single most valuable information in this report is the RIP (or EIP on 32-
237bit) value. This will help us pinpoint exactly which instruction caused
238the warning.
239
240If your kernel was compiled with ``CONFIG_DEBUG_INFO=y``, then all we have to do
241is give this address to the addr2line program, like this::
242
243 $ addr2line -e vmlinux -i ffffffff8104ede8
244 arch/x86/include/asm/string_64.h:12
245 include/asm-generic/siginfo.h:287
246 kernel/signal.c:380
247 kernel/signal.c:410
248
249The "``-e vmlinux``" tells addr2line which file to look in. **IMPORTANT:**
250This must be the vmlinux of the kernel that produced the warning in the
251first place! If not, the line number information will almost certainly be
252wrong.
253
254The "``-i``" tells addr2line to also print the line numbers of inlined
255functions. In this case, the flag was very important, because otherwise,
256it would only have printed the first line, which is just a call to
257``memcpy()``, which could be called from a thousand places in the kernel, and
258is therefore not very useful. These inlined functions would not show up in
259the stack trace above, simply because the kernel doesn't load the extra
260debugging information. This technique can of course be used with ordinary
261kernel oopses as well.
262
263In this case, it's the caller of ``memcpy()`` that is interesting, and it can be
264found in ``include/asm-generic/siginfo.h``, line 287::
265
266 281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
267 282 {
268 283 if (from->si_code < 0)
269 284 memcpy(to, from, sizeof(*to));
270 285 else
271 286 /* _sigchld is currently the largest know union member */
272 287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
273 288 }
274
275Since this was a read (kmemcheck usually warns about reads only, though it can
276warn about writes to unallocated or freed memory as well), it was probably the
277"from" argument which contained some uninitialized bytes. Following the chain
278of calls, we move upwards to see where "from" was allocated or initialized,
279``kernel/signal.c``, line 380::
280
281 359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
282 360 {
283 ...
284 367 list_for_each_entry(q, &list->list, list) {
285 368 if (q->info.si_signo == sig) {
286 369 if (first)
287 370 goto still_pending;
288 371 first = q;
289 ...
290 377 if (first) {
291 378 still_pending:
292 379 list_del_init(&first->list);
293 380 copy_siginfo(info, &first->info);
294 381 __sigqueue_free(first);
295 ...
296 392 }
297 393 }
298
299Here, it is ``&first->info`` that is being passed on to ``copy_siginfo()``. The
300variable ``first`` was found on a list -- passed in as the second argument to
301``collect_signal()``. We continue our journey through the stack, to figure out
302where the item on "list" was allocated or initialized. We move to line 410::
303
304 395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
305 396 siginfo_t *info)
306 397 {
307 ...
308 410 collect_signal(sig, pending, info);
309 ...
310 414 }
311
312Now we need to follow the ``pending`` pointer, since that is being passed on to
313``collect_signal()`` as ``list``. At this point, we've run out of lines from the
314"addr2line" output. Not to worry, we just paste the next addresses from the
315kmemcheck stack dump, i.e.::
316
317 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
318 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
319 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
320 [<ffffffff8100c7b5>] int_signal+0x12/0x17
321
322 $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
323 ffffffff8100b87d ffffffff8100c7b5
324 kernel/signal.c:446
325 kernel/signal.c:1806
326 arch/x86/kernel/signal.c:805
327 arch/x86/kernel/signal.c:871
328 arch/x86/kernel/entry_64.S:694
329
330Remember that since these addresses were found on the stack and not as the
331RIP value, they actually point to the _next_ instruction (they are return
332addresses). This becomes obvious when we look at the code for line 446::
333
334 422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
335 423 {
336 ...
337 431 signr = __dequeue_signal(&tsk->signal->shared_pending,
338 432 mask, info);
339 433 /*
340 434 * itimer signal ?
341 435 *
342 436 * itimers are process shared and we restart periodic
343 437 * itimers in the signal delivery path to prevent DoS
344 438 * attacks in the high resolution timer case. This is
345 439 * compliant with the old way of self restarting
346 440 * itimers, as the SIGALRM is a legacy signal and only
347 441 * queued once. Changing the restart behaviour to
348 442 * restart the timer in the signal dequeue path is
349 443 * reducing the timer noise on heavy loaded !highres
350 444 * systems too.
351 445 */
352 446 if (unlikely(signr == SIGALRM)) {
353 ...
354 489 }
355
356So instead of looking at 446, we should be looking at 431, which is the line
357that executes just before 446. Here we see that what we are looking for is
358``&tsk->signal->shared_pending``.
359
360Our next task is now to figure out which function puts items on this
361``shared_pending`` list. A crude but efficient tool is ``git grep``::
362
363 $ git grep -n 'shared_pending' kernel/
364 ...
365 kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending;
366 kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending;
367 ...
368
369There were more results, but none of them were related to list operations,
370and these were the only assignments. We inspect the line numbers more closely
371and find that this is indeed where items are being added to the list::
372
373 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
374 817 int group)
375 818 {
376 ...
377 828 pending = group ? &t->signal->shared_pending : &t->pending;
378 ...
379 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
380 852 (is_si_special(info) ||
381 853 info->si_code >= 0)));
382 854 if (q) {
383 855 list_add_tail(&q->list, &pending->list);
384 ...
385 890 }
386
387and::
388
389 1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
390 1310 {
391 ....
392 1339 pending = group ? &t->signal->shared_pending : &t->pending;
393 1340 list_add_tail(&q->list, &pending->list);
394 ....
395 1347 }
396
397In the first case, the list element we are looking for, ``q``, is being
398returned from the function ``__sigqueue_alloc()``, which looks like an
399allocation function. Let's take a look at it::
400
401 187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
402 188 int override_rlimit)
403 189 {
404 190 struct sigqueue *q = NULL;
405 191 struct user_struct *user;
406 192
407 193 /*
408 194 * We won't get problems with the target's UID changing under us
409 195 * because changing it requires RCU be used, and if t != current, the
410 196 * caller must be holding the RCU readlock (by way of a spinlock) and
411 197 * we use RCU protection here
412 198 */
413 199 user = get_uid(__task_cred(t)->user);
414 200 atomic_inc(&user->sigpending);
415 201 if (override_rlimit ||
416 202 atomic_read(&user->sigpending) <=
417 203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
418 204 q = kmem_cache_alloc(sigqueue_cachep, flags);
419 205 if (unlikely(q == NULL)) {
420 206 atomic_dec(&user->sigpending);
421 207 free_uid(user);
422 208 } else {
423 209 INIT_LIST_HEAD(&q->list);
424 210 q->flags = 0;
425 211 q->user = user;
426 212 }
427 213
428 214 return q;
429 215 }
430
431We see that this function initializes ``q->list``, ``q->flags``, and
432``q->user``. It seems that now is the time to look at the definition of
433``struct sigqueue``, e.g.::
434
435 14 struct sigqueue {
436 15 struct list_head list;
437 16 int flags;
438 17 siginfo_t info;
439 18 struct user_struct *user;
440 19 };
441
442And, you might remember, it was a ``memcpy()`` on ``&first->info`` that
443caused the warning, so this makes perfect sense. It also seems reasonable
444to assume that it is the caller of ``__sigqueue_alloc()`` that has the
445responsibility of filling out (initializing) this member.
446
447But just which fields of the struct were uninitialized? Let's look at
448kmemcheck's report again::
449
450 WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
451 80000000000000000000000000000000000000000088ffff0000000000000000
452 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
453 ^
454
455These first two lines are the memory dump of the memory object itself, and
456the shadow bytemap, respectively. The memory object itself is in this case
457``&first->info``. Just beware that the start of this dump is NOT the start
458of the object itself! The position of the caret (^) corresponds with the
459address of the read (ffff88003e4a2024).
460
461The shadow bytemap dump legend is as follows:
462
463- i: initialized
464- u: uninitialized
465- a: unallocated (memory has been allocated by the slab layer, but has not
466 yet been handed off to anybody)
467- f: freed (memory has been allocated by the slab layer, but has been freed
468 by the previous owner)
469
470In order to figure out where (relative to the start of the object) the
471uninitialized memory was located, we have to look at the disassembly. For
472that, we'll need the RIP address again::
473
474 RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
475
476 $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8:
477 ffffffff8104edc8: mov %r8,0x8(%r8)
478 ffffffff8104edcc: test %r10d,%r10d
479 ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168>
480 ffffffff8104edd5: mov %rax,%rdx
481 ffffffff8104edd8: mov $0xc,%ecx
482 ffffffff8104eddd: mov %r13,%rdi
483 ffffffff8104ede0: mov $0x30,%eax
484 ffffffff8104ede5: mov %rdx,%rsi
485 ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi)
486 ffffffff8104edea: test $0x2,%al
487 ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0>
488 ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi)
489 ffffffff8104edf0: test $0x1,%al
490 ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5>
491 ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi)
492 ffffffff8104edf5: mov %r8,%rdi
493 ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free>
494
495As expected, it's the "``rep movsl``" instruction from the ``memcpy()``
496that causes the warning. We know that ``REP MOVSL`` uses the register
497``RCX`` to count the number of remaining iterations. By taking a look at the
498register dump again (from the kmemcheck report), we can figure out how many
499bytes were left to copy::
500
501 RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
502
503By looking at the disassembly, we also see that ``%ecx`` is being loaded
504with the value ``$0xc`` just before (ffffffff8104edd8), so we are very
505lucky. Keep in mind that this is the number of iterations, not bytes. And
506since this is a "long" operation, we need to multiply by 4 to get the
507number of bytes. So this means that the uninitialized value was encountered
508at 4 * (0xc - 0x9) = 12 bytes from the start of the object.
509
510We can now try to figure out which field of the "``struct siginfo``"
511was not initialized. This is the beginning of the struct::
512
513 40 typedef struct siginfo {
514 41 int si_signo;
515 42 int si_errno;
516 43 int si_code;
517 44
518 45 union {
519 ..
520 92 } _sifields;
521 93 } siginfo_t;
522
523On 64-bit, the int is 4 bytes long, so it must be the union member that has
524not been initialized. We can verify this using gdb::
525
526 $ gdb vmlinux
527 ...
528 (gdb) p &((struct siginfo *) 0)->_sifields
529 $1 = (union {...} *) 0x10
530
531Actually, it seems that the union member is located at offset 0x10 -- which
532means that gcc has inserted 4 bytes of padding between the members ``si_code``
533and ``_sifields``. We can now get a fuller picture of the memory dump::
534
535 _----------------------------=> si_code
536 / _--------------------=> (padding)
537 | / _------------=> _sifields(._kill._pid)
538 | | / _----=> _sifields(._kill._uid)
539 | | | /
540 -------|-------|-------|-------|
541 80000000000000000000000000000000000000000088ffff0000000000000000
542 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
543
544This allows us to realize another important fact: ``si_code`` contains the
545value 0x80. Remember that x86 is little endian, so the first 4 bytes
546"80000000" are really the number 0x00000080. With a bit of research, we
547find that this is actually the constant ``SI_KERNEL`` defined in
548``include/asm-generic/siginfo.h``::
549
550 144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
551
552This macro is used in exactly one place in the x86 kernel: In ``send_signal()``
553in ``kernel/signal.c``::
554
555 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
556 817 int group)
557 818 {
558 ...
559 828 pending = group ? &t->signal->shared_pending : &t->pending;
560 ...
561 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
562 852 (is_si_special(info) ||
563 853 info->si_code >= 0)));
564 854 if (q) {
565 855 list_add_tail(&q->list, &pending->list);
566 856 switch ((unsigned long) info) {
567 ...
568 865 case (unsigned long) SEND_SIG_PRIV:
569 866 q->info.si_signo = sig;
570 867 q->info.si_errno = 0;
571 868 q->info.si_code = SI_KERNEL;
572 869 q->info.si_pid = 0;
573 870 q->info.si_uid = 0;
574 871 break;
575 ...
576 890 }
577
578Not only does this match with the ``.si_code`` member, it also matches the place
579we found earlier when looking for where siginfo_t objects are enqueued on the
580``shared_pending`` list.
581
582So to sum up: It seems that it is the padding introduced by the compiler
583between two struct fields that is uninitialized, and this gets reported when
584we do a ``memcpy()`` on the struct. This means that we have identified a false
585positive warning.
586
587Normally, kmemcheck will not report uninitialized accesses in ``memcpy()`` calls
588when both the source and destination addresses are tracked. (Instead, we copy
589the shadow bytemap as well). In this case, the destination address clearly
590was not tracked. We can dig a little deeper into the stack trace from above::
591
592 arch/x86/kernel/signal.c:805
593 arch/x86/kernel/signal.c:871
594 arch/x86/kernel/entry_64.S:694
595
596And we clearly see that the destination siginfo object is located on the
597stack::
598
599 782 static void do_signal(struct pt_regs *regs)
600 783 {
601 784 struct k_sigaction ka;
602 785 siginfo_t info;
603 ...
604 804 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
605 ...
606 854 }
607
608And this ``&info`` is what eventually gets passed to ``copy_siginfo()`` as the
609destination argument.
610
611Now, even though we didn't find an actual error here, the example is still a
612good one, because it shows how one would go about finding out what the report
613was all about.
614
615
616Annotating false positives
617~~~~~~~~~~~~~~~~~~~~~~~~~~
618
619There are a few different ways to make annotations in the source code that
620will keep kmemcheck from checking and reporting certain allocations. Here
621they are:
622
623- ``__GFP_NOTRACK_FALSE_POSITIVE``
624 This flag can be passed to ``kmalloc()`` or ``kmem_cache_alloc()``
625 (therefore also to other functions that end up calling one of
626 these) to indicate that the allocation should not be tracked
627 because it would lead to a false positive report. This is a "big
628 hammer" way of silencing kmemcheck; after all, even if the false
629 positive pertains to particular field in a struct, for example, we
630 will now lose the ability to find (real) errors in other parts of
631 the same struct.
632
633 Example::
634
635 /* No warnings will ever trigger on accessing any part of x */
636 x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
637
638- ``kmemcheck_bitfield_begin(name)``/``kmemcheck_bitfield_end(name)`` and
639 ``kmemcheck_annotate_bitfield(ptr, name)``
640 The first two of these three macros can be used inside struct
641 definitions to signal, respectively, the beginning and end of a
642 bitfield. Additionally, this will assign the bitfield a name, which
643 is given as an argument to the macros.
644
645 Having used these markers, one can later use
646 kmemcheck_annotate_bitfield() at the point of allocation, to indicate
647 which parts of the allocation are part of a bitfield.
648
649 Example::
650
651 struct foo {
652 int x;
653
654 kmemcheck_bitfield_begin(flags);
655 int flag_a:1;
656 int flag_b:1;
657 kmemcheck_bitfield_end(flags);
658
659 int y;
660 };
661
662 struct foo *x = kmalloc(sizeof *x, GFP_KERNEL);
663
664 /* No warnings will trigger on accessing the bitfield of x */
665 kmemcheck_annotate_bitfield(x, flags);
666
667 Note that ``kmemcheck_annotate_bitfield()`` can be used even before the
668 return value of ``kmalloc()`` is checked -- in other words, passing NULL
669 as the first argument is legal (and will do nothing).
670
671
672Reporting errors
673----------------
674
675As we have seen, kmemcheck will produce false positive reports. Therefore, it
676is not very wise to blindly post kmemcheck warnings to mailing lists and
677maintainers. Instead, I encourage maintainers and developers to find errors
678in their own code. If you get a warning, you can try to work around it, try
679to figure out if it's a real error or not, or simply ignore it. Most
680developers know their own code and will quickly and efficiently determine the
681root cause of a kmemcheck report. This is therefore also the most efficient
682way to work with kmemcheck.
683
684That said, we (the kmemcheck maintainers) will always be on the lookout for
685false positives that we can annotate and silence. So whatever you find,
686please drop us a note privately! Kernel configs and steps to reproduce (if
687available) are of course a great help too.
688
689Happy hacking!
690
691
692Technical description
693---------------------
694
695kmemcheck works by marking memory pages non-present. This means that whenever
696somebody attempts to access the page, a page fault is generated. The page
697fault handler notices that the page was in fact only hidden, and so it calls
698on the kmemcheck code to make further investigations.
699
700When the investigations are completed, kmemcheck "shows" the page by marking
701it present (as it would be under normal circumstances). This way, the
702interrupted code can continue as usual.
703
704But after the instruction has been executed, we should hide the page again, so
705that we can catch the next access too! Now kmemcheck makes use of a debugging
706feature of the processor, namely single-stepping. When the processor has
707finished the one instruction that generated the memory access, a debug
708exception is raised. From here, we simply hide the page again and continue
709execution, this time with the single-stepping feature turned off.
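[Editorial note, not part of the removed document or of this patch: the hide/fault/single-step cycle described above can be sketched roughly as follows. The kmemcheck_* helper names used here (kmemcheck_is_tracked(), kmemcheck_record_access(), kmemcheck_show_page(), kmemcheck_hide_page()) are illustrative placeholders and are not claimed to match the removed x86 implementation.]::

    /*
     * Illustrative sketch only -- the kmemcheck_* helpers below are
     * hypothetical placeholders, not the removed arch/x86/mm/kmemcheck code.
     */
    #include <linux/types.h>                /* bool */
    #include <linux/ptrace.h>               /* struct pt_regs */
    #include <asm/processor-flags.h>        /* X86_EFLAGS_TF */

    /* Called from the page fault handler; true means kmemcheck handled it. */
    static bool sketch_kmemcheck_fault(struct pt_regs *regs, unsigned long address)
    {
            if (!kmemcheck_is_tracked(address))
                    return false;                   /* not a hidden page: normal #PF path */

            kmemcheck_record_access(regs, address); /* check/update the shadow bytes */
            kmemcheck_show_page(address);           /* temporarily mark the page present */
            regs->flags |= X86_EFLAGS_TF;           /* single-step the faulting insn */
            return true;
    }

    /* Called from the debug (#DB) exception once the single step completes. */
    static void sketch_kmemcheck_trap(struct pt_regs *regs, unsigned long address)
    {
            kmemcheck_hide_page(address);           /* hide the page again for the next access */
            regs->flags &= ~X86_EFLAGS_TF;          /* stop single-stepping */
    }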
710
711kmemcheck requires some assistance from the memory allocator in order to work.
712The memory allocator needs to
713
714 1. Tell kmemcheck about newly allocated pages and pages that are about to
715 be freed. This allows kmemcheck to set up and tear down the shadow memory
716 for the pages in question. The shadow memory stores the status of each
717 byte in the allocation proper, e.g. whether it is initialized or
718 uninitialized.
719
720 2. Tell kmemcheck which parts of memory should be marked uninitialized.
721 There are actually a few more states, such as "not yet allocated" and
722 "recently freed".
723
724If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
725memory that can take page faults because of kmemcheck.
726
727If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
728request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags.
729This does not prevent the page faults from occurring, however, but marks the
730object in question as being initialized so that no warnings will ever be
731produced for this object.
732
733Currently, the SLAB and SLUB allocators are supported by kmemcheck.
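[Editorial note: as a rough sketch of the allocator cooperation described above (points 1 and 2), a page-allocation path would call the hooks from the include/linux/kmemcheck.h header that this series removes. The hook names follow that interface, but the exact signatures here are an assumption, not a quote from the patch.]::

    #include <linux/gfp.h>
    #include <linux/mm_types.h>

    static struct page *sketch_alloc_tracked_pages(gfp_t flags,
                                                   unsigned int order, int node)
    {
            struct page *page = alloc_pages_node(node, flags, order);

            if (!page)
                    return NULL;

            if (!(flags & __GFP_NOTRACK)) {
                    /* 1. set up shadow memory for the newly allocated pages */
                    kmemcheck_alloc_shadow(page, order, flags, node);
                    /* 2. mark their contents as not-yet-initialized */
                    kmemcheck_mark_uninitialized_pages(page, 1 << order);
            }

            return page;
    }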
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index adba21b5ada7..ec571b9bb18a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8)
250 VmExe size of text segment 250 VmExe size of text segment
251 VmLib size of shared library code 251 VmLib size of shared library code
252 VmPTE size of page table entries 252 VmPTE size of page table entries
253 VmPMD size of second level page tables
254 VmSwap amount of swap used by anonymous private data 253 VmSwap amount of swap used by anonymous private data
255 (shmem swap usage is not included) 254 (shmem swap usage is not included)
256 HugetlbPages size of hugetlb memory portions 255 HugetlbPages size of hugetlb memory portions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a9ef4e..055c8b3e1018 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm:
58- percpu_pagelist_fraction 58- percpu_pagelist_fraction
59- stat_interval 59- stat_interval
60- stat_refresh 60- stat_refresh
61- numa_stat
61- swappiness 62- swappiness
62- user_reserve_kbytes 63- user_reserve_kbytes
63- vfs_cache_pressure 64- vfs_cache_pressure
@@ -157,6 +158,10 @@ Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
157value lower than this limit will be ignored and the old configuration will be 158value lower than this limit will be ignored and the old configuration will be
158retained. 159retained.
159 160
161Note: the value of dirty_bytes must also be set greater than
162dirty_background_bytes or the amount of memory corresponding to
163dirty_background_ratio.
164
160============================================================== 165==============================================================
161 166
162dirty_expire_centisecs 167dirty_expire_centisecs
@@ -176,6 +181,9 @@ generating disk writes will itself start writing out dirty data.
176 181
177The total available memory is not equal to total system memory. 182The total available memory is not equal to total system memory.
178 183
184Note: dirty_ratio must be set greater than dirty_background_ratio or
185the ratio corresponding to dirty_background_bytes.
186
179============================================================== 187==============================================================
180 188
181dirty_writeback_centisecs 189dirty_writeback_centisecs
@@ -622,7 +630,7 @@ oom_dump_tasks
622 630
623Enables a system-wide task dump (excluding kernel threads) to be produced 631Enables a system-wide task dump (excluding kernel threads) to be produced
624when the kernel performs an OOM-killing and includes such information as 632when the kernel performs an OOM-killing and includes such information as
625pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj 633pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
626score, and name. This is helpful to determine why the OOM killer was 634score, and name. This is helpful to determine why the OOM killer was
627invoked, to identify the rogue task that caused it, and to determine why 635invoked, to identify the rogue task that caused it, and to determine why
628the OOM killer chose the task it did to kill. 636the OOM killer chose the task it did to kill.
@@ -792,6 +800,21 @@ with no ill effects: errors and warnings on these stats are suppressed.)
792 800
793============================================================== 801==============================================================
794 802
803numa_stat
804
805This interface allows runtime configuration of numa statistics.
806
807When page allocation performance becomes a bottleneck and you can tolerate
808some possible tool breakage and decreased numa counter precision, you can
809do:
810 echo 0 > /proc/sys/vm/numa_stat
811
812When page allocation performance is not a bottleneck and you want all
813tooling to work, you can do:
814 echo 1 > /proc/sys/vm/numa_stat
815
816==============================================================
817
795swappiness 818swappiness
796 819
797This control is used to define how aggressive the kernel will swap 820This control is used to define how aggressive the kernel will swap
diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt
new file mode 100644
index 000000000000..23b462566bb7
--- /dev/null
+++ b/Documentation/vm/mmu_notifier.txt
@@ -0,0 +1,93 @@
1When do you need to notify inside the page table lock?
2
3When clearing a pte/pmd we are given a choice to notify the event through the
4notify version of *_clear_flush (which calls mmu_notifier_invalidate_range)
5under the page table lock. But that notification is not necessary in all cases.
6
7For a secondary TLB (non-CPU TLB) like an IOMMU TLB or a device TLB (when a
8device uses something like ATS/PASID to get the IOMMU to walk the CPU page table
9to access a process virtual address space), there are only 2 cases when you need
10to notify those secondary TLBs while holding the page table lock when clearing a pte/pmd:
11
12 A) the page backing the address is freed before mmu_notifier_invalidate_range_end()
13 B) a page table entry is updated to point to a new page (COW, write fault
14 on zero page, __replace_page(), ...)
15
16Case A is obvious: you do not want to take the risk of the device writing to
17a page that might now be used by some completely different task.
18
19Case B is more subtle. For correctness it requires the following sequence to
20happen:
21 - take page table lock
22 - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
23 - set page table entry to point to new page
24
25If clearing the page table entry is not followed by a notify before setting
26the new pte/pmd value, then you can break a memory model like C11 or C++11 for
27the device.
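[Editorial sketch, not part of the patch: the case B ordering above, using the notify variant of the clear-flush helper so the secondary TLB is invalidated before the new entry becomes visible. "vma", "addr", "ptep", "new_page" and "ptl" are assumed to be provided by the surrounding fault handler.]::

    #include <linux/mm.h>
    #include <linux/mmu_notifier.h>
    #include <linux/spinlock.h>

    static void sketch_replace_page(struct vm_area_struct *vma, unsigned long addr,
                                    pte_t *ptep, struct page *new_page,
                                    spinlock_t *ptl)
    {
            pte_t entry = mk_pte(new_page, vma->vm_page_prot);

            spin_lock(ptl);                            /* take page table lock */
            ptep_clear_flush_notify(vma, addr, ptep);  /* clear old entry + notify secondary TLBs */
            set_pte_at(vma->vm_mm, addr, ptep, entry); /* only now point to the new page */
            spin_unlock(ptl);
    }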
28
29Consider the following scenario (the device uses a feature similar to ATS/PASID):
30
31Take two addresses addrA and addrB such that |addrA - addrB| >= PAGE_SIZE; we
32assume they are write protected for COW (other cases of B apply too).
33
34[Time N] --------------------------------------------------------------------
35CPU-thread-0 {try to write to addrA}
36CPU-thread-1 {try to write to addrB}
37CPU-thread-2 {}
38CPU-thread-3 {}
39DEV-thread-0 {read addrA and populate device TLB}
40DEV-thread-2 {read addrB and populate device TLB}
41[Time N+1] ------------------------------------------------------------------
42CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
43CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
44CPU-thread-2 {}
45CPU-thread-3 {}
46DEV-thread-0 {}
47DEV-thread-2 {}
48[Time N+2] ------------------------------------------------------------------
49CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}}
50CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}}
51CPU-thread-2 {}
52CPU-thread-3 {}
53DEV-thread-0 {}
54DEV-thread-2 {}
55[Time N+3] ------------------------------------------------------------------
56CPU-thread-0 {preempted}
57CPU-thread-1 {preempted}
58CPU-thread-2 {write to addrA which is a write to new page}
59CPU-thread-3 {}
60DEV-thread-0 {}
61DEV-thread-2 {}
62[Time N+3] ------------------------------------------------------------------
63CPU-thread-0 {preempted}
64CPU-thread-1 {preempted}
65CPU-thread-2 {}
66CPU-thread-3 {write to addrB which is a write to new page}
67DEV-thread-0 {}
68DEV-thread-2 {}
69[Time N+4] ------------------------------------------------------------------
70CPU-thread-0 {preempted}
71CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
72CPU-thread-2 {}
73CPU-thread-3 {}
74DEV-thread-0 {}
75DEV-thread-2 {}
76[Time N+5] ------------------------------------------------------------------
77CPU-thread-0 {preempted}
78CPU-thread-1 {}
79CPU-thread-2 {}
80CPU-thread-3 {}
81DEV-thread-0 {read addrA from old page}
82DEV-thread-2 {read addrB from new page}
83
84So here, because at time N+2 the page table entry clear was not paired with a
85notification to invalidate the secondary TLB, the device sees the new value for
86addrB before seeing the new value for addrA. This breaks total memory ordering
87for the device.
88
89When changing a pte to write protect it, or to point to a new write-protected
90page with the same content (KSM), it is fine to delay the mmu_notifier_invalidate_range
91call to mmu_notifier_invalidate_range_end() outside the page table lock. This
92is true even if the thread doing the page table update is preempted right after
93releasing the page table lock but before calling mmu_notifier_invalidate_range_end().
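[Editorial sketch, not part of the patch: the deferred case described in the last paragraph -- when a pte is merely write protected (or repointed at an identical page, as KSM does), the range invalidation can be left to mmu_notifier_invalidate_range_end() outside the page table lock. All caller-provided names are assumptions.]::

    #include <linux/mm.h>
    #include <linux/mmu_notifier.h>
    #include <linux/spinlock.h>

    static void sketch_write_protect(struct mm_struct *mm, struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep, spinlock_t *ptl)
    {
            pte_t entry;

            mmu_notifier_invalidate_range_start(mm, addr, addr + PAGE_SIZE);

            spin_lock(ptl);
            entry = ptep_clear_flush(vma, addr, ptep); /* no _notify variant needed here */
            entry = pte_wrprotect(entry);
            set_pte_at(mm, addr, ptep, entry);
            spin_unlock(ptl);

            /* the deferred invalidate happens here, after the lock is dropped */
            mmu_notifier_invalidate_range_end(mm, addr, addr + PAGE_SIZE);
    }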
diff --git a/MAINTAINERS b/MAINTAINERS
index cd7e12dc6af4..b0543c223f6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7692,16 +7692,6 @@ F: include/linux/kdb.h
7692F: include/linux/kgdb.h 7692F: include/linux/kgdb.h
7693F: kernel/debug/ 7693F: kernel/debug/
7694 7694
7695KMEMCHECK
7696M: Vegard Nossum <vegardno@ifi.uio.no>
7697M: Pekka Enberg <penberg@kernel.org>
7698S: Maintained
7699F: Documentation/dev-tools/kmemcheck.rst
7700F: arch/x86/include/asm/kmemcheck.h
7701F: arch/x86/mm/kmemcheck/
7702F: include/linux/kmemcheck.h
7703F: mm/kmemcheck.c
7704
7705KMEMLEAK 7695KMEMLEAK
7706M: Catalin Marinas <catalin.marinas@arm.com> 7696M: Catalin Marinas <catalin.marinas@arm.com>
7707S: Maintained 7697S: Maintained
diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h
index 0722ec6be692..6821f1249300 100644
--- a/arch/arm/include/asm/dma-iommu.h
+++ b/arch/arm/include/asm/dma-iommu.h
@@ -7,7 +7,6 @@
7#include <linux/mm_types.h> 7#include <linux/mm_types.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9#include <linux/dma-debug.h> 9#include <linux/dma-debug.h>
10#include <linux/kmemcheck.h>
11#include <linux/kref.h> 10#include <linux/kref.h>
12 11
13#define ARM_MAPPING_ERROR (~(dma_addr_t)0x0) 12#define ARM_MAPPING_ERROR (~(dma_addr_t)0x0)
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index b2902a5cd780..2d7344f0e208 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
57extern pgd_t *pgd_alloc(struct mm_struct *mm); 57extern pgd_t *pgd_alloc(struct mm_struct *mm);
58extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); 58extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
59 59
60#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 60#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
61 61
62static inline void clean_pte_table(pte_t *pte) 62static inline void clean_pte_table(pte_t *pte)
63{ 63{
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index c1c1a5c67da1..61e281cb29fb 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base)
141 pte = pmd_pgtable(*pmd); 141 pte = pmd_pgtable(*pmd);
142 pmd_clear(pmd); 142 pmd_clear(pmd);
143 pte_free(mm, pte); 143 pte_free(mm, pte);
144 atomic_long_dec(&mm->nr_ptes); 144 mm_dec_nr_ptes(mm);
145no_pmd: 145no_pmd:
146 pud_clear(pud); 146 pud_clear(pud);
147 pmd_free(mm, pmd); 147 pmd_free(mm, pmd);
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index ba6aab55d464..a93339f5178f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -85,7 +85,7 @@ config ARM64
85 select HAVE_ARCH_BITREVERSE 85 select HAVE_ARCH_BITREVERSE
86 select HAVE_ARCH_HUGE_VMAP 86 select HAVE_ARCH_HUGE_VMAP
87 select HAVE_ARCH_JUMP_LABEL 87 select HAVE_ARCH_JUMP_LABEL
88 select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) 88 select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
89 select HAVE_ARCH_KGDB 89 select HAVE_ARCH_KGDB
90 select HAVE_ARCH_MMAP_RND_BITS 90 select HAVE_ARCH_MMAP_RND_BITS
91 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT 91 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index d25f4f137c2a..5ca6a573a701 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -26,7 +26,7 @@
26 26
27#define check_pgt_cache() do { } while (0) 27#define check_pgt_cache() do { } while (0)
28 28
29#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 29#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
30#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) 30#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
31 31
32#if CONFIG_PGTABLE_LEVELS > 2 32#if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 81f03959a4ab..acba49fb5aac 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#define pr_fmt(fmt) "kasan: " fmt 13#define pr_fmt(fmt) "kasan: " fmt
14#include <linux/bootmem.h>
14#include <linux/kasan.h> 15#include <linux/kasan.h>
15#include <linux/kernel.h> 16#include <linux/kernel.h>
16#include <linux/sched/task.h> 17#include <linux/sched/task.h>
@@ -35,77 +36,117 @@ static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
35 * with the physical address from __pa_symbol. 36 * with the physical address from __pa_symbol.
36 */ 37 */
37 38
38static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, 39static phys_addr_t __init kasan_alloc_zeroed_page(int node)
39 unsigned long end)
40{ 40{
41 pte_t *pte; 41 void *p = memblock_virt_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
42 unsigned long next; 42 __pa(MAX_DMA_ADDRESS),
43 MEMBLOCK_ALLOC_ACCESSIBLE, node);
44 return __pa(p);
45}
46
47static pte_t *__init kasan_pte_offset(pmd_t *pmd, unsigned long addr, int node,
48 bool early)
49{
50 if (pmd_none(*pmd)) {
51 phys_addr_t pte_phys = early ? __pa_symbol(kasan_zero_pte)
52 : kasan_alloc_zeroed_page(node);
53 __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
54 }
55
56 return early ? pte_offset_kimg(pmd, addr)
57 : pte_offset_kernel(pmd, addr);
58}
43 59
44 if (pmd_none(*pmd)) 60static pmd_t *__init kasan_pmd_offset(pud_t *pud, unsigned long addr, int node,
45 __pmd_populate(pmd, __pa_symbol(kasan_zero_pte), PMD_TYPE_TABLE); 61 bool early)
62{
63 if (pud_none(*pud)) {
64 phys_addr_t pmd_phys = early ? __pa_symbol(kasan_zero_pmd)
65 : kasan_alloc_zeroed_page(node);
66 __pud_populate(pud, pmd_phys, PMD_TYPE_TABLE);
67 }
68
69 return early ? pmd_offset_kimg(pud, addr) : pmd_offset(pud, addr);
70}
71
72static pud_t *__init kasan_pud_offset(pgd_t *pgd, unsigned long addr, int node,
73 bool early)
74{
75 if (pgd_none(*pgd)) {
76 phys_addr_t pud_phys = early ? __pa_symbol(kasan_zero_pud)
77 : kasan_alloc_zeroed_page(node);
78 __pgd_populate(pgd, pud_phys, PMD_TYPE_TABLE);
79 }
80
81 return early ? pud_offset_kimg(pgd, addr) : pud_offset(pgd, addr);
82}
83
84static void __init kasan_pte_populate(pmd_t *pmd, unsigned long addr,
85 unsigned long end, int node, bool early)
86{
87 unsigned long next;
88 pte_t *pte = kasan_pte_offset(pmd, addr, node, early);
46 89
47 pte = pte_offset_kimg(pmd, addr);
48 do { 90 do {
91 phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page)
92 : kasan_alloc_zeroed_page(node);
49 next = addr + PAGE_SIZE; 93 next = addr + PAGE_SIZE;
50 set_pte(pte, pfn_pte(sym_to_pfn(kasan_zero_page), 94 set_pte(pte, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
51 PAGE_KERNEL));
52 } while (pte++, addr = next, addr != end && pte_none(*pte)); 95 } while (pte++, addr = next, addr != end && pte_none(*pte));
53} 96}
54 97
55static void __init kasan_early_pmd_populate(pud_t *pud, 98static void __init kasan_pmd_populate(pud_t *pud, unsigned long addr,
56 unsigned long addr, 99 unsigned long end, int node, bool early)
57 unsigned long end)
58{ 100{
59 pmd_t *pmd;
60 unsigned long next; 101 unsigned long next;
102 pmd_t *pmd = kasan_pmd_offset(pud, addr, node, early);
61 103
62 if (pud_none(*pud))
63 __pud_populate(pud, __pa_symbol(kasan_zero_pmd), PMD_TYPE_TABLE);
64
65 pmd = pmd_offset_kimg(pud, addr);
66 do { 104 do {
67 next = pmd_addr_end(addr, end); 105 next = pmd_addr_end(addr, end);
68 kasan_early_pte_populate(pmd, addr, next); 106 kasan_pte_populate(pmd, addr, next, node, early);
69 } while (pmd++, addr = next, addr != end && pmd_none(*pmd)); 107 } while (pmd++, addr = next, addr != end && pmd_none(*pmd));
70} 108}
71 109
72static void __init kasan_early_pud_populate(pgd_t *pgd, 110static void __init kasan_pud_populate(pgd_t *pgd, unsigned long addr,
73 unsigned long addr, 111 unsigned long end, int node, bool early)
74 unsigned long end)
75{ 112{
76 pud_t *pud;
77 unsigned long next; 113 unsigned long next;
114 pud_t *pud = kasan_pud_offset(pgd, addr, node, early);
78 115
79 if (pgd_none(*pgd))
80 __pgd_populate(pgd, __pa_symbol(kasan_zero_pud), PUD_TYPE_TABLE);
81
82 pud = pud_offset_kimg(pgd, addr);
83 do { 116 do {
84 next = pud_addr_end(addr, end); 117 next = pud_addr_end(addr, end);
85 kasan_early_pmd_populate(pud, addr, next); 118 kasan_pmd_populate(pud, addr, next, node, early);
86 } while (pud++, addr = next, addr != end && pud_none(*pud)); 119 } while (pud++, addr = next, addr != end && pud_none(*pud));
87} 120}
88 121
89static void __init kasan_map_early_shadow(void) 122static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
123 int node, bool early)
90{ 124{
91 unsigned long addr = KASAN_SHADOW_START;
92 unsigned long end = KASAN_SHADOW_END;
93 unsigned long next; 125 unsigned long next;
94 pgd_t *pgd; 126 pgd_t *pgd;
95 127
96 pgd = pgd_offset_k(addr); 128 pgd = pgd_offset_k(addr);
97 do { 129 do {
98 next = pgd_addr_end(addr, end); 130 next = pgd_addr_end(addr, end);
99 kasan_early_pud_populate(pgd, addr, next); 131 kasan_pud_populate(pgd, addr, next, node, early);
100 } while (pgd++, addr = next, addr != end); 132 } while (pgd++, addr = next, addr != end);
101} 133}
102 134
135/* The early shadow maps everything to a single page of zeroes */
103asmlinkage void __init kasan_early_init(void) 136asmlinkage void __init kasan_early_init(void)
104{ 137{
105 BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61)); 138 BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61));
106 BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); 139 BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
107 BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); 140 BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
108 kasan_map_early_shadow(); 141 kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE,
142 true);
143}
144
145/* Set up full kasan mappings, ensuring that the mapped pages are zeroed */
146static void __init kasan_map_populate(unsigned long start, unsigned long end,
147 int node)
148{
149 kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false);
109} 150}
110 151
111/* 152/*
@@ -142,8 +183,8 @@ void __init kasan_init(void)
142 struct memblock_region *reg; 183 struct memblock_region *reg;
143 int i; 184 int i;
144 185
145 kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); 186 kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
146 kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); 187 kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end));
147 188
148 mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); 189 mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR);
149 mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); 190 mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END);
@@ -161,19 +202,8 @@ void __init kasan_init(void)
161 202
162 clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); 203 clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
163 204
164 vmemmap_populate(kimg_shadow_start, kimg_shadow_end, 205 kasan_map_populate(kimg_shadow_start, kimg_shadow_end,
165 pfn_to_nid(virt_to_pfn(lm_alias(_text)))); 206 pfn_to_nid(virt_to_pfn(lm_alias(_text))));
166
167 /*
168 * vmemmap_populate() has populated the shadow region that covers the
169 * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round
170 * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent
171 * kasan_populate_zero_shadow() from replacing the page table entries
172 * (PMD or PTE) at the edges of the shadow region for the kernel
173 * image.
174 */
175 kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE);
176 kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE);
177 207
178 kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, 208 kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
179 (void *)mod_shadow_start); 209 (void *)mod_shadow_start);
@@ -191,9 +221,9 @@ void __init kasan_init(void)
191 if (start >= end) 221 if (start >= end)
192 break; 222 break;
193 223
194 vmemmap_populate((unsigned long)kasan_mem_to_shadow(start), 224 kasan_map_populate((unsigned long)kasan_mem_to_shadow(start),
195 (unsigned long)kasan_mem_to_shadow(end), 225 (unsigned long)kasan_mem_to_shadow(end),
196 pfn_to_nid(virt_to_pfn(start))); 226 pfn_to_nid(virt_to_pfn(start)));
197 } 227 }
198 228
199 /* 229 /*
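
The rewritten arm64 code walks pgd/pud/pmd/pte and backs every shadow page either with the shared zero page (early boot, via __pa_symbol(kasan_zero_page)) or with a per-node zeroed page from memblock. For reference, the address-to-shadow translation that kasan_mem_to_shadow() performs is the generic KASAN scheme: one shadow byte covers eight bytes of address space, so shadow = (addr >> 3) + KASAN_SHADOW_OFFSET. A standalone sketch of that arithmetic; the offset below is a placeholder, not the arm64 constant:

/* Standalone sketch (not kernel code): the generic KASAN address->shadow
 * mapping implemented by kasan_mem_to_shadow().  Each 8-byte granule of
 * address space is described by one shadow byte.
 */
#include <stdint.h>
#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT 3
#define KASAN_SHADOW_OFFSET      0xdfff000000000000ull	/* placeholder value */

static uint64_t mem_to_shadow(uint64_t addr)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	uint64_t addr = 0xffff000012345678ull;	/* arbitrary kernel address */

	/* Addresses within the same 8-byte granule share one shadow byte. */
	printf("shadow(%#llx) = %#llx\n",
	       (unsigned long long)addr,
	       (unsigned long long)mem_to_shadow(addr));
	printf("shadow(%#llx) = %#llx\n",
	       (unsigned long long)(addr + 7),
	       (unsigned long long)mem_to_shadow(addr + 7));
	return 0;
}
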
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index 328f0a292316..cf464100e838 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -42,21 +42,9 @@
42#undef DEBUG 42#undef DEBUG
43 43
44/* 44/*
45 * BAD_PAGE is the page that is used for page faults when linux
46 * is out-of-memory. Older versions of linux just did a
47 * do_exit(), but using this instead means there is less risk
48 * for a process dying in kernel mode, possibly leaving a inode
49 * unused etc..
50 *
51 * BAD_PAGETABLE is the accompanying page-table: it is initialized
52 * to point to BAD_PAGE entries.
53 *
54 * ZERO_PAGE is a special page that is used for zero-initialized 45 * ZERO_PAGE is a special page that is used for zero-initialized
55 * data and COW. 46 * data and COW.
56 */ 47 */
57static unsigned long empty_bad_page_table;
58static unsigned long empty_bad_page;
59
60unsigned long empty_zero_page; 48unsigned long empty_zero_page;
61EXPORT_SYMBOL(empty_zero_page); 49EXPORT_SYMBOL(empty_zero_page);
62 50
@@ -72,8 +60,6 @@ void __init paging_init(void)
72 unsigned long zones_size[MAX_NR_ZONES] = {0, }; 60 unsigned long zones_size[MAX_NR_ZONES] = {0, };
73 61
74 /* allocate some pages for kernel housekeeping tasks */ 62 /* allocate some pages for kernel housekeeping tasks */
75 empty_bad_page_table = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
76 empty_bad_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
77 empty_zero_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); 63 empty_zero_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
78 64
79 memset((void *) empty_zero_page, 0, PAGE_SIZE); 65 memset((void *) empty_zero_page, 0, PAGE_SIZE);
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index eeead51bed2d..015287ac8ce8 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -40,20 +40,9 @@
40#include <asm/sections.h> 40#include <asm/sections.h>
41 41
42/* 42/*
43 * BAD_PAGE is the page that is used for page faults when linux
44 * is out-of-memory. Older versions of linux just did a
45 * do_exit(), but using this instead means there is less risk
46 * for a process dying in kernel mode, possibly leaving a inode
47 * unused etc..
48 *
49 * BAD_PAGETABLE is the accompanying page-table: it is initialized
50 * to point to BAD_PAGE entries.
51 *
52 * ZERO_PAGE is a special page that is used for zero-initialized 43 * ZERO_PAGE is a special page that is used for zero-initialized
53 * data and COW. 44 * data and COW.
54 */ 45 */
55static unsigned long empty_bad_page_table;
56static unsigned long empty_bad_page;
57unsigned long empty_zero_page; 46unsigned long empty_zero_page;
58 47
59/* 48/*
@@ -78,8 +67,6 @@ void __init paging_init(void)
78 * Initialize the bad page table and bad page to point 67 * Initialize the bad page table and bad page to point
79 * to a couple of allocated pages. 68 * to a couple of allocated pages.
80 */ 69 */
81 empty_bad_page_table = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
82 empty_bad_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
83 empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); 70 empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
84 memset((void *)empty_zero_page, 0, PAGE_SIZE); 71 memset((void *)empty_zero_page, 0, PAGE_SIZE);
85 72
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 0d9446c37ae8..498398d915c1 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -196,8 +196,8 @@ config TIMER_DIVIDE
196 default "128" 196 default "128"
197 197
198config CPU_BIG_ENDIAN 198config CPU_BIG_ENDIAN
199 bool "Generate big endian code" 199 bool
200 default n 200 default !CPU_LITTLE_ENDIAN
201 201
202config CPU_LITTLE_ENDIAN 202config CPU_LITTLE_ENDIAN
203 bool "Generate little endian code" 203 bool "Generate little endian code"
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 67fe6dc5211c..0036ea0c7173 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -31,12 +31,7 @@
31 * tables. Each page table is also a single 4K page, giving 512 (== 31 * tables. Each page table is also a single 4K page, giving 512 (==
32 * PTRS_PER_PTE) 8 byte ptes. Each pud entry is initialized to point to 32 * PTRS_PER_PTE) 8 byte ptes. Each pud entry is initialized to point to
33 * invalid_pmd_table, each pmd entry is initialized to point to 33 * invalid_pmd_table, each pmd entry is initialized to point to
34 * invalid_pte_table, each pte is initialized to 0. When memory is low, 34 * invalid_pte_table, each pte is initialized to 0.
35 * and a pmd table or a page table allocation fails, empty_bad_pmd_table
36 * and empty_bad_page_table is returned back to higher layer code, so
37 * that the failure is recognized later on. Linux does not seem to
38 * handle these failures very well though. The empty_bad_page_table has
39 * invalid pte entries in it, to force page faults.
40 * 35 *
41 * Kernel mappings: kernel mappings are held in the swapper_pg_table. 36 * Kernel mappings: kernel mappings are held in the swapper_pg_table.
42 * The layout is identical to userspace except it's indexed with the 37 * The layout is identical to userspace except it's indexed with the
@@ -175,7 +170,6 @@
175 printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) 170 printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
176 171
177extern pte_t invalid_pte_table[PTRS_PER_PTE]; 172extern pte_t invalid_pte_table[PTRS_PER_PTE];
178extern pte_t empty_bad_page_table[PTRS_PER_PTE];
179 173
180#ifndef __PAGETABLE_PUD_FOLDED 174#ifndef __PAGETABLE_PUD_FOLDED
181/* 175/*
diff --git a/arch/mn10300/kernel/head.S b/arch/mn10300/kernel/head.S
index 73e00fc78072..0b15f759e0d2 100644
--- a/arch/mn10300/kernel/head.S
+++ b/arch/mn10300/kernel/head.S
@@ -434,14 +434,6 @@ ENTRY(empty_zero_page)
434 .space PAGE_SIZE 434 .space PAGE_SIZE
435 435
436 .balign PAGE_SIZE 436 .balign PAGE_SIZE
437ENTRY(empty_bad_page)
438 .space PAGE_SIZE
439
440 .balign PAGE_SIZE
441ENTRY(empty_bad_pte_table)
442 .space PAGE_SIZE
443
444 .balign PAGE_SIZE
445ENTRY(large_page_table) 437ENTRY(large_page_table)
446 .space PAGE_SIZE 438 .space PAGE_SIZE
447 439
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index f41bd3cb76d9..e212a1f0b6d2 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -23,7 +23,6 @@
23 */ 23 */
24 24
25#include <linux/dma-debug.h> 25#include <linux/dma-debug.h>
26#include <linux/kmemcheck.h>
27#include <linux/dma-mapping.h> 26#include <linux/dma-mapping.h>
28 27
29extern const struct dma_map_ops or1k_dma_map_ops; 28extern const struct dma_map_ops or1k_dma_map_ops;
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index a14203c005f1..e11f03007b57 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp)
18} 18}
19#endif /* MODULE */ 19#endif /* MODULE */
20 20
21#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 21#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
22 22
23#ifdef CONFIG_PPC_BOOK3S 23#ifdef CONFIG_PPC_BOOK3S
24#include <asm/book3s/pgalloc.h> 24#include <asm/book3s/pgalloc.h>
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1571a498a33f..a9b9083c5e49 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
433 pud = pud_offset(pgd, start); 433 pud = pud_offset(pgd, start);
434 pgd_clear(pgd); 434 pgd_clear(pgd);
435 pud_free_tlb(tlb, pud, start); 435 pud_free_tlb(tlb, pud, start);
436 mm_dec_nr_puds(tlb->mm);
436} 437}
437 438
438/* 439/*
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 05e15386d4cb..a7e998158f37 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -200,7 +200,7 @@ static void destroy_pagetable_page(struct mm_struct *mm)
200 /* We allow PTE_FRAG_NR fragments from a PTE page */ 200 /* We allow PTE_FRAG_NR fragments from a PTE page */
201 if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { 201 if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) {
202 pgtable_page_dtor(page); 202 pgtable_page_dtor(page);
203 free_hot_cold_page(page, 0); 203 free_unref_page(page);
204 } 204 }
205} 205}
206 206
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index ac0717a90ca6..1ec3aee43624 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -404,7 +404,7 @@ void pte_fragment_free(unsigned long *table, int kernel)
404 if (put_page_testzero(page)) { 404 if (put_page_testzero(page)) {
405 if (!kernel) 405 if (!kernel)
406 pgtable_page_dtor(page); 406 pgtable_page_dtor(page);
407 free_hot_cold_page(page, 0); 407 free_unref_page(page);
408 } 408 }
409} 409}
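
The two powerpc hunks above also pick up the series-wide rename of free_hot_cold_page(page, 0) to free_unref_page(page). The surrounding logic is the usual "free on last reference" pattern: put_page_testzero() drops a reference and reports whether it was the last one. A standalone model of that pattern, with simplified stand-ins for struct page and the allocator call:

/* Standalone sketch (not kernel code): the free-on-last-reference pattern
 * used by pte_fragment_free() above.  Types and helpers are stand-ins.
 * Build: cc -std=c11 -Wall lastref.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct page_sketch {
	atomic_int refcount;
	void *data;
};

/* Drop a reference; return true only when this was the final put. */
static bool put_page_testzero(struct page_sketch *page)
{
	return atomic_fetch_sub(&page->refcount, 1) == 1;
}

/* Stand-in for free_unref_page(): hand the order-0 page back. */
static void free_unref_page(struct page_sketch *page)
{
	free(page->data);
	free(page);
}

int main(void)
{
	struct page_sketch *page = malloc(sizeof(*page));

	if (!page)
		return 1;
	atomic_init(&page->refcount, 2);	/* two users of the fragment */
	page->data = malloc(4096);

	if (put_page_testzero(page))		/* first put: still in use */
		free_unref_page(page);
	if (put_page_testzero(page))		/* last put: actually freed */
		free_unref_page(page);

	puts("fragment released on last reference");
	return 0;
}
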
410 410
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 43607bb12cc2..cf4c1cb17dcd 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk,
44 mm->context.asce_limit = STACK_TOP_MAX; 44 mm->context.asce_limit = STACK_TOP_MAX;
45 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | 45 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
46 _ASCE_USER_BITS | _ASCE_TYPE_REGION3; 46 _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
47 /* pgd_alloc() did not account this pud */
48 mm_inc_nr_puds(mm);
47 break; 49 break;
48 case -PAGE_SIZE: 50 case -PAGE_SIZE:
49 /* forked 5-level task, set new asce with new_mm->pgd */ 51 /* forked 5-level task, set new asce with new_mm->pgd */
@@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk,
59 /* forked 2-level compat task, set new asce with new mm->pgd */ 61 /* forked 2-level compat task, set new asce with new mm->pgd */
60 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | 62 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
61 _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; 63 _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
62 /* pgd_alloc() did not increase mm->nr_pmds */ 64 /* pgd_alloc() did not account this pmd */
63 mm_inc_nr_pmds(mm); 65 mm_inc_nr_pmds(mm);
64 } 66 }
65 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); 67 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index e1d751ae2498..1a2526676a87 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void)
1172 1172
1173 dwarf_frame_cachep = kmem_cache_create("dwarf_frames", 1173 dwarf_frame_cachep = kmem_cache_create("dwarf_frames",
1174 sizeof(struct dwarf_frame), 0, 1174 sizeof(struct dwarf_frame), 0,
1175 SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); 1175 SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
1176 1176
1177 dwarf_reg_cachep = kmem_cache_create("dwarf_regs", 1177 dwarf_reg_cachep = kmem_cache_create("dwarf_regs",
1178 sizeof(struct dwarf_reg), 0, 1178 sizeof(struct dwarf_reg), 0,
1179 SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); 1179 SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
1180 1180
1181 dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, 1181 dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ,
1182 dwarf_frame_cachep); 1182 dwarf_frame_cachep);
diff --git a/arch/sh/kernel/head_64.S b/arch/sh/kernel/head_64.S
index defd851abefa..cca491397a28 100644
--- a/arch/sh/kernel/head_64.S
+++ b/arch/sh/kernel/head_64.S
@@ -101,14 +101,6 @@ empty_zero_page:
101mmu_pdtp_cache: 101mmu_pdtp_cache:
102 .space PAGE_SIZE, 0 102 .space PAGE_SIZE, 0
103 103
104 .global empty_bad_page
105empty_bad_page:
106 .space PAGE_SIZE, 0
107
108 .global empty_bad_pte_table
109empty_bad_pte_table:
110 .space PAGE_SIZE, 0
111
112 .global fpu_in_use 104 .global fpu_in_use
113fpu_in_use: .quad 0 105fpu_in_use: .quad 0
114 106
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index b2d9963d5978..68b1a67533ce 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -59,7 +59,7 @@ void arch_task_cache_init(void)
59 59
60 task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, 60 task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size,
61 __alignof__(union thread_xstate), 61 __alignof__(union thread_xstate),
62 SLAB_PANIC | SLAB_NOTRACK, NULL); 62 SLAB_PANIC, NULL);
63} 63}
64 64
65#ifdef CONFIG_SH_FPU_EMU 65#ifdef CONFIG_SH_FPU_EMU
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index fd9d9bac7cfa..5a9e96be1665 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -231,6 +231,36 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
231extern struct page *mem_map_zero; 231extern struct page *mem_map_zero;
232#define ZERO_PAGE(vaddr) (mem_map_zero) 232#define ZERO_PAGE(vaddr) (mem_map_zero)
233 233
234/* This macro must be updated when the size of struct page grows above 80
235 * or reduces below 64.
 236 * The idea is that the compiler optimizes out the switch() statement
 237 * and leaves only clrx instructions
238 */
239#define mm_zero_struct_page(pp) do { \
240 unsigned long *_pp = (void *)(pp); \
241 \
242 /* Check that struct page is either 64, 72, or 80 bytes */ \
243 BUILD_BUG_ON(sizeof(struct page) & 7); \
244 BUILD_BUG_ON(sizeof(struct page) < 64); \
245 BUILD_BUG_ON(sizeof(struct page) > 80); \
246 \
247 switch (sizeof(struct page)) { \
248 case 80: \
249 _pp[9] = 0; /* fallthrough */ \
250 case 72: \
251 _pp[8] = 0; /* fallthrough */ \
252 default: \
253 _pp[7] = 0; \
254 _pp[6] = 0; \
255 _pp[5] = 0; \
256 _pp[4] = 0; \
257 _pp[3] = 0; \
258 _pp[2] = 0; \
259 _pp[1] = 0; \
260 _pp[0] = 0; \
261 } \
262} while (0)
263
234/* PFNs are real physical page numbers. However, mem_map only begins to record 264/* PFNs are real physical page numbers. However, mem_map only begins to record
235 * per-page information starting at pfn_base. This is to handle systems where 265 * per-page information starting at pfn_base. This is to handle systems where
236 * the first physical page in the machine is at some huge physical address, 266 * the first physical page in the machine is at some huge physical address,
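
The new sparc64 mm_zero_struct_page() zeroes struct page with a switch over a compile-time-constant size whose cases fall through, so the compiler can reduce it to a fixed run of 8-byte stores (clrx on sparc64) rather than a memset() call. The same trick in a standalone program, using an illustrative 80-byte stand-in for struct page and assuming an LP64 target:

/* Standalone sketch (not kernel code): the fallthrough-switch zeroing trick
 * from the sparc64 mm_zero_struct_page() above.  "struct fake_page" is an
 * illustrative stand-in; 80 bytes assumes 8-byte unsigned long (LP64).
 * Build: cc -std=c11 -Wall zero_page.c
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

struct fake_page {
	unsigned long words[10];	/* 80 bytes, the upper bound in the macro */
};

static void zero_struct(struct fake_page *pp)
{
	unsigned long *_pp = (unsigned long *)pp;

	/* Size must be a multiple of 8 between 64 and 80 bytes. */
	_Static_assert(sizeof(struct fake_page) % 8 == 0, "8-byte multiple");
	_Static_assert(sizeof(struct fake_page) >= 64, "at least 64 bytes");
	_Static_assert(sizeof(struct fake_page) <= 80, "at most 80 bytes");

	/* The constant size lets the compiler drop the dead cases entirely. */
	switch (sizeof(struct fake_page)) {
	case 80:
		_pp[9] = 0;	/* fallthrough */
	case 72:
		_pp[8] = 0;	/* fallthrough */
	default:
		_pp[7] = 0;
		_pp[6] = 0;
		_pp[5] = 0;
		_pp[4] = 0;
		_pp[3] = 0;
		_pp[2] = 0;
		_pp[1] = 0;
		_pp[0] = 0;
	}
}

int main(void)
{
	struct fake_page p;
	static const struct fake_page zero;

	memset(&p, 0xff, sizeof(p));
	zero_struct(&p);
	assert(memcmp(&p, &zero, sizeof(p)) == 0);
	puts("zeroed");
	return 0;
}
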
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5078b7f68890..0112d6942288 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
397 397
398 pmd_clear(pmd); 398 pmd_clear(pmd);
399 pte_free_tlb(tlb, token, addr); 399 pte_free_tlb(tlb, token, addr);
400 atomic_long_dec(&tlb->mm->nr_ptes); 400 mm_dec_nr_ptes(tlb->mm);
401} 401}
402 402
403static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 403static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
472 pud = pud_offset(pgd, start); 472 pud = pud_offset(pgd, start);
473 pgd_clear(pgd); 473 pgd_clear(pgd);
474 pud_free_tlb(tlb, pud, start); 474 pud_free_tlb(tlb, pud, start);
475 mm_dec_nr_puds(tlb->mm);
475} 476}
476 477
477void hugetlb_free_pgd_range(struct mmu_gather *tlb, 478void hugetlb_free_pgd_range(struct mmu_gather *tlb,
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 61bdc1270d19..55ba62957e64 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2540,10 +2540,17 @@ void __init mem_init(void)
2540{ 2540{
2541 high_memory = __va(last_valid_pfn << PAGE_SHIFT); 2541 high_memory = __va(last_valid_pfn << PAGE_SHIFT);
2542 2542
2543 register_page_bootmem_info();
2544 free_all_bootmem(); 2543 free_all_bootmem();
2545 2544
2546 /* 2545 /*
2546 * Must be done after boot memory is put on freelist, because here we
2547 * might set fields in deferred struct pages that have not yet been
2548 * initialized, and free_all_bootmem() initializes all the reserved
2549 * deferred pages for us.
2550 */
2551 register_page_bootmem_info();
2552
2553 /*
2547 * Set up the zero page, mark it reserved, so that page count 2554 * Set up the zero page, mark it reserved, so that page count
2548 * is not manipulated when freeing the page from user ptes. 2555 * is not manipulated when freeing the page from user ptes.
2549 */ 2556 */
@@ -2637,30 +2644,19 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
2637 vstart = vstart & PMD_MASK; 2644 vstart = vstart & PMD_MASK;
2638 vend = ALIGN(vend, PMD_SIZE); 2645 vend = ALIGN(vend, PMD_SIZE);
2639 for (; vstart < vend; vstart += PMD_SIZE) { 2646 for (; vstart < vend; vstart += PMD_SIZE) {
2640 pgd_t *pgd = pgd_offset_k(vstart); 2647 pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
2641 unsigned long pte; 2648 unsigned long pte;
2642 pud_t *pud; 2649 pud_t *pud;
2643 pmd_t *pmd; 2650 pmd_t *pmd;
2644 2651
2645 if (pgd_none(*pgd)) { 2652 if (!pgd)
2646 pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node); 2653 return -ENOMEM;
2647
2648 if (!new)
2649 return -ENOMEM;
2650 pgd_populate(&init_mm, pgd, new);
2651 }
2652
2653 pud = pud_offset(pgd, vstart);
2654 if (pud_none(*pud)) {
2655 pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
2656 2654
2657 if (!new) 2655 pud = vmemmap_pud_populate(pgd, vstart, node);
2658 return -ENOMEM; 2656 if (!pud)
2659 pud_populate(&init_mm, pud, new); 2657 return -ENOMEM;
2660 }
2661 2658
2662 pmd = pmd_offset(pud, vstart); 2659 pmd = pmd_offset(pud, vstart);
2663
2664 pte = pmd_val(*pmd); 2660 pte = pmd_val(*pmd);
2665 if (!(pte & _PAGE_VALID)) { 2661 if (!(pte & _PAGE_VALID)) {
2666 void *block = vmemmap_alloc_block(PMD_SIZE, node); 2662 void *block = vmemmap_alloc_block(PMD_SIZE, node);
@@ -2927,7 +2923,7 @@ void __flush_tlb_all(void)
2927pte_t *pte_alloc_one_kernel(struct mm_struct *mm, 2923pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
2928 unsigned long address) 2924 unsigned long address)
2929{ 2925{
2930 struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 2926 struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2931 pte_t *pte = NULL; 2927 pte_t *pte = NULL;
2932 2928
2933 if (page) 2929 if (page)
@@ -2939,11 +2935,11 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
2939pgtable_t pte_alloc_one(struct mm_struct *mm, 2935pgtable_t pte_alloc_one(struct mm_struct *mm,
2940 unsigned long address) 2936 unsigned long address)
2941{ 2937{
2942 struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 2938 struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2943 if (!page) 2939 if (!page)
2944 return NULL; 2940 return NULL;
2945 if (!pgtable_page_ctor(page)) { 2941 if (!pgtable_page_ctor(page)) {
2946 free_hot_cold_page(page, 0); 2942 free_unref_page(page);
2947 return NULL; 2943 return NULL;
2948 } 2944 }
2949 return (pte_t *) page_address(page); 2945 return (pte_t *) page_address(page);
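
Besides moving register_page_bootmem_info() after free_all_bootmem() (so deferred struct pages are initialized before their fields are touched, as the added comment explains), the sparc64 hunk rewrites vmemmap_populate() in terms of the generic vmemmap_pgd_populate()/vmemmap_pud_populate() helpers: allocate a table when the entry for the next level is empty, or report -ENOMEM. A toy standalone model of that populate-or-fail pattern; the two-level "table" here is purely illustrative:

/* Standalone sketch (not kernel code): the level-populate pattern that the
 * reworked vmemmap_populate() delegates to helpers -- allocate the missing
 * next-level table, propagate allocation failure as -ENOMEM.
 * Build: cc -std=c11 -Wall populate.c
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 4

struct level {			/* a toy page-table level */
	struct level *entry[ENTRIES];
};

/* Return the next-level table for @idx, allocating it if absent. */
static struct level *level_populate(struct level *tbl, unsigned int idx)
{
	if (!tbl->entry[idx]) {
		struct level *new = calloc(1, sizeof(*new));

		if (!new)
			return NULL;	/* caller turns this into -ENOMEM */
		tbl->entry[idx] = new;
	}
	return tbl->entry[idx];
}

static int populate_range(struct level *root, unsigned int start, unsigned int end)
{
	for (unsigned int i = start; i < end; i++) {
		struct level *next = level_populate(root, i);

		if (!next)
			return -ENOMEM;
		/* ... descend and map the actual backing block here ... */
	}
	return 0;
}

int main(void)
{
	struct level root = { { 0 } };
	int ret = populate_range(&root, 0, ENTRIES);

	printf("populate_range() returned %d\n", ret);
	for (unsigned int i = 0; i < ENTRIES; i++)
		free(root.entry[i]);
	return ret ? 1 : 0;
}
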
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index b51cc28acd0a..4432f31e8479 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -409,7 +409,7 @@ void __homecache_free_pages(struct page *page, unsigned int order)
409 if (put_page_testzero(page)) { 409 if (put_page_testzero(page)) {
410 homecache_change_page_home(page, order, PAGE_HOME_HASH); 410 homecache_change_page_home(page, order, PAGE_HOME_HASH);
411 if (order == 0) { 411 if (order == 0) {
412 free_hot_cold_page(page, false); 412 free_unref_page(page);
413 } else { 413 } else {
414 init_page_count(page); 414 init_page_count(page);
415 __free_pages(page, order); 415 __free_pages(page, order);
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index e7437ec62710..3c0e470ea646 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -22,8 +22,6 @@
22/* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ 22/* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */
23unsigned long *empty_zero_page = NULL; 23unsigned long *empty_zero_page = NULL;
24EXPORT_SYMBOL(empty_zero_page); 24EXPORT_SYMBOL(empty_zero_page);
25/* allocated in paging_init and unchanged thereafter */
26static unsigned long *empty_bad_page = NULL;
27 25
28/* 26/*
29 * Initialized during boot, and readonly for initializing page tables 27 * Initialized during boot, and readonly for initializing page tables
@@ -146,7 +144,6 @@ void __init paging_init(void)
146 int i; 144 int i;
147 145
148 empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); 146 empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
149 empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
150 for (i = 0; i < ARRAY_SIZE(zones_size); i++) 147 for (i = 0; i < ARRAY_SIZE(zones_size); i++)
151 zones_size[i] = 0; 148 zones_size[i] = 0;
152 149
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
index 26775793c204..f0fdb268f8f2 100644
--- a/arch/unicore32/include/asm/pgalloc.h
+++ b/arch/unicore32/include/asm/pgalloc.h
@@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd);
28#define pgd_alloc(mm) get_pgd_slow(mm) 28#define pgd_alloc(mm) get_pgd_slow(mm)
29#define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) 29#define pgd_free(mm, pgd) free_pgd_slow(mm, pgd)
30 30
31#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 31#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
32 32
33/* 33/*
34 * Allocate one PTE table. 34 * Allocate one PTE table.
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
index c572a28c76c9..a830a300aaa1 100644
--- a/arch/unicore32/mm/pgd.c
+++ b/arch/unicore32/mm/pgd.c
@@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
97 pte = pmd_pgtable(*pmd); 97 pte = pmd_pgtable(*pmd);
98 pmd_clear(pmd); 98 pmd_clear(pmd);
99 pte_free(mm, pte); 99 pte_free(mm, pte);
100 atomic_long_dec(&mm->nr_ptes); 100 mm_dec_nr_ptes(mm);
101 pmd_free(mm, pmd); 101 pmd_free(mm, pmd);
102 mm_dec_nr_pmds(mm); 102 mm_dec_nr_pmds(mm);
103free: 103free:
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f08977d82ca0..df3276d6bfe3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -110,9 +110,8 @@ config X86
110 select HAVE_ARCH_AUDITSYSCALL 110 select HAVE_ARCH_AUDITSYSCALL
111 select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE 111 select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
112 select HAVE_ARCH_JUMP_LABEL 112 select HAVE_ARCH_JUMP_LABEL
113 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP 113 select HAVE_ARCH_KASAN if X86_64
114 select HAVE_ARCH_KGDB 114 select HAVE_ARCH_KGDB
115 select HAVE_ARCH_KMEMCHECK
116 select HAVE_ARCH_MMAP_RND_BITS if MMU 115 select HAVE_ARCH_MMAP_RND_BITS if MMU
117 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT 116 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
118 select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT 117 select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
@@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT
1430 1429
1431config X86_DIRECT_GBPAGES 1430config X86_DIRECT_GBPAGES
1432 def_bool y 1431 def_bool y
1433 depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK 1432 depends on X86_64 && !DEBUG_PAGEALLOC
1434 ---help--- 1433 ---help---
1435 Certain kernel features effectively disable kernel 1434 Certain kernel features effectively disable kernel
1436 linear 1 GB mappings (even if the CPU otherwise 1435 linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a20eacd9c7e9..3e73bc255e4e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32
158endif 158endif
159export CONFIG_X86_X32_ABI 159export CONFIG_X86_X32_ABI
160 160
161# Don't unroll struct assignments with kmemcheck enabled
162ifeq ($(CONFIG_KMEMCHECK),y)
163 KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
164endif
165
166# 161#
167# If the function graph tracer is used with mcount instead of fentry, 162# If the function graph tracer is used with mcount instead of fentry,
168# '-maccumulate-outgoing-args' is needed to prevent a GCC bug 163# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 43cbe843de8d..0350d99bb8fd 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -7,7 +7,6 @@
7 * Documentation/DMA-API.txt for documentation. 7 * Documentation/DMA-API.txt for documentation.
8 */ 8 */
9 9
10#include <linux/kmemcheck.h>
11#include <linux/scatterlist.h> 10#include <linux/scatterlist.h>
12#include <linux/dma-debug.h> 11#include <linux/dma-debug.h>
13#include <asm/io.h> 12#include <asm/io.h>
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
index 945a0337fbcf..ea32a7d3cf1b 100644
--- a/arch/x86/include/asm/kmemcheck.h
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -1,43 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef ASM_X86_KMEMCHECK_H
3#define ASM_X86_KMEMCHECK_H
4
5#include <linux/types.h>
6#include <asm/ptrace.h>
7
8#ifdef CONFIG_KMEMCHECK
9bool kmemcheck_active(struct pt_regs *regs);
10
11void kmemcheck_show(struct pt_regs *regs);
12void kmemcheck_hide(struct pt_regs *regs);
13
14bool kmemcheck_fault(struct pt_regs *regs,
15 unsigned long address, unsigned long error_code);
16bool kmemcheck_trap(struct pt_regs *regs);
17#else
18static inline bool kmemcheck_active(struct pt_regs *regs)
19{
20 return false;
21}
22
23static inline void kmemcheck_show(struct pt_regs *regs)
24{
25}
26
27static inline void kmemcheck_hide(struct pt_regs *regs)
28{
29}
30
31static inline bool kmemcheck_fault(struct pt_regs *regs,
32 unsigned long address, unsigned long error_code)
33{
34 return false;
35}
36
37static inline bool kmemcheck_trap(struct pt_regs *regs)
38{
39 return false;
40}
41#endif /* CONFIG_KMEMCHECK */
42
43#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f735c3016325..09f9e1e00e3b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -667,11 +667,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
667 return false; 667 return false;
668} 668}
669 669
670static inline int pte_hidden(pte_t pte)
671{
672 return pte_flags(pte) & _PAGE_HIDDEN;
673}
674
675static inline int pmd_present(pmd_t pmd) 670static inline int pmd_present(pmd_t pmd)
676{ 671{
677 /* 672 /*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 9e9b05fc4860..3696398a9475 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -32,7 +32,6 @@
32 32
33#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 33#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
34#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 34#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
35#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
36#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 35#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
37#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 36#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
38 37
@@ -79,18 +78,6 @@
79#define _PAGE_KNL_ERRATUM_MASK 0 78#define _PAGE_KNL_ERRATUM_MASK 0
80#endif 79#endif
81 80
82#ifdef CONFIG_KMEMCHECK
83#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
84#else
85#define _PAGE_HIDDEN (_AT(pteval_t, 0))
86#endif
87
88/*
89 * The same hidden bit is used by kmemcheck, but since kmemcheck
90 * works on kernel pages while soft-dirty engine on user space,
91 * they do not conflict with each other.
92 */
93
94#ifdef CONFIG_MEM_SOFT_DIRTY 81#ifdef CONFIG_MEM_SOFT_DIRTY
95#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) 82#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
96#else 83#else
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 076502241eae..55d392c6bd29 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
179 * No 3D Now! 179 * No 3D Now!
180 */ 180 */
181 181
182#ifndef CONFIG_KMEMCHECK
183
184#if (__GNUC__ >= 4) 182#if (__GNUC__ >= 4)
185#define memcpy(t, f, n) __builtin_memcpy(t, f, n) 183#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
186#else 184#else
@@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
189 ? __constant_memcpy((t), (f), (n)) \ 187 ? __constant_memcpy((t), (f), (n)) \
190 : __memcpy((t), (f), (n))) 188 : __memcpy((t), (f), (n)))
191#endif 189#endif
192#else
193/*
194 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
195 * because it means that we know both memory operands in advance.
196 */
197#define memcpy(t, f, n) __memcpy((t), (f), (n))
198#endif
199 190
200#endif 191#endif
201#endif /* !CONFIG_FORTIFY_SOURCE */ 192#endif /* !CONFIG_FORTIFY_SOURCE */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 0b1b4445f4c5..533f74c300c2 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len);
33extern void *__memcpy(void *to, const void *from, size_t len); 33extern void *__memcpy(void *to, const void *from, size_t len);
34 34
35#ifndef CONFIG_FORTIFY_SOURCE 35#ifndef CONFIG_FORTIFY_SOURCE
36#ifndef CONFIG_KMEMCHECK
37#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 36#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
38#define memcpy(dst, src, len) \ 37#define memcpy(dst, src, len) \
39({ \ 38({ \
@@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len);
46 __ret; \ 45 __ret; \
47}) 46})
48#endif 47#endif
49#else
50/*
51 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
52 * because it means that we know both memory operands in advance.
53 */
54#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
55#endif
56#endif /* !CONFIG_FORTIFY_SOURCE */ 48#endif /* !CONFIG_FORTIFY_SOURCE */
57 49
58#define __HAVE_ARCH_MEMSET 50#define __HAVE_ARCH_MEMSET
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 1f5c5161ead6..45c8605467f1 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,7 +1,4 @@
1#ifdef CONFIG_KMEMCHECK 1#ifndef _ASM_X86_XOR_H
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h>
4#elif !defined(_ASM_X86_XOR_H)
5#define _ASM_X86_XOR_H 2#define _ASM_X86_XOR_H
6 3
7/* 4/*
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b720dacac051..b1af22073e28 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
187 if (c->x86 == 6 && c->x86_model < 15) 187 if (c->x86 == 6 && c->x86_model < 15)
188 clear_cpu_cap(c, X86_FEATURE_PAT); 188 clear_cpu_cap(c, X86_FEATURE_PAT);
189 189
190#ifdef CONFIG_KMEMCHECK
191 /*
192 * P4s have a "fast strings" feature which causes single-
193 * stepping REP instructions to only generate a #DB on
194 * cache-line boundaries.
195 *
196 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
197 * (model 2) with the same problem.
198 */
199 if (c->x86 == 15)
200 if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
201 MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
202 pr_info("kmemcheck: Disabling fast string operations\n");
203#endif
204
205 /* 190 /*
206 * If fast string is not enabled in IA32_MISC_ENABLE for any reason, 191 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
207 * clear the fast string and enhanced fast string CPU capabilities. 192 * clear the fast string and enhanced fast string CPU capabilities.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 7d7715dde901..e5ec3cafa72e 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,7 +57,7 @@
57# error "Need more virtual address space for the ESPFIX hack" 57# error "Need more virtual address space for the ESPFIX hack"
58#endif 58#endif
59 59
60#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 60#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
61 61
62/* This contains the *bottom* address of the espfix stack */ 62/* This contains the *bottom* address of the espfix stack */
63DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); 63DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7b0f74a2150..989514c94a55 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -42,7 +42,6 @@
42#include <linux/edac.h> 42#include <linux/edac.h>
43#endif 43#endif
44 44
45#include <asm/kmemcheck.h>
46#include <asm/stacktrace.h> 45#include <asm/stacktrace.h>
47#include <asm/processor.h> 46#include <asm/processor.h>
48#include <asm/debugreg.h> 47#include <asm/debugreg.h>
@@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
749 if (!dr6 && user_mode(regs)) 748 if (!dr6 && user_mode(regs))
750 user_icebp = 1; 749 user_icebp = 1;
751 750
752 /* Catch kmemcheck conditions! */
753 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
754 goto exit;
755
756 /* Store the virtualized DR6 value */ 751 /* Store the virtualized DR6 value */
757 tsk->thread.debugreg6 = dr6; 752 tsk->thread.debugreg6 = dr6;
758 753
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7ba7f3d7f477..8e13b8cc6bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
29 29
30obj-$(CONFIG_HIGHMEM) += highmem_32.o 30obj-$(CONFIG_HIGHMEM) += highmem_32.o
31 31
32obj-$(CONFIG_KMEMCHECK) += kmemcheck/
33
34KASAN_SANITIZE_kasan_init_$(BITS).o := n 32KASAN_SANITIZE_kasan_init_$(BITS).o := n
35obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o 33obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
36 34
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3109ba6c6ede..78ca9a8ee454 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,7 +20,6 @@
20#include <asm/cpufeature.h> /* boot_cpu_has, ... */ 20#include <asm/cpufeature.h> /* boot_cpu_has, ... */
21#include <asm/traps.h> /* dotraplinkage, ... */ 21#include <asm/traps.h> /* dotraplinkage, ... */
22#include <asm/pgalloc.h> /* pgd_*(), ... */ 22#include <asm/pgalloc.h> /* pgd_*(), ... */
23#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
24#include <asm/fixmap.h> /* VSYSCALL_ADDR */ 23#include <asm/fixmap.h> /* VSYSCALL_ADDR */
25#include <asm/vsyscall.h> /* emulate_vsyscall */ 24#include <asm/vsyscall.h> /* emulate_vsyscall */
26#include <asm/vm86.h> /* struct vm86 */ 25#include <asm/vm86.h> /* struct vm86 */
@@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1256 * Detect and handle instructions that would cause a page fault for 1255 * Detect and handle instructions that would cause a page fault for
1257 * both a tracked kernel page and a userspace page. 1256 * both a tracked kernel page and a userspace page.
1258 */ 1257 */
1259 if (kmemcheck_active(regs))
1260 kmemcheck_hide(regs);
1261 prefetchw(&mm->mmap_sem); 1258 prefetchw(&mm->mmap_sem);
1262 1259
1263 if (unlikely(kmmio_fault(regs, address))) 1260 if (unlikely(kmmio_fault(regs, address)))
@@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1280 if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { 1277 if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1281 if (vmalloc_fault(address) >= 0) 1278 if (vmalloc_fault(address) >= 0)
1282 return; 1279 return;
1283
1284 if (kmemcheck_fault(regs, address, error_code))
1285 return;
1286 } 1280 }
1287 1281
1288 /* Can handle a stale RO->RW TLB: */ 1282 /* Can handle a stale RO->RW TLB: */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..6fdf91ef130a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num)
92 unsigned int order; 92 unsigned int order;
93 93
94 order = get_order((unsigned long)num << PAGE_SHIFT); 94 order = get_order((unsigned long)num << PAGE_SHIFT);
95 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | 95 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
96 __GFP_ZERO, order);
97 } 96 }
98 97
99 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { 98 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
@@ -164,12 +163,11 @@ static int page_size_mask;
164static void __init probe_page_size_mask(void) 163static void __init probe_page_size_mask(void)
165{ 164{
166 /* 165 /*
167 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will 166 * For pagealloc debugging, identity mapping will use small pages.
168 * use small pages.
169 * This will simplify cpa(), which otherwise needs to support splitting 167 * This will simplify cpa(), which otherwise needs to support splitting
170 * large pages into small in interrupt context, etc. 168 * large pages into small in interrupt context, etc.
171 */ 169 */
172 if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK)) 170 if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
173 page_size_mask |= 1 << PG_LEVEL_2M; 171 page_size_mask |= 1 << PG_LEVEL_2M;
174 else 172 else
175 direct_gbpages = 0; 173 direct_gbpages = 0;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index adcea90a2046..4a837289f2ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
184 void *ptr; 184 void *ptr;
185 185
186 if (after_bootmem) 186 if (after_bootmem)
187 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); 187 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
188 else 188 else
189 ptr = alloc_bootmem_pages(PAGE_SIZE); 189 ptr = alloc_bootmem_pages(PAGE_SIZE);
190 190
@@ -1173,12 +1173,18 @@ void __init mem_init(void)
1173 1173
1174 /* clear_bss() already clear the empty_zero_page */ 1174 /* clear_bss() already clear the empty_zero_page */
1175 1175
1176 register_page_bootmem_info();
1177
1178 /* this will put all memory onto the freelists */ 1176 /* this will put all memory onto the freelists */
1179 free_all_bootmem(); 1177 free_all_bootmem();
1180 after_bootmem = 1; 1178 after_bootmem = 1;
1181 1179
1180 /*
1181 * Must be done after boot memory is put on freelist, because here we
1182 * might set fields in deferred struct pages that have not yet been
1183 * initialized, and free_all_bootmem() initializes all the reserved
1184 * deferred pages for us.
1185 */
1186 register_page_bootmem_info();
1187
1182 /* Register memory areas for /proc/kcore */ 1188 /* Register memory areas for /proc/kcore */
1183 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, 1189 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
1184 PAGE_SIZE, KCORE_OTHER); 1190 PAGE_SIZE, KCORE_OTHER);
@@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1399 vmemmap_verify((pte_t *)pmd, node, addr, next); 1405 vmemmap_verify((pte_t *)pmd, node, addr, next);
1400 continue; 1406 continue;
1401 } 1407 }
1402 pr_warn_once("vmemmap: falling back to regular page backing\n");
1403 if (vmemmap_populate_basepages(addr, next, node)) 1408 if (vmemmap_populate_basepages(addr, next, node))
1404 return -ENOMEM; 1409 return -ENOMEM;
1405 } 1410 }
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 2b60dc6e64b1..99dfed6dfef8 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -4,12 +4,14 @@
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/kasan.h> 5#include <linux/kasan.h>
6#include <linux/kdebug.h> 6#include <linux/kdebug.h>
7#include <linux/memblock.h>
7#include <linux/mm.h> 8#include <linux/mm.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9#include <linux/sched/task.h> 10#include <linux/sched/task.h>
10#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
11 12
12#include <asm/e820/types.h> 13#include <asm/e820/types.h>
14#include <asm/pgalloc.h>
13#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
14#include <asm/sections.h> 16#include <asm/sections.h>
15#include <asm/pgtable.h> 17#include <asm/pgtable.h>
@@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES];
18 20
19static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); 21static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
20 22
21static int __init map_range(struct range *range) 23static __init void *early_alloc(size_t size, int nid)
24{
25 return memblock_virt_alloc_try_nid_nopanic(size, size,
26 __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
27}
28
29static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
30 unsigned long end, int nid)
31{
32 pte_t *pte;
33
34 if (pmd_none(*pmd)) {
35 void *p;
36
37 if (boot_cpu_has(X86_FEATURE_PSE) &&
38 ((end - addr) == PMD_SIZE) &&
39 IS_ALIGNED(addr, PMD_SIZE)) {
40 p = early_alloc(PMD_SIZE, nid);
41 if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
42 return;
43 else if (p)
44 memblock_free(__pa(p), PMD_SIZE);
45 }
46
47 p = early_alloc(PAGE_SIZE, nid);
48 pmd_populate_kernel(&init_mm, pmd, p);
49 }
50
51 pte = pte_offset_kernel(pmd, addr);
52 do {
53 pte_t entry;
54 void *p;
55
56 if (!pte_none(*pte))
57 continue;
58
59 p = early_alloc(PAGE_SIZE, nid);
60 entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
61 set_pte_at(&init_mm, addr, pte, entry);
62 } while (pte++, addr += PAGE_SIZE, addr != end);
63}
64
65static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
66 unsigned long end, int nid)
67{
68 pmd_t *pmd;
69 unsigned long next;
70
71 if (pud_none(*pud)) {
72 void *p;
73
74 if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
75 ((end - addr) == PUD_SIZE) &&
76 IS_ALIGNED(addr, PUD_SIZE)) {
77 p = early_alloc(PUD_SIZE, nid);
78 if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
79 return;
80 else if (p)
81 memblock_free(__pa(p), PUD_SIZE);
82 }
83
84 p = early_alloc(PAGE_SIZE, nid);
85 pud_populate(&init_mm, pud, p);
86 }
87
88 pmd = pmd_offset(pud, addr);
89 do {
90 next = pmd_addr_end(addr, end);
91 if (!pmd_large(*pmd))
92 kasan_populate_pmd(pmd, addr, next, nid);
93 } while (pmd++, addr = next, addr != end);
94}
95
96static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
97 unsigned long end, int nid)
98{
99 pud_t *pud;
100 unsigned long next;
101
102 if (p4d_none(*p4d)) {
103 void *p = early_alloc(PAGE_SIZE, nid);
104
105 p4d_populate(&init_mm, p4d, p);
106 }
107
108 pud = pud_offset(p4d, addr);
109 do {
110 next = pud_addr_end(addr, end);
111 if (!pud_large(*pud))
112 kasan_populate_pud(pud, addr, next, nid);
113 } while (pud++, addr = next, addr != end);
114}
115
116static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
117 unsigned long end, int nid)
118{
119 void *p;
120 p4d_t *p4d;
121 unsigned long next;
122
123 if (pgd_none(*pgd)) {
124 p = early_alloc(PAGE_SIZE, nid);
125 pgd_populate(&init_mm, pgd, p);
126 }
127
128 p4d = p4d_offset(pgd, addr);
129 do {
130 next = p4d_addr_end(addr, end);
131 kasan_populate_p4d(p4d, addr, next, nid);
132 } while (p4d++, addr = next, addr != end);
133}
134
135static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
136 int nid)
137{
138 pgd_t *pgd;
139 unsigned long next;
140
141 addr = addr & PAGE_MASK;
142 end = round_up(end, PAGE_SIZE);
143 pgd = pgd_offset_k(addr);
144 do {
145 next = pgd_addr_end(addr, end);
146 kasan_populate_pgd(pgd, addr, next, nid);
147 } while (pgd++, addr = next, addr != end);
148}
149
150static void __init map_range(struct range *range)
22{ 151{
23 unsigned long start; 152 unsigned long start;
24 unsigned long end; 153 unsigned long end;
@@ -26,7 +155,7 @@ static int __init map_range(struct range *range)
26 start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); 155 start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
27 end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); 156 end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
28 157
29 return vmemmap_populate(start, end, NUMA_NO_NODE); 158 kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
30} 159}
31 160
32static void __init clear_pgds(unsigned long start, 161static void __init clear_pgds(unsigned long start,
@@ -189,16 +318,16 @@ void __init kasan_init(void)
189 if (pfn_mapped[i].end == 0) 318 if (pfn_mapped[i].end == 0)
190 break; 319 break;
191 320
192 if (map_range(&pfn_mapped[i])) 321 map_range(&pfn_mapped[i]);
193 panic("kasan: unable to allocate shadow!");
194 } 322 }
323
195 kasan_populate_zero_shadow( 324 kasan_populate_zero_shadow(
196 kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), 325 kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
197 kasan_mem_to_shadow((void *)__START_KERNEL_map)); 326 kasan_mem_to_shadow((void *)__START_KERNEL_map));
198 327
199 vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), 328 kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
200 (unsigned long)kasan_mem_to_shadow(_end), 329 (unsigned long)kasan_mem_to_shadow(_end),
201 NUMA_NO_NODE); 330 early_pfn_to_nid(__pa(_stext)));
202 331
203 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), 332 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
204 (void *)KASAN_SHADOW_END); 333 (void *)KASAN_SHADOW_END);
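
kasan_populate_pmd()/kasan_populate_pud() above try to cover an exactly aligned PMD- or PUD-sized range with a single huge entry (pmd_set_huge()/pud_set_huge()), and fall back to base pages when the large allocation or the huge mapping is refused, freeing the large block in that case. A standalone sketch of that try-huge-then-fall-back shape; try_map_huge() and the sizes below are illustrative stand-ins, not kernel APIs:

/* Standalone sketch (not kernel code): try to map an aligned, exactly
 * huge-sized range with one large entry, otherwise fall back to base
 * pages.  Build: cc -std=c11 -Wall huge_fallback.c
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096ul
#define PMD_SIZE  (512 * PAGE_SIZE)

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

/* Stand-in for pmd_set_huge(): pretend the feature check can fail. */
static bool try_map_huge(unsigned long addr, void *backing)
{
	(void)addr;
	(void)backing;
	return rand() & 1;
}

/* Return how many mapping entries the range needed. */
static unsigned long populate_range(unsigned long addr, unsigned long end)
{
	if ((end - addr) == PMD_SIZE && IS_ALIGNED(addr, PMD_SIZE)) {
		void *p = malloc(PMD_SIZE);	/* stand-in for early_alloc(PMD_SIZE) */

		if (p && try_map_huge(addr, p))
			return 1;	/* one huge entry covered it; mapping keeps @p */
		free(p);		/* huge path refused: give the block back */
	}

	/* Fallback: one base page per iteration (allocation elided in this sketch). */
	return (end - addr) / PAGE_SIZE;
}

int main(void)
{
	printf("entries used: %lu\n", populate_range(0, PMD_SIZE));	/* huge candidate */
	printf("entries used: %lu\n", populate_range(0, 4 * PAGE_SIZE));/* base pages only */
	return 0;
}
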
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
1obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 872ec4159a68..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,228 +1 @@
1// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
2#include <linux/interrupt.h>
3#include <linux/kdebug.h>
4#include <linux/kmemcheck.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7#include <linux/ptrace.h>
8#include <linux/stacktrace.h>
9#include <linux/string.h>
10
11#include "error.h"
12#include "shadow.h"
13
14enum kmemcheck_error_type {
15 KMEMCHECK_ERROR_INVALID_ACCESS,
16 KMEMCHECK_ERROR_BUG,
17};
18
19#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
20
21struct kmemcheck_error {
22 enum kmemcheck_error_type type;
23
24 union {
25 /* KMEMCHECK_ERROR_INVALID_ACCESS */
26 struct {
27 /* Kind of access that caused the error */
28 enum kmemcheck_shadow state;
29 /* Address and size of the erroneous read */
30 unsigned long address;
31 unsigned int size;
32 };
33 };
34
35 struct pt_regs regs;
36 struct stack_trace trace;
37 unsigned long trace_entries[32];
38
39 /* We compress it to a char. */
40 unsigned char shadow_copy[SHADOW_COPY_SIZE];
41 unsigned char memory_copy[SHADOW_COPY_SIZE];
42};
43
44/*
45 * Create a ring queue of errors to output. We can't call printk() directly
46 * from the kmemcheck traps, since this may call the console drivers and
47 * result in a recursive fault.
48 */
49static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
50static unsigned int error_count;
51static unsigned int error_rd;
52static unsigned int error_wr;
53static unsigned int error_missed_count;
54
55static struct kmemcheck_error *error_next_wr(void)
56{
57 struct kmemcheck_error *e;
58
59 if (error_count == ARRAY_SIZE(error_fifo)) {
60 ++error_missed_count;
61 return NULL;
62 }
63
64 e = &error_fifo[error_wr];
65 if (++error_wr == ARRAY_SIZE(error_fifo))
66 error_wr = 0;
67 ++error_count;
68 return e;
69}
70
71static struct kmemcheck_error *error_next_rd(void)
72{
73 struct kmemcheck_error *e;
74
75 if (error_count == 0)
76 return NULL;
77
78 e = &error_fifo[error_rd];
79 if (++error_rd == ARRAY_SIZE(error_fifo))
80 error_rd = 0;
81 --error_count;
82 return e;
83}
84
85void kmemcheck_error_recall(void)
86{
87 static const char *desc[] = {
88 [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
89 [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
90 [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
91 [KMEMCHECK_SHADOW_FREED] = "freed",
92 };
93
94 static const char short_desc[] = {
95 [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
96 [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
97 [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
98 [KMEMCHECK_SHADOW_FREED] = 'f',
99 };
100
101 struct kmemcheck_error *e;
102 unsigned int i;
103
104 e = error_next_rd();
105 if (!e)
106 return;
107
108 switch (e->type) {
109 case KMEMCHECK_ERROR_INVALID_ACCESS:
110 printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address);
114
115 printk(KERN_WARNING);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk(KERN_CONT "%02x", e->memory_copy[i]);
118 printk(KERN_CONT "\n");
119
120 printk(KERN_WARNING);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
124 else
125 printk(KERN_CONT " ?");
126 }
127 printk(KERN_CONT "\n");
128 printk(KERN_WARNING "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break;
131 case KMEMCHECK_ERROR_BUG:
132 printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
133 break;
134 }
135
136 __show_regs(&e->regs, 1);
137 print_stack_trace(&e->trace, 0);
138}
139
140static void do_wakeup(unsigned long data)
141{
142 while (error_count > 0)
143 kmemcheck_error_recall();
144
145 if (error_missed_count > 0) {
146 printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
147 "the queue was too small\n", error_missed_count);
148 error_missed_count = 0;
149 }
150}
151
152static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
153
154/*
155 * Save the context of an error report.
156 */
157void kmemcheck_error_save(enum kmemcheck_shadow state,
158 unsigned long address, unsigned int size, struct pt_regs *regs)
159{
160 static unsigned long prev_ip;
161
162 struct kmemcheck_error *e;
163 void *shadow_copy;
164 void *memory_copy;
165
166 /* Don't report several adjacent errors from the same EIP. */
167 if (regs->ip == prev_ip)
168 return;
169 prev_ip = regs->ip;
170
171 e = error_next_wr();
172 if (!e)
173 return;
174
175 e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
176
177 e->state = state;
178 e->address = address;
179 e->size = size;
180
181 /* Save regs */
182 memcpy(&e->regs, regs, sizeof(*regs));
183
184 /* Save stack trace */
185 e->trace.nr_entries = 0;
186 e->trace.entries = e->trace_entries;
187 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
188 e->trace.skip = 0;
189 save_stack_trace_regs(regs, &e->trace);
190
191 /* Round address down to nearest 16 bytes */
192 shadow_copy = kmemcheck_shadow_lookup(address
193 & ~(SHADOW_COPY_SIZE - 1));
194 BUG_ON(!shadow_copy);
195
196 memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
197
198 kmemcheck_show_addr(address);
199 memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
200 memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
201 kmemcheck_hide_addr(address);
202
203 tasklet_hi_schedule_first(&kmemcheck_tasklet);
204}
205
206/*
207 * Save the context of a kmemcheck bug.
208 */
209void kmemcheck_error_save_bug(struct pt_regs *regs)
210{
211 struct kmemcheck_error *e;
212
213 e = error_next_wr();
214 if (!e)
215 return;
216
217 e->type = KMEMCHECK_ERROR_BUG;
218
219 memcpy(&e->regs, regs, sizeof(*regs));
220
221 e->trace.nr_entries = 0;
222 e->trace.entries = e->trace_entries;
223 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
224 e->trace.skip = 1;
225 save_stack_trace(&e->trace);
226
227 tasklet_hi_schedule_first(&kmemcheck_tasklet);
228}
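The error_fifo above is a fixed-size ring queue: the trap path only records reports, and the tasklet drains them later so that printk() is never called from the faulting context. For reference, a minimal standalone C sketch of the same enqueue/dequeue bookkeeping (hypothetical, userspace-only; the kernel version additionally relies on kmemcheck forcing a single CPU and on the tasklet for serialization):

#include <stdio.h>

#define QUEUE_SIZE 64

struct report {
	unsigned long address;
	unsigned int size;
};

static struct report fifo[QUEUE_SIZE];
static unsigned int count, rd, wr, missed;

/* Producer: grab the next free slot, or count the report as lost when full. */
static struct report *next_wr(void)
{
	struct report *r;

	if (count == QUEUE_SIZE) {
		++missed;
		return NULL;
	}

	r = &fifo[wr];
	if (++wr == QUEUE_SIZE)
		wr = 0;
	++count;
	return r;
}

/* Consumer: return the oldest queued report, or NULL when the queue is empty. */
static struct report *next_rd(void)
{
	struct report *r;

	if (count == 0)
		return NULL;

	r = &fifo[rd];
	if (++rd == QUEUE_SIZE)
		rd = 0;
	--count;
	return r;
}

int main(void)
{
	struct report *r;

	if ((r = next_wr())) { r->address = 0x1000; r->size = 4; }
	if ((r = next_wr())) { r->address = 0x2000; r->size = 8; }

	/* Drain, as do_wakeup() does via kmemcheck_error_recall(). */
	while ((r = next_rd()))
		printf("report: %#lx, %u bytes\n", r->address, r->size);
	printf("lost reports: %u\n", missed);
	return 0;
}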
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 39f80d7a874d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,16 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
3#define ARCH__X86__MM__KMEMCHECK__ERROR_H
4
5#include <linux/ptrace.h>
6
7#include "shadow.h"
8
9void kmemcheck_error_save(enum kmemcheck_shadow state,
10 unsigned long address, unsigned int size, struct pt_regs *regs);
11
12void kmemcheck_error_save_bug(struct pt_regs *regs);
13
14void kmemcheck_error_recall(void);
15
16#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
1/**
2 * kmemcheck - a heavyweight memory checker for the linux kernel
3 * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
4 * (With a lot of help from Ingo Molnar and Pekka Enberg.)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2) as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/init.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/kernel.h>
15#include <linux/kmemcheck.h>
16#include <linux/mm.h>
17#include <linux/page-flags.h>
18#include <linux/percpu.h>
19#include <linux/ptrace.h>
20#include <linux/string.h>
21#include <linux/types.h>
22
23#include <asm/cacheflush.h>
24#include <asm/kmemcheck.h>
25#include <asm/pgtable.h>
26#include <asm/tlbflush.h>
27
28#include "error.h"
29#include "opcode.h"
30#include "pte.h"
31#include "selftest.h"
32#include "shadow.h"
33
34
35#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
36# define KMEMCHECK_ENABLED 0
37#endif
38
39#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
40# define KMEMCHECK_ENABLED 1
41#endif
42
43#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
44# define KMEMCHECK_ENABLED 2
45#endif
46
47int kmemcheck_enabled = KMEMCHECK_ENABLED;
48
49int __init kmemcheck_init(void)
50{
51#ifdef CONFIG_SMP
52 /*
53 * Limit SMP to use a single CPU. We rely on the fact that this code
54 * runs before SMP is set up.
55 */
56 if (setup_max_cpus > 1) {
57 printk(KERN_INFO
58 "kmemcheck: Limiting number of CPUs to 1.\n");
59 setup_max_cpus = 1;
60 }
61#endif
62
63 if (!kmemcheck_selftest()) {
64 printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
65 kmemcheck_enabled = 0;
66 return -EINVAL;
67 }
68
69 printk(KERN_INFO "kmemcheck: Initialized\n");
70 return 0;
71}
72
73early_initcall(kmemcheck_init);
74
75/*
76 * We need to parse the kmemcheck= option before any memory is allocated.
77 */
78static int __init param_kmemcheck(char *str)
79{
80 int val;
81 int ret;
82
83 if (!str)
84 return -EINVAL;
85
86 ret = kstrtoint(str, 0, &val);
87 if (ret)
88 return ret;
89 kmemcheck_enabled = val;
90 return 0;
91}
92
93early_param("kmemcheck", param_kmemcheck);
94
95int kmemcheck_show_addr(unsigned long address)
96{
97 pte_t *pte;
98
99 pte = kmemcheck_pte_lookup(address);
100 if (!pte)
101 return 0;
102
103 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
104 __flush_tlb_one(address);
105 return 1;
106}
107
108int kmemcheck_hide_addr(unsigned long address)
109{
110 pte_t *pte;
111
112 pte = kmemcheck_pte_lookup(address);
113 if (!pte)
114 return 0;
115
116 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
117 __flush_tlb_one(address);
118 return 1;
119}
120
121struct kmemcheck_context {
122 bool busy;
123 int balance;
124
125 /*
126 * There can be at most two memory operands to an instruction, but
127 * each address can cross a page boundary -- so we may need up to
128 * four addresses that must be hidden/revealed for each fault.
129 */
130 unsigned long addr[4];
131 unsigned long n_addrs;
132 unsigned long flags;
133
134 /* Data size of the instruction that caused a fault. */
135 unsigned int size;
136};
137
138static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
139
140bool kmemcheck_active(struct pt_regs *regs)
141{
142 struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
143
144 return data->balance > 0;
145}
146
147/* Save an address that needs to be shown/hidden */
148static void kmemcheck_save_addr(unsigned long addr)
149{
150 struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
151
152 BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
153 data->addr[data->n_addrs++] = addr;
154}
155
156static unsigned int kmemcheck_show_all(void)
157{
158 struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
159 unsigned int i;
160 unsigned int n;
161
162 n = 0;
163 for (i = 0; i < data->n_addrs; ++i)
164 n += kmemcheck_show_addr(data->addr[i]);
165
166 return n;
167}
168
169static unsigned int kmemcheck_hide_all(void)
170{
171 struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
172 unsigned int i;
173 unsigned int n;
174
175 n = 0;
176 for (i = 0; i < data->n_addrs; ++i)
177 n += kmemcheck_hide_addr(data->addr[i]);
178
179 return n;
180}
181
182/*
183 * Called from the #PF handler.
184 */
185void kmemcheck_show(struct pt_regs *regs)
186{
187 struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
188
189 BUG_ON(!irqs_disabled());
190
191 if (unlikely(data->balance != 0)) {
192 kmemcheck_show_all();
193 kmemcheck_error_save_bug(regs);
194 data->balance = 0;
195 return;
196 }
197
198 /*
199 * None of the addresses actually belonged to kmemcheck. Note that
200 * this is not an error.
201 */
202 if (kmemcheck_show_all() == 0)
203 return;
204
205 ++data->balance;
206
207 /*
208 * The IF needs to be cleared as well, so that the faulting
209 * instruction can run "uninterrupted". Otherwise, we might take
210 * an interrupt and start executing that before we've had a chance
211 * to hide the page again.
212 *
213 * NOTE: In the rare case of multiple faults, we must not override
214 * the original flags:
215 */
216 if (!(regs->flags & X86_EFLAGS_TF))
217 data->flags = regs->flags;
218
219 regs->flags |= X86_EFLAGS_TF;
220 regs->flags &= ~X86_EFLAGS_IF;
221}
222
223/*
224 * Called from the #DB handler.
225 */
226void kmemcheck_hide(struct pt_regs *regs)
227{
228 struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
229 int n;
230
231 BUG_ON(!irqs_disabled());
232
233 if (unlikely(data->balance != 1)) {
234 kmemcheck_show_all();
235 kmemcheck_error_save_bug(regs);
236 data->n_addrs = 0;
237 data->balance = 0;
238
239 if (!(data->flags & X86_EFLAGS_TF))
240 regs->flags &= ~X86_EFLAGS_TF;
241 if (data->flags & X86_EFLAGS_IF)
242 regs->flags |= X86_EFLAGS_IF;
243 return;
244 }
245
246 if (kmemcheck_enabled)
247 n = kmemcheck_hide_all();
248 else
249 n = kmemcheck_show_all();
250
251 if (n == 0)
252 return;
253
254 --data->balance;
255
256 data->n_addrs = 0;
257
258 if (!(data->flags & X86_EFLAGS_TF))
259 regs->flags &= ~X86_EFLAGS_TF;
260 if (data->flags & X86_EFLAGS_IF)
261 regs->flags |= X86_EFLAGS_IF;
262}
263
264void kmemcheck_show_pages(struct page *p, unsigned int n)
265{
266 unsigned int i;
267
268 for (i = 0; i < n; ++i) {
269 unsigned long address;
270 pte_t *pte;
271 unsigned int level;
272
273 address = (unsigned long) page_address(&p[i]);
274 pte = lookup_address(address, &level);
275 BUG_ON(!pte);
276 BUG_ON(level != PG_LEVEL_4K);
277
278 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
279 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
280 __flush_tlb_one(address);
281 }
282}
283
284bool kmemcheck_page_is_tracked(struct page *p)
285{
286 /* This will also check the "hidden" flag of the PTE. */
287 return kmemcheck_pte_lookup((unsigned long) page_address(p));
288}
289
290void kmemcheck_hide_pages(struct page *p, unsigned int n)
291{
292 unsigned int i;
293
294 for (i = 0; i < n; ++i) {
295 unsigned long address;
296 pte_t *pte;
297 unsigned int level;
298
299 address = (unsigned long) page_address(&p[i]);
300 pte = lookup_address(address, &level);
301 BUG_ON(!pte);
302 BUG_ON(level != PG_LEVEL_4K);
303
304 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
305 set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
306 __flush_tlb_one(address);
307 }
308}
309
310/* Access may NOT cross page boundary */
311static void kmemcheck_read_strict(struct pt_regs *regs,
312 unsigned long addr, unsigned int size)
313{
314 void *shadow;
315 enum kmemcheck_shadow status;
316
317 shadow = kmemcheck_shadow_lookup(addr);
318 if (!shadow)
319 return;
320
321 kmemcheck_save_addr(addr);
322 status = kmemcheck_shadow_test(shadow, size);
323 if (status == KMEMCHECK_SHADOW_INITIALIZED)
324 return;
325
326 if (kmemcheck_enabled)
327 kmemcheck_error_save(status, addr, size, regs);
328
329 if (kmemcheck_enabled == 2)
330 kmemcheck_enabled = 0;
331
332 /* Don't warn about it again. */
333 kmemcheck_shadow_set(shadow, size);
334}
335
336bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
337{
338 enum kmemcheck_shadow status;
339 void *shadow;
340
341 shadow = kmemcheck_shadow_lookup(addr);
342 if (!shadow)
343 return true;
344
345 status = kmemcheck_shadow_test_all(shadow, size);
346
347 return status == KMEMCHECK_SHADOW_INITIALIZED;
348}
349
350/* Access may cross page boundary */
351static void kmemcheck_read(struct pt_regs *regs,
352 unsigned long addr, unsigned int size)
353{
354 unsigned long page = addr & PAGE_MASK;
355 unsigned long next_addr = addr + size - 1;
356 unsigned long next_page = next_addr & PAGE_MASK;
357
358 if (likely(page == next_page)) {
359 kmemcheck_read_strict(regs, addr, size);
360 return;
361 }
362
363 /*
364 * What we do is basically to split the access across the
365 * two pages and handle each part separately. Yes, this means
366 * that we may now see reads that are 3 + 5 bytes, for
367 * example (and if both are uninitialized, there will be two
368 * reports), but it makes the code a lot simpler.
369 */
370 kmemcheck_read_strict(regs, addr, next_page - addr);
371 kmemcheck_read_strict(regs, next_page, next_addr - next_page);
372}
373
374static void kmemcheck_write_strict(struct pt_regs *regs,
375 unsigned long addr, unsigned int size)
376{
377 void *shadow;
378
379 shadow = kmemcheck_shadow_lookup(addr);
380 if (!shadow)
381 return;
382
383 kmemcheck_save_addr(addr);
384 kmemcheck_shadow_set(shadow, size);
385}
386
387static void kmemcheck_write(struct pt_regs *regs,
388 unsigned long addr, unsigned int size)
389{
390 unsigned long page = addr & PAGE_MASK;
391 unsigned long next_addr = addr + size - 1;
392 unsigned long next_page = next_addr & PAGE_MASK;
393
394 if (likely(page == next_page)) {
395 kmemcheck_write_strict(regs, addr, size);
396 return;
397 }
398
399 /* See comment in kmemcheck_read(). */
400 kmemcheck_write_strict(regs, addr, next_page - addr);
401 kmemcheck_write_strict(regs, next_page, next_addr - next_page);
402}
403
404/*
405 * Copying is hard. We have two addresses, each of which may be split across
406 * a page (and each page will have different shadow addresses).
407 */
408static void kmemcheck_copy(struct pt_regs *regs,
409 unsigned long src_addr, unsigned long dst_addr, unsigned int size)
410{
411 uint8_t shadow[8];
412 enum kmemcheck_shadow status;
413
414 unsigned long page;
415 unsigned long next_addr;
416 unsigned long next_page;
417
418 uint8_t *x;
419 unsigned int i;
420 unsigned int n;
421
422 BUG_ON(size > sizeof(shadow));
423
424 page = src_addr & PAGE_MASK;
425 next_addr = src_addr + size - 1;
426 next_page = next_addr & PAGE_MASK;
427
428 if (likely(page == next_page)) {
429 /* Same page */
430 x = kmemcheck_shadow_lookup(src_addr);
431 if (x) {
432 kmemcheck_save_addr(src_addr);
433 for (i = 0; i < size; ++i)
434 shadow[i] = x[i];
435 } else {
436 for (i = 0; i < size; ++i)
437 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
438 }
439 } else {
440 n = next_page - src_addr;
441 BUG_ON(n > sizeof(shadow));
442
443 /* First page */
444 x = kmemcheck_shadow_lookup(src_addr);
445 if (x) {
446 kmemcheck_save_addr(src_addr);
447 for (i = 0; i < n; ++i)
448 shadow[i] = x[i];
449 } else {
450 /* Not tracked */
451 for (i = 0; i < n; ++i)
452 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
453 }
454
455 /* Second page */
456 x = kmemcheck_shadow_lookup(next_page);
457 if (x) {
458 kmemcheck_save_addr(next_page);
459 for (i = n; i < size; ++i)
460 shadow[i] = x[i - n];
461 } else {
462 /* Not tracked */
463 for (i = n; i < size; ++i)
464 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
465 }
466 }
467
468 page = dst_addr & PAGE_MASK;
469 next_addr = dst_addr + size - 1;
470 next_page = next_addr & PAGE_MASK;
471
472 if (likely(page == next_page)) {
473 /* Same page */
474 x = kmemcheck_shadow_lookup(dst_addr);
475 if (x) {
476 kmemcheck_save_addr(dst_addr);
477 for (i = 0; i < size; ++i) {
478 x[i] = shadow[i];
479 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
480 }
481 }
482 } else {
483 n = next_page - dst_addr;
484 BUG_ON(n > sizeof(shadow));
485
486 /* First page */
487 x = kmemcheck_shadow_lookup(dst_addr);
488 if (x) {
489 kmemcheck_save_addr(dst_addr);
490 for (i = 0; i < n; ++i) {
491 x[i] = shadow[i];
492 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
493 }
494 }
495
496 /* Second page */
497 x = kmemcheck_shadow_lookup(next_page);
498 if (x) {
499 kmemcheck_save_addr(next_page);
500 for (i = n; i < size; ++i) {
501 x[i - n] = shadow[i];
502 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
503 }
504 }
505 }
506
507 status = kmemcheck_shadow_test(shadow, size);
508 if (status == KMEMCHECK_SHADOW_INITIALIZED)
509 return;
510
511 if (kmemcheck_enabled)
512 kmemcheck_error_save(status, src_addr, size, regs);
513
514 if (kmemcheck_enabled == 2)
515 kmemcheck_enabled = 0;
516}
517
518enum kmemcheck_method {
519 KMEMCHECK_READ,
520 KMEMCHECK_WRITE,
521};
522
523static void kmemcheck_access(struct pt_regs *regs,
524 unsigned long fallback_address, enum kmemcheck_method fallback_method)
525{
526 const uint8_t *insn;
527 const uint8_t *insn_primary;
528 unsigned int size;
529
530 struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
531
532 /* Recursive fault -- ouch. */
533 if (data->busy) {
534 kmemcheck_show_addr(fallback_address);
535 kmemcheck_error_save_bug(regs);
536 return;
537 }
538
539 data->busy = true;
540
541 insn = (const uint8_t *) regs->ip;
542 insn_primary = kmemcheck_opcode_get_primary(insn);
543
544 kmemcheck_opcode_decode(insn, &size);
545
546 switch (insn_primary[0]) {
547#ifdef CONFIG_KMEMCHECK_BITOPS_OK
548 /* AND, OR, XOR */
549 /*
550 * Unfortunately, these instructions have to be excluded from
551 * our regular checking since they access only some (and not
552 * all) bits. This clears out "bogus" bitfield-access warnings.
553 */
554 case 0x80:
555 case 0x81:
556 case 0x82:
557 case 0x83:
558 switch ((insn_primary[1] >> 3) & 7) {
559 /* OR */
560 case 1:
561 /* AND */
562 case 4:
563 /* XOR */
564 case 6:
565 kmemcheck_write(regs, fallback_address, size);
566 goto out;
567
568 /* ADD */
569 case 0:
570 /* ADC */
571 case 2:
572 /* SBB */
573 case 3:
574 /* SUB */
575 case 5:
576 /* CMP */
577 case 7:
578 break;
579 }
580 break;
581#endif
582
583 /* MOVS, MOVSB, MOVSW, MOVSD */
584 case 0xa4:
585 case 0xa5:
586 /*
587 * These instructions are special because they take two
588 * addresses, but we only get one page fault.
589 */
590 kmemcheck_copy(regs, regs->si, regs->di, size);
591 goto out;
592
593 /* CMPS, CMPSB, CMPSW, CMPSD */
594 case 0xa6:
595 case 0xa7:
596 kmemcheck_read(regs, regs->si, size);
597 kmemcheck_read(regs, regs->di, size);
598 goto out;
599 }
600
601 /*
602 * If the opcode isn't special in any way, we use the data from the
603 * page fault handler to determine the address and type of memory
604 * access.
605 */
606 switch (fallback_method) {
607 case KMEMCHECK_READ:
608 kmemcheck_read(regs, fallback_address, size);
609 goto out;
610 case KMEMCHECK_WRITE:
611 kmemcheck_write(regs, fallback_address, size);
612 goto out;
613 }
614
615out:
616 data->busy = false;
617}
618
619bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
620 unsigned long error_code)
621{
622 pte_t *pte;
623
624 /*
625 * XXX: Is it safe to assume that memory accesses from virtual 86
626 * mode or non-kernel code segments will _never_ access kernel
627 * memory (e.g. tracked pages)? For now, we need this to avoid
628 * invoking kmemcheck for PnP BIOS calls.
629 */
630 if (regs->flags & X86_VM_MASK)
631 return false;
632 if (regs->cs != __KERNEL_CS)
633 return false;
634
635 pte = kmemcheck_pte_lookup(address);
636 if (!pte)
637 return false;
638
639 WARN_ON_ONCE(in_nmi());
640
641 if (error_code & 2)
642 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
643 else
644 kmemcheck_access(regs, address, KMEMCHECK_READ);
645
646 kmemcheck_show(regs);
647 return true;
648}
649
650bool kmemcheck_trap(struct pt_regs *regs)
651{
652 if (!kmemcheck_active(regs))
653 return false;
654
655 /* We're done. */
656 kmemcheck_hide(regs);
657 return true;
658}
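kmemcheck_read() and kmemcheck_write() above split an access that straddles a page boundary into two per-page checks, since each page has its own shadow mapping. A minimal standalone sketch of that splitting rule (hypothetical, userspace-only; the part sizes are chosen so that the two halves cover the whole range):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Stand-in for kmemcheck_read_strict(): handles a range within one page. */
static void check_strict(unsigned long addr, unsigned int size)
{
	printf("  check %#lx..%#lx (%u bytes)\n", addr, addr + size - 1, size);
}

/* Split [addr, addr + size) at the page boundary, if it crosses one. */
static void check(unsigned long addr, unsigned int size)
{
	unsigned long page = addr & PAGE_MASK;
	unsigned long last = addr + size - 1;
	unsigned long last_page = last & PAGE_MASK;
	unsigned int first;

	if (page == last_page) {
		check_strict(addr, size);
		return;
	}

	first = last_page - addr;	/* bytes up to the boundary */
	check_strict(addr, first);
	check_strict(last_page, size - first);
}

int main(void)
{
	printf("within one page:\n");
	check(0x2010, 8);
	printf("crossing a page boundary:\n");
	check(0x2ffe, 4);	/* 2 bytes in one page, 2 in the next */
	return 0;
}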
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index df8109ddf7fe..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,107 +1 @@
1// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
2#include <linux/types.h>
3
4#include "opcode.h"
5
6static bool opcode_is_prefix(uint8_t b)
7{
8 return
9 /* Group 1 */
10 b == 0xf0 || b == 0xf2 || b == 0xf3
11 /* Group 2 */
12 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
13 || b == 0x64 || b == 0x65
14 /* Group 3 */
15 || b == 0x66
16 /* Group 4 */
17 || b == 0x67;
18}
19
20#ifdef CONFIG_X86_64
21static bool opcode_is_rex_prefix(uint8_t b)
22{
23 return (b & 0xf0) == 0x40;
24}
25#else
26static bool opcode_is_rex_prefix(uint8_t b)
27{
28 return false;
29}
30#endif
31
32#define REX_W (1 << 3)
33
34/*
35 * This is a VERY crude opcode decoder. We only need to find the size of the
36 * load/store that caused our #PF and this should work for all the opcodes
37 * that we care about. Moreover, the ones who invented this instruction set
38 * should be shot.
39 */
40void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
41{
42 /* Default operand size */
43 int operand_size_override = 4;
44
45 /* prefixes */
46 for (; opcode_is_prefix(*op); ++op) {
47 if (*op == 0x66)
48 operand_size_override = 2;
49 }
50
51 /* REX prefix */
52 if (opcode_is_rex_prefix(*op)) {
53 uint8_t rex = *op;
54
55 ++op;
56 if (rex & REX_W) {
57 switch (*op) {
58 case 0x63:
59 *size = 4;
60 return;
61 case 0x0f:
62 ++op;
63
64 switch (*op) {
65 case 0xb6:
66 case 0xbe:
67 *size = 1;
68 return;
69 case 0xb7:
70 case 0xbf:
71 *size = 2;
72 return;
73 }
74
75 break;
76 }
77
78 *size = 8;
79 return;
80 }
81 }
82
83 /* escape opcode */
84 if (*op == 0x0f) {
85 ++op;
86
87 /*
88 * This is move with zero-extend and sign-extend, respectively;
89 * we don't have to think about 0xb6/0xbe, because this is
90 * already handled in the conditional below.
91 */
92 if (*op == 0xb7 || *op == 0xbf)
93 operand_size_override = 2;
94 }
95
96 *size = (*op & 1) ? operand_size_override : 1;
97}
98
99const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
100{
101 /* skip prefixes */
102 while (opcode_is_prefix(*op))
103 ++op;
104 if (opcode_is_rex_prefix(*op))
105 ++op;
106 return op;
107}
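The decoder above only has to answer one question: how many bytes does the faulting load/store touch? A simplified standalone reimplementation of two of its rules (the 0x66 operand-size prefix and the low opcode bit selecting byte vs. full-width operands; REX handling and the 0x0f escapes are omitted), hypothetical and for illustration only:

#include <stdio.h>
#include <stdint.h>

/*
 * Return the operand size in bytes for a small subset of x86 MOV encodings:
 * an optional 0x66 prefix selects 16-bit operands, and a clear low opcode
 * bit selects the 8-bit form (e.g. 0x8a "mov r8, r/m8" vs 0x8b "mov r32, r/m32").
 */
static unsigned int operand_size(const uint8_t *insn)
{
	unsigned int size = 4;		/* default operand size */

	if (*insn == 0x66) {		/* operand-size override prefix */
		size = 2;
		++insn;
	}

	return (*insn & 1) ? size : 1;
}

int main(void)
{
	const uint8_t mov8[]  = { 0x8a, 0x07 };		/* mov al, [rdi]  -> 1 byte  */
	const uint8_t mov32[] = { 0x8b, 0x07 };		/* mov eax, [rdi] -> 4 bytes */
	const uint8_t mov16[] = { 0x66, 0x8b, 0x07 };	/* mov ax, [rdi]  -> 2 bytes */

	printf("%u %u %u\n", operand_size(mov8), operand_size(mov32),
	       operand_size(mov16));
	return 0;
}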
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 51a1ce94c24a..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,10 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
3#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
4
5#include <linux/types.h>
6
7void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
8const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
9
10#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 8a03be90272a..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,23 +1 @@
1// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
2#include <linux/mm.h>
3
4#include <asm/pgtable.h>
5
6#include "pte.h"
7
8pte_t *kmemcheck_pte_lookup(unsigned long address)
9{
10 pte_t *pte;
11 unsigned int level;
12
13 pte = lookup_address(address, &level);
14 if (!pte)
15 return NULL;
16 if (level != PG_LEVEL_4K)
17 return NULL;
18 if (!pte_hidden(*pte))
19 return NULL;
20
21 return pte;
22}
23
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index b595612382c2..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,11 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
3#define ARCH__X86__MM__KMEMCHECK__PTE_H
4
5#include <linux/mm.h>
6
7#include <asm/pgtable.h>
8
9pte_t *kmemcheck_pte_lookup(unsigned long address);
10
11#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 7ce0be1f99eb..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,71 +1 @@
1// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
2#include <linux/bug.h>
3#include <linux/kernel.h>
4
5#include "opcode.h"
6#include "selftest.h"
7
8struct selftest_opcode {
9 unsigned int expected_size;
10 const uint8_t *insn;
11 const char *desc;
12};
13
14static const struct selftest_opcode selftest_opcodes[] = {
15 /* REP MOVS */
16 {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
17 {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
18
19 /* MOVZX / MOVZXD */
20 {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
21 {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
22
23 /* MOVSX / MOVSXD */
24 {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
25 {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
26
27#ifdef CONFIG_X86_64
28 /* MOVZX / MOVZXD */
29 {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
30 {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
31
32 /* MOVSX / MOVSXD */
33 {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
34 {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
35 {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
36#endif
37};
38
39static bool selftest_opcode_one(const struct selftest_opcode *op)
40{
41 unsigned size;
42
43 kmemcheck_opcode_decode(op->insn, &size);
44
45 if (size == op->expected_size)
46 return true;
47
48 printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
49 op->desc, op->expected_size, size);
50 return false;
51}
52
53static bool selftest_opcodes_all(void)
54{
55 bool pass = true;
56 unsigned int i;
57
58 for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
59 pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
60
61 return pass;
62}
63
64bool kmemcheck_selftest(void)
65{
66 bool pass = true;
67
68 pass = pass && selftest_opcodes_all();
69
70 return pass;
71}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8d759aae453d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,7 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
3#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
4
5bool kmemcheck_selftest(void);
6
7#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
1#include <linux/kmemcheck.h>
2#include <linux/export.h>
3#include <linux/mm.h>
4
5#include <asm/page.h>
6#include <asm/pgtable.h>
7
8#include "pte.h"
9#include "shadow.h"
10
11/*
12 * Return the shadow address for the given address. Returns NULL if the
13 * address is not tracked.
14 *
15 * We need to be extremely careful not to follow any invalid pointers,
16 * because this function can be called for *any* possible address.
17 */
18void *kmemcheck_shadow_lookup(unsigned long address)
19{
20 pte_t *pte;
21 struct page *page;
22
23 if (!virt_addr_valid(address))
24 return NULL;
25
26 pte = kmemcheck_pte_lookup(address);
27 if (!pte)
28 return NULL;
29
30 page = virt_to_page(address);
31 if (!page->shadow)
32 return NULL;
33 return page->shadow + (address & (PAGE_SIZE - 1));
34}
35
36static void mark_shadow(void *address, unsigned int n,
37 enum kmemcheck_shadow status)
38{
39 unsigned long addr = (unsigned long) address;
40 unsigned long last_addr = addr + n - 1;
41 unsigned long page = addr & PAGE_MASK;
42 unsigned long last_page = last_addr & PAGE_MASK;
43 unsigned int first_n;
44 void *shadow;
45
46 /* If the memory range crosses a page boundary, stop there. */
47 if (page == last_page)
48 first_n = n;
49 else
50 first_n = page + PAGE_SIZE - addr;
51
52 shadow = kmemcheck_shadow_lookup(addr);
53 if (shadow)
54 memset(shadow, status, first_n);
55
56 addr += first_n;
57 n -= first_n;
58
59 /* Do full-page memset()s. */
60 while (n >= PAGE_SIZE) {
61 shadow = kmemcheck_shadow_lookup(addr);
62 if (shadow)
63 memset(shadow, status, PAGE_SIZE);
64
65 addr += PAGE_SIZE;
66 n -= PAGE_SIZE;
67 }
68
69 /* Do the remaining page, if any. */
70 if (n > 0) {
71 shadow = kmemcheck_shadow_lookup(addr);
72 if (shadow)
73 memset(shadow, status, n);
74 }
75}
76
77void kmemcheck_mark_unallocated(void *address, unsigned int n)
78{
79 mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
80}
81
82void kmemcheck_mark_uninitialized(void *address, unsigned int n)
83{
84 mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
85}
86
87/*
88 * Fill the shadow memory of the given address such that the memory at that
89 * address is marked as being initialized.
90 */
91void kmemcheck_mark_initialized(void *address, unsigned int n)
92{
93 mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
94}
95EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
96
97void kmemcheck_mark_freed(void *address, unsigned int n)
98{
99 mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
100}
101
102void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
103{
104 unsigned int i;
105
106 for (i = 0; i < n; ++i)
107 kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
108}
109
110void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
111{
112 unsigned int i;
113
114 for (i = 0; i < n; ++i)
115 kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
116}
117
118void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
119{
120 unsigned int i;
121
122 for (i = 0; i < n; ++i)
123 kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
124}
125
126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
127{
128#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
129 uint8_t *x;
130 unsigned int i;
131
132 x = shadow;
133
134 /*
135 * Make sure _some_ bytes are initialized. Gcc frequently generates
136 * code to access neighboring bytes.
137 */
138 for (i = 0; i < size; ++i) {
139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
140 return x[i];
141 }
142
143 return x[0];
144#else
145 return kmemcheck_shadow_test_all(shadow, size);
146#endif
147}
148
149enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
150{
151 uint8_t *x;
152 unsigned int i;
153
154 x = shadow;
155
156 /* All bytes must be initialized. */
157 for (i = 0; i < size; ++i) {
158 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
159 return x[i];
160 }
161
162 return x[0];
163}
164
165void kmemcheck_shadow_set(void *shadow, unsigned int size)
166{
167 uint8_t *x;
168 unsigned int i;
169
170 x = shadow;
171 for (i = 0; i < size; ++i)
172 x[i] = KMEMCHECK_SHADOW_INITIALIZED;
173}
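shadow.c above keeps one status byte per tracked data byte, and a read is only clean when every byte it touches is INITIALIZED. A minimal flat-array sketch of that bookkeeping (hypothetical, userspace-only; the kernel version stores the shadow per page and must cope with untracked pages, which this ignores):

#include <stdio.h>
#include <string.h>

enum shadow {
	SHADOW_UNALLOCATED,
	SHADOW_UNINITIALIZED,
	SHADOW_INITIALIZED,
	SHADOW_FREED,
};

#define BUF_SIZE 32

static unsigned char buf[BUF_SIZE];
static unsigned char shadow[BUF_SIZE];	/* one status byte per data byte */

static void mark(unsigned int off, unsigned int n, enum shadow status)
{
	memset(shadow + off, status, n);
}

/* Like kmemcheck_shadow_test_all(): report the first non-initialized byte. */
static enum shadow test_all(unsigned int off, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; ++i) {
		if (shadow[off + i] != SHADOW_INITIALIZED)
			return shadow[off + i];
	}
	return SHADOW_INITIALIZED;
}

int main(void)
{
	mark(0, BUF_SIZE, SHADOW_UNINITIALIZED);	/* fresh allocation */

	buf[0] = 42;					/* write one byte... */
	mark(0, 1, SHADOW_INITIALIZED);			/* ...and its shadow  */

	printf("read of byte 0:    %d\n", test_all(0, 1));	/* 2: initialized   */
	printf("read of bytes 0-3: %d\n", test_all(0, 4));	/* 1: uninitialized */
	return 0;
}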
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index 49768dc18664..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,19 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
3#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
4
5enum kmemcheck_shadow {
6 KMEMCHECK_SHADOW_UNALLOCATED,
7 KMEMCHECK_SHADOW_UNINITIALIZED,
8 KMEMCHECK_SHADOW_INITIALIZED,
9 KMEMCHECK_SHADOW_FREED,
10};
11
12void *kmemcheck_shadow_lookup(unsigned long address);
13
14enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
15enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
16 unsigned int size);
17void kmemcheck_shadow_set(void *shadow, unsigned int size);
18
19#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
753 753
754 if (!debug_pagealloc_enabled()) 754 if (!debug_pagealloc_enabled())
755 spin_unlock(&cpa_lock); 755 spin_unlock(&cpa_lock);
756 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); 756 base = alloc_pages(GFP_KERNEL, 0);
757 if (!debug_pagealloc_enabled()) 757 if (!debug_pagealloc_enabled())
758 spin_lock(&cpa_lock); 758 spin_lock(&cpa_lock);
759 if (!base) 759 if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
904 904
905static int alloc_pte_page(pmd_t *pmd) 905static int alloc_pte_page(pmd_t *pmd)
906{ 906{
907 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 907 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
908 if (!pte) 908 if (!pte)
909 return -1; 909 return -1;
910 910
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
914 914
915static int alloc_pmd_page(pud_t *pud) 915static int alloc_pmd_page(pud_t *pud)
916{ 916{
917 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 917 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
918 if (!pmd) 918 if (!pmd)
919 return -1; 919 return -1;
920 920
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1120 pgd_entry = cpa->pgd + pgd_index(addr); 1120 pgd_entry = cpa->pgd + pgd_index(addr);
1121 1121
1122 if (pgd_none(*pgd_entry)) { 1122 if (pgd_none(*pgd_entry)) {
1123 p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1123 p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
1124 if (!p4d) 1124 if (!p4d)
1125 return -1; 1125 return -1;
1126 1126
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1132 */ 1132 */
1133 p4d = p4d_offset(pgd_entry, addr); 1133 p4d = p4d_offset(pgd_entry, addr);
1134 if (p4d_none(*p4d)) { 1134 if (p4d_none(*p4d)) {
1135 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1135 pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
1136 if (!pud) 1136 if (!pud)
1137 return -1; 1137 return -1;
1138 1138
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..96d456a94b03 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -7,7 +7,7 @@
7#include <asm/fixmap.h> 7#include <asm/fixmap.h>
8#include <asm/mtrr.h> 8#include <asm/mtrr.h>
9 9
10#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) 10#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
11 11
12#ifdef CONFIG_HIGHPTE 12#ifdef CONFIG_HIGHPTE
13#define PGALLOC_USER_GFP __GFP_HIGHMEM 13#define PGALLOC_USER_GFP __GFP_HIGHMEM
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9e4ee5b04b2d..6a151ce70e86 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void)
207 if (efi_enabled(EFI_OLD_MEMMAP)) 207 if (efi_enabled(EFI_OLD_MEMMAP))
208 return 0; 208 return 0;
209 209
210 gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; 210 gfp_mask = GFP_KERNEL | __GFP_ZERO;
211 efi_pgd = (pgd_t *)__get_free_page(gfp_mask); 211 efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
212 if (!efi_pgd) 212 if (!efi_pgd)
213 return -ENOMEM; 213 return -ENOMEM;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b600463791ec..11097477eeab 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2047,7 +2047,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
2047 * Allocate space for all possible cpus to avoid allocation at 2047 * Allocate space for all possible cpus to avoid allocation at
2048 * runtime 2048 * runtime
2049 */ 2049 */
2050 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 2050 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2051 GFP_KERNEL, node); 2051 GFP_KERNEL, node);
2052 if (!hctx->ctxs) 2052 if (!hctx->ctxs)
2053 goto unregister_cpu_notifier; 2053 goto unregister_cpu_notifier;
diff --git a/crypto/xor.c b/crypto/xor.c
index 263af9fb45ea..bce9fe7af40a 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -122,12 +122,7 @@ calibrate_xor_blocks(void)
122 goto out; 122 goto out;
123 } 123 }
124 124
125 /* 125 b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
126 * Note: Since the memory is not actually used for _anything_ but to
127 * test the XOR speed, we don't really want kmemcheck to warn about
128 * reading uninitialized bytes here.
129 */
130 b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2);
131 if (!b1) { 126 if (!b1) {
132 printk(KERN_WARNING "xor: Yikes! No memory available.\n"); 127 printk(KERN_WARNING "xor: Yikes! No memory available.\n");
133 return -ENOMEM; 128 return -ENOMEM;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c1cf87718c2e..588360d79fca 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -20,6 +20,7 @@
20#include <linux/radix-tree.h> 20#include <linux/radix-tree.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/backing-dev.h>
23#ifdef CONFIG_BLK_DEV_RAM_DAX 24#ifdef CONFIG_BLK_DEV_RAM_DAX
24#include <linux/pfn_t.h> 25#include <linux/pfn_t.h>
25#include <linux/dax.h> 26#include <linux/dax.h>
@@ -448,6 +449,7 @@ static struct brd_device *brd_alloc(int i)
448 disk->flags = GENHD_FL_EXT_DEVT; 449 disk->flags = GENHD_FL_EXT_DEVT;
449 sprintf(disk->disk_name, "ram%d", i); 450 sprintf(disk->disk_name, "ram%d", i);
450 set_capacity(disk, rd_size * 2); 451 set_capacity(disk, rd_size * 2);
452 disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
451 453
452#ifdef CONFIG_BLK_DEV_RAM_DAX 454#ifdef CONFIG_BLK_DEV_RAM_DAX
453 queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); 455 queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 5b8992beffec..4ed0a78fdc09 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -23,15 +23,15 @@ static const char * const backends[] = {
23#if IS_ENABLED(CONFIG_CRYPTO_LZ4) 23#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
24 "lz4", 24 "lz4",
25#endif 25#endif
26#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE)
27 "deflate",
28#endif
29#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) 26#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC)
30 "lz4hc", 27 "lz4hc",
31#endif 28#endif
32#if IS_ENABLED(CONFIG_CRYPTO_842) 29#if IS_ENABLED(CONFIG_CRYPTO_842)
33 "842", 30 "842",
34#endif 31#endif
32#if IS_ENABLED(CONFIG_CRYPTO_ZSTD)
33 "zstd",
34#endif
35 NULL 35 NULL
36}; 36};
37 37
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f149d3e61234..d70eba30003a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -122,14 +122,6 @@ static inline bool is_partial_io(struct bio_vec *bvec)
122} 122}
123#endif 123#endif
124 124
125static void zram_revalidate_disk(struct zram *zram)
126{
127 revalidate_disk(zram->disk);
128 /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */
129 zram->disk->queue->backing_dev_info->capabilities |=
130 BDI_CAP_STABLE_WRITES;
131}
132
133/* 125/*
134 * Check if request is within bounds and aligned on zram logical blocks. 126 * Check if request is within bounds and aligned on zram logical blocks.
135 */ 127 */
@@ -436,7 +428,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
436 WARN_ON_ONCE(!was_set); 428 WARN_ON_ONCE(!was_set);
437} 429}
438 430
439void zram_page_end_io(struct bio *bio) 431static void zram_page_end_io(struct bio *bio)
440{ 432{
441 struct page *page = bio->bi_io_vec[0].bv_page; 433 struct page *page = bio->bi_io_vec[0].bv_page;
442 434
@@ -1373,7 +1365,8 @@ static ssize_t disksize_store(struct device *dev,
1373 zram->comp = comp; 1365 zram->comp = comp;
1374 zram->disksize = disksize; 1366 zram->disksize = disksize;
1375 set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); 1367 set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
1376 zram_revalidate_disk(zram); 1368
1369 revalidate_disk(zram->disk);
1377 up_write(&zram->init_lock); 1370 up_write(&zram->init_lock);
1378 1371
1379 return len; 1372 return len;
@@ -1420,7 +1413,7 @@ static ssize_t reset_store(struct device *dev,
1420 /* Make sure all the pending I/O are finished */ 1413 /* Make sure all the pending I/O are finished */
1421 fsync_bdev(bdev); 1414 fsync_bdev(bdev);
1422 zram_reset_device(zram); 1415 zram_reset_device(zram);
1423 zram_revalidate_disk(zram); 1416 revalidate_disk(zram->disk);
1424 bdput(bdev); 1417 bdput(bdev);
1425 1418
1426 mutex_lock(&bdev->bd_mutex); 1419 mutex_lock(&bdev->bd_mutex);
@@ -1539,6 +1532,7 @@ static int zram_add(void)
1539 /* zram devices sort of resembles non-rotational disks */ 1532 /* zram devices sort of resembles non-rotational disks */
1540 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); 1533 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
1541 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); 1534 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
1535
1542 /* 1536 /*
1543 * To ensure that we always get PAGE_SIZE aligned 1537 * To ensure that we always get PAGE_SIZE aligned
1544 * and n*PAGE_SIZED sized I/O requests. 1538 * and n*PAGE_SIZED sized I/O requests.
@@ -1563,6 +1557,8 @@ static int zram_add(void)
1563 if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) 1557 if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
1564 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); 1558 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
1565 1559
1560 zram->disk->queue->backing_dev_info->capabilities |=
1561 (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO);
1566 add_disk(zram->disk); 1562 add_disk(zram->disk);
1567 1563
1568 ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, 1564 ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 6c7ccac2679e..ec42c8bb9b0d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -259,7 +259,6 @@
259#include <linux/cryptohash.h> 259#include <linux/cryptohash.h>
260#include <linux/fips.h> 260#include <linux/fips.h>
261#include <linux/ptrace.h> 261#include <linux/ptrace.h>
262#include <linux/kmemcheck.h>
263#include <linux/workqueue.h> 262#include <linux/workqueue.h>
264#include <linux/irq.h> 263#include <linux/irq.h>
265#include <linux/syscalls.h> 264#include <linux/syscalls.h>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 60d8bedb694d..cd664832f9e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -553,8 +553,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
553 * invalidated it. Free it and try again 553 * invalidated it. Free it and try again
554 */ 554 */
555 release_pages(e->user_pages, 555 release_pages(e->user_pages,
556 e->robj->tbo.ttm->num_pages, 556 e->robj->tbo.ttm->num_pages);
557 false);
558 kvfree(e->user_pages); 557 kvfree(e->user_pages);
559 e->user_pages = NULL; 558 e->user_pages = NULL;
560 } 559 }
@@ -691,8 +690,7 @@ error_free_pages:
691 continue; 690 continue;
692 691
693 release_pages(e->user_pages, 692 release_pages(e->user_pages,
694 e->robj->tbo.ttm->num_pages, 693 e->robj->tbo.ttm->num_pages);
695 false);
696 kvfree(e->user_pages); 694 kvfree(e->user_pages);
697 } 695 }
698 } 696 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 6149a47fe63d..0bda8f2a188a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -347,7 +347,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data,
347 return 0; 347 return 0;
348 348
349free_pages: 349free_pages:
350 release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages, false); 350 release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages);
351 351
352unlock_mmap_sem: 352unlock_mmap_sem:
353 up_read(&current->mm->mmap_sem); 353 up_read(&current->mm->mmap_sem);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index bc746131987f..d792959fac43 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -659,7 +659,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
659 return 0; 659 return 0;
660 660
661release_pages: 661release_pages:
662 release_pages(pages, pinned, 0); 662 release_pages(pages, pinned);
663 return r; 663 return r;
664} 664}
665 665
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 57881167ccd2..bcc8c2d7c7c9 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -779,7 +779,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
779 up_read(&mm->mmap_sem); 779 up_read(&mm->mmap_sem);
780 780
781 if (ret < 0) { 781 if (ret < 0) {
782 release_pages(pvec, pinned, 0); 782 release_pages(pvec, pinned);
783 kvfree(pvec); 783 kvfree(pvec);
784 return ERR_PTR(ret); 784 return ERR_PTR(ret);
785 } 785 }
@@ -852,7 +852,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj)
852 } 852 }
853 } 853 }
854 854
855 release_pages(pvec, pinned, 0); 855 release_pages(pvec, pinned);
856 kvfree(pvec); 856 kvfree(pvec);
857 857
858 work = kmalloc(sizeof(*work), GFP_KERNEL); 858 work = kmalloc(sizeof(*work), GFP_KERNEL);
@@ -886,7 +886,7 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj)
886 if (etnaviv_obj->pages) { 886 if (etnaviv_obj->pages) {
887 int npages = etnaviv_obj->base.size >> PAGE_SHIFT; 887 int npages = etnaviv_obj->base.size >> PAGE_SHIFT;
888 888
889 release_pages(etnaviv_obj->pages, npages, 0); 889 release_pages(etnaviv_obj->pages, npages);
890 kvfree(etnaviv_obj->pages); 890 kvfree(etnaviv_obj->pages);
891 } 891 }
892 put_task_struct(etnaviv_obj->userptr.task); 892 put_task_struct(etnaviv_obj->userptr.task);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index ad524cb0f6fc..7982ad817c11 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1859,7 +1859,7 @@ static void i915_address_space_init(struct i915_address_space *vm,
1859 INIT_LIST_HEAD(&vm->unbound_list); 1859 INIT_LIST_HEAD(&vm->unbound_list);
1860 1860
1861 list_add_tail(&vm->global_link, &dev_priv->vm_list); 1861 list_add_tail(&vm->global_link, &dev_priv->vm_list);
1862 pagevec_init(&vm->free_pages, false); 1862 pagevec_init(&vm->free_pages);
1863} 1863}
1864 1864
1865static void i915_address_space_fini(struct i915_address_space *vm) 1865static void i915_address_space_fini(struct i915_address_space *vm)
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 709efe2357ea..aa22361bd5a1 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -554,7 +554,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
554 } 554 }
555 mutex_unlock(&obj->mm.lock); 555 mutex_unlock(&obj->mm.lock);
556 556
557 release_pages(pvec, pinned, 0); 557 release_pages(pvec, pinned);
558 kvfree(pvec); 558 kvfree(pvec);
559 559
560 i915_gem_object_put(obj); 560 i915_gem_object_put(obj);
@@ -668,7 +668,7 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
668 __i915_gem_userptr_set_active(obj, true); 668 __i915_gem_userptr_set_active(obj, true);
669 669
670 if (IS_ERR(pages)) 670 if (IS_ERR(pages))
671 release_pages(pvec, pinned, 0); 671 release_pages(pvec, pinned);
672 kvfree(pvec); 672 kvfree(pvec);
673 673
674 return pages; 674 return pages;
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index bf69bf9086bf..1fdfc7a46072 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -597,7 +597,7 @@ release_sg:
597 kfree(ttm->sg); 597 kfree(ttm->sg);
598 598
599release_pages: 599release_pages:
600 release_pages(ttm->pages, pinned, 0); 600 release_pages(ttm->pages, pinned);
601 return r; 601 return r;
602} 602}
603 603
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
index 5243ad30dfc0..85dfbba427f6 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -1667,8 +1667,9 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
1667 } 1667 }
1668 if (!rcd->rcvegrbuf_phys) { 1668 if (!rcd->rcvegrbuf_phys) {
1669 rcd->rcvegrbuf_phys = 1669 rcd->rcvegrbuf_phys =
1670 kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), 1670 kmalloc_array_node(chunk,
1671 GFP_KERNEL, rcd->node_id); 1671 sizeof(rcd->rcvegrbuf_phys[0]),
1672 GFP_KERNEL, rcd->node_id);
1672 if (!rcd->rcvegrbuf_phys) 1673 if (!rcd->rcvegrbuf_phys)
1673 goto bail_rcvegrbuf; 1674 goto bail_rcvegrbuf;
1674 } 1675 }
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 410025a19729..9177df60742a 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -238,7 +238,7 @@ int rvt_driver_qp_init(struct rvt_dev_info *rdi)
238 rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size; 238 rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
239 rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size); 239 rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
240 rdi->qp_dev->qp_table = 240 rdi->qp_dev->qp_table =
241 kmalloc_node(rdi->qp_dev->qp_table_size * 241 kmalloc_array_node(rdi->qp_dev->qp_table_size,
242 sizeof(*rdi->qp_dev->qp_table), 242 sizeof(*rdi->qp_dev->qp_table),
243 GFP_KERNEL, rdi->dparms.node); 243 GFP_KERNEL, rdi->dparms.node);
244 if (!rdi->qp_dev->qp_table) 244 if (!rdi->qp_dev->qp_table)
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index 1922cb8f6b88..1c5b7aec13d4 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -15,7 +15,6 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/kmemcheck.h>
19#include <linux/ctype.h> 18#include <linux/ctype.h>
20#include <linux/delay.h> 19#include <linux/delay.h>
21#include <linux/idr.h> 20#include <linux/idr.h>
@@ -904,7 +903,6 @@ struct c2port_device *c2port_device_register(char *name,
904 return ERR_PTR(-EINVAL); 903 return ERR_PTR(-EINVAL);
905 904
906 c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); 905 c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL);
907 kmemcheck_annotate_bitfield(c2dev, flags);
908 if (unlikely(!c2dev)) 906 if (unlikely(!c2dev))
909 return ERR_PTR(-ENOMEM); 907 return ERR_PTR(-ENOMEM);
910 908
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 5417e4da64ca..7451922c209d 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num)
517 517
518 518
519 rc = ena_alloc_rx_page(rx_ring, rx_info, 519 rc = ena_alloc_rx_page(rx_ring, rx_info,
520 __GFP_COLD | GFP_ATOMIC | __GFP_COMP); 520 GFP_ATOMIC | __GFP_COMP);
521 if (unlikely(rc < 0)) { 521 if (unlikely(rc < 0)) {
522 netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, 522 netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev,
523 "failed to alloc buffer for rx queue %d\n", 523 "failed to alloc buffer for rx queue %d\n",
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
index 45d92304068e..cc1e4f820e64 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
@@ -295,7 +295,7 @@ again:
295 order = alloc_order; 295 order = alloc_order;
296 296
297 /* Try to obtain pages, decreasing order if necessary */ 297 /* Try to obtain pages, decreasing order if necessary */
298 gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN; 298 gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN;
299 while (order >= 0) { 299 while (order >= 0) {
300 pages = alloc_pages_node(node, gfp, order); 300 pages = alloc_pages_node(node, gfp, order);
301 if (pages) 301 if (pages)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 0654e0c76bc2..519ca6534b85 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self)
304 buff->flags = 0U; 304 buff->flags = 0U;
305 buff->len = AQ_CFG_RX_FRAME_MAX; 305 buff->len = AQ_CFG_RX_FRAME_MAX;
306 306
307 buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD | 307 buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order);
308 __GFP_COMP, pages_order);
309 if (!buff->page) { 308 if (!buff->page) {
310 err = -ENOMEM; 309 err = -ENOMEM;
311 goto err_exit; 310 goto err_exit;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 433f3619de8f..f2d1a076a038 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -198,7 +198,7 @@ static inline void
198 struct sk_buff *skb; 198 struct sk_buff *skb;
199 struct octeon_skb_page_info *skb_pg_info; 199 struct octeon_skb_page_info *skb_pg_info;
200 200
201 page = alloc_page(GFP_ATOMIC | __GFP_COLD); 201 page = alloc_page(GFP_ATOMIC);
202 if (unlikely(!page)) 202 if (unlikely(!page))
203 return NULL; 203 return NULL;
204 204
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 92aec17f4b4d..85e28efcda33 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
193 193
194 if (mlx4_en_prepare_rx_desc(priv, ring, 194 if (mlx4_en_prepare_rx_desc(priv, ring,
195 ring->actual_size, 195 ring->actual_size,
196 GFP_KERNEL | __GFP_COLD)) { 196 GFP_KERNEL)) {
197 if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { 197 if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
198 en_err(priv, "Failed to allocate enough rx buffers\n"); 198 en_err(priv, "Failed to allocate enough rx buffers\n");
199 return -ENOMEM; 199 return -ENOMEM;
@@ -551,8 +551,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
551 do { 551 do {
552 if (mlx4_en_prepare_rx_desc(priv, ring, 552 if (mlx4_en_prepare_rx_desc(priv, ring,
553 ring->prod & ring->size_mask, 553 ring->prod & ring->size_mask,
554 GFP_ATOMIC | __GFP_COLD | 554 GFP_ATOMIC | __GFP_MEMALLOC))
555 __GFP_MEMALLOC))
556 break; 555 break;
557 ring->prod++; 556 ring->prod++;
558 } while (likely(--missing)); 557 } while (likely(--missing));
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 232044b1b7aa..1a603fdd9e80 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
1185 } else { 1185 } else {
1186 struct page *page; 1186 struct page *page;
1187 1187
1188 page = alloc_page(GFP_KERNEL | __GFP_COLD); 1188 page = alloc_page(GFP_KERNEL);
1189 frag = page ? page_address(page) : NULL; 1189 frag = page ? page_address(page) : NULL;
1190 } 1190 }
1191 if (!frag) { 1191 if (!frag) {
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 29fea74bff2e..7b97a9969046 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring,
1092{ 1092{
1093 if (!rx_ring->pg_chunk.page) { 1093 if (!rx_ring->pg_chunk.page) {
1094 u64 map; 1094 u64 map;
1095 rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP | 1095 rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC,
1096 GFP_ATOMIC,
1097 qdev->lbq_buf_order); 1096 qdev->lbq_buf_order);
1098 if (unlikely(!rx_ring->pg_chunk.page)) { 1097 if (unlikely(!rx_ring->pg_chunk.page)) {
1099 netif_err(qdev, drv, qdev->ndev, 1098 netif_err(qdev, drv, qdev->ndev,
diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c
index 382019b302db..02456ed13a7d 100644
--- a/drivers/net/ethernet/sfc/falcon/rx.c
+++ b/drivers/net/ethernet/sfc/falcon/rx.c
@@ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic)
163 do { 163 do {
164 page = ef4_reuse_page(rx_queue); 164 page = ef4_reuse_page(rx_queue);
165 if (page == NULL) { 165 if (page == NULL) {
166 page = alloc_pages(__GFP_COLD | __GFP_COMP | 166 page = alloc_pages(__GFP_COMP |
167 (atomic ? GFP_ATOMIC : GFP_KERNEL), 167 (atomic ? GFP_ATOMIC : GFP_KERNEL),
168 efx->rx_buffer_order); 168 efx->rx_buffer_order);
169 if (unlikely(page == NULL)) 169 if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 8cb60513dca2..cfe76aad79ee 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
163 do { 163 do {
164 page = efx_reuse_page(rx_queue); 164 page = efx_reuse_page(rx_queue);
165 if (page == NULL) { 165 if (page == NULL) {
166 page = alloc_pages(__GFP_COLD | __GFP_COMP | 166 page = alloc_pages(__GFP_COMP |
167 (atomic ? GFP_ATOMIC : GFP_KERNEL), 167 (atomic ? GFP_ATOMIC : GFP_KERNEL),
168 efx->rx_buffer_order); 168 efx->rx_buffer_order);
169 if (unlikely(page == NULL)) 169 if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
index e9672b1f9968..031cf9c3435a 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
@@ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata,
335 dma_addr_t pages_dma; 335 dma_addr_t pages_dma;
336 336
337 /* Try to obtain pages, decreasing order if necessary */ 337 /* Try to obtain pages, decreasing order if necessary */
338 gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN; 338 gfp |= __GFP_COMP | __GFP_NOWARN;
339 while (order >= 0) { 339 while (order >= 0) {
340 pages = alloc_pages(gfp, order); 340 pages = alloc_pages(gfp, order);
341 if (pages) 341 if (pages)
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 15e2e3031d36..ed58c746e4af 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
906 sw_data[0] = (u32)bufptr; 906 sw_data[0] = (u32)bufptr;
907 } else { 907 } else {
908 /* Allocate a secondary receive queue entry */ 908 /* Allocate a secondary receive queue entry */
909 page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD); 909 page = alloc_page(GFP_ATOMIC | GFP_DMA);
910 if (unlikely(!page)) { 910 if (unlikely(!page)) {
911 dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n"); 911 dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n");
912 goto fail; 912 goto fail;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index edf984406ba0..19a985ef9104 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1030,7 +1030,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
1030 int err; 1030 int err;
1031 bool oom; 1031 bool oom;
1032 1032
1033 gfp |= __GFP_COLD;
1034 do { 1033 do {
1035 if (vi->mergeable_rx_bufs) 1034 if (vi->mergeable_rx_bufs)
1036 err = add_recvbuf_mergeable(vi, rq, gfp); 1035 err = add_recvbuf_mergeable(vi, rq, gfp);
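The driver hunks above all perform the same mechanical conversion: this series removes the __GFP_COLD page-allocator hint, so every receive-buffer allocation simply drops that bit while keeping its other gfp flags. A minimal before/after sketch of such a call site (illustrative only, not taken from any single driver):

	/* before: page = alloc_page(GFP_ATOMIC | __GFP_COLD); */
	page = alloc_page(GFP_ATOMIC);		/* cold hint dropped, other flags unchanged */
	if (unlikely(!page))
		goto fail;			/* hypothetical error label */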
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index d5612bd1cc81..e949e3302af4 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -23,6 +23,7 @@
23#include <linux/ndctl.h> 23#include <linux/ndctl.h>
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/nd.h> 25#include <linux/nd.h>
26#include <linux/backing-dev.h>
26#include "btt.h" 27#include "btt.h"
27#include "nd.h" 28#include "nd.h"
28 29
@@ -1402,6 +1403,8 @@ static int btt_blk_init(struct btt *btt)
1402 btt->btt_disk->private_data = btt; 1403 btt->btt_disk->private_data = btt;
1403 btt->btt_disk->queue = btt->btt_queue; 1404 btt->btt_disk->queue = btt->btt_queue;
1404 btt->btt_disk->flags = GENHD_FL_EXT_DEVT; 1405 btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
1406 btt->btt_disk->queue->backing_dev_info->capabilities |=
1407 BDI_CAP_SYNCHRONOUS_IO;
1405 1408
1406 blk_queue_make_request(btt->btt_queue, btt_make_request); 1409 blk_queue_make_request(btt->btt_queue, btt_make_request);
1407 blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); 1410 blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 39dfd7affa31..7fbc5c5dc8e1 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -31,6 +31,7 @@
31#include <linux/uio.h> 31#include <linux/uio.h>
32#include <linux/dax.h> 32#include <linux/dax.h>
33#include <linux/nd.h> 33#include <linux/nd.h>
34#include <linux/backing-dev.h>
34#include "pmem.h" 35#include "pmem.h"
35#include "pfn.h" 36#include "pfn.h"
36#include "nd.h" 37#include "nd.h"
@@ -394,6 +395,7 @@ static int pmem_attach_disk(struct device *dev,
394 disk->fops = &pmem_fops; 395 disk->fops = &pmem_fops;
395 disk->queue = q; 396 disk->queue = q;
396 disk->flags = GENHD_FL_EXT_DEVT; 397 disk->flags = GENHD_FL_EXT_DEVT;
398 disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
397 nvdimm_namespace_disk_name(ndns, disk->disk_name); 399 nvdimm_namespace_disk_name(ndns, disk->disk_name);
398 set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) 400 set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
399 / 512); 401 / 512);
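Both nvdimm hunks set the same new capability bit on the disk's backing_dev_info: a device flagged with BDI_CAP_SYNCHRONOUS_IO advertises that completing the I/O inline is cheaper than going through the asynchronous swap-in path. A sketch of how a consumer might test the bit (assumed usage, not code from this patch):

	if (inode_to_bdi(inode)->capabilities & BDI_CAP_SYNCHRONOUS_IO)
		do_io_synchronously(page);	/* hypothetical fast path */
	else
		queue_async_io(page);		/* hypothetical default path */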
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index 9e538a59f09d..03e55bca4ada 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0)
1152 } 1152 }
1153 1153
1154 for (npages = 1; npages < max_pages; npages++) { 1154 for (npages = 1; npages < max_pages; npages++) {
1155 page = page_cache_alloc_cold(inode->i_mapping); 1155 page = page_cache_alloc(inode->i_mapping);
1156 if (!page) 1156 if (!page)
1157 break; 1157 break;
1158 page_pool[npages] = page; 1158 page_pool[npages] = page;
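With the cold hint gone, the _cold variants of the page-cache allocators go away too; pool-filling loops like the one above now call the plain helper. Sketch (assumed context):

	page = page_cache_alloc(inode->i_mapping);	/* was page_cache_alloc_cold() */
	if (!page)
		break;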
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 106e43db1115..11dd0526b96b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -308,7 +308,7 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error,
308 _enter("{%x:%u},%lx-%lx", 308 _enter("{%x:%u},%lx-%lx",
309 vnode->fid.vid, vnode->fid.vnode, first, last); 309 vnode->fid.vid, vnode->fid.vnode, first, last);
310 310
311 pagevec_init(&pv, 0); 311 pagevec_init(&pv);
312 312
313 do { 313 do {
314 _debug("kill %lx-%lx", first, last); 314 _debug("kill %lx-%lx", first, last);
@@ -497,20 +497,13 @@ static int afs_writepages_region(struct address_space *mapping,
497 _enter(",,%lx,%lx,", index, end); 497 _enter(",,%lx,%lx,", index, end);
498 498
499 do { 499 do {
500 n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, 500 n = find_get_pages_range_tag(mapping, &index, end,
501 1, &page); 501 PAGECACHE_TAG_DIRTY, 1, &page);
502 if (!n) 502 if (!n)
503 break; 503 break;
504 504
505 _debug("wback %lx", page->index); 505 _debug("wback %lx", page->index);
506 506
507 if (page->index > end) {
508 *_next = index;
509 put_page(page);
510 _leave(" = 0 [%lx]", *_next);
511 return 0;
512 }
513
514 /* at this point we hold neither mapping->tree_lock nor lock on 507 /* at this point we hold neither mapping->tree_lock nor lock on
515 * the page itself: the page may be truncated or invalidated 508 * the page itself: the page may be truncated or invalidated
516 * (changing page->mapping to NULL), or even swizzled back from 509 * (changing page->mapping to NULL), or even swizzled back from
@@ -609,7 +602,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
609 602
610 ASSERT(wb != NULL); 603 ASSERT(wb != NULL);
611 604
612 pagevec_init(&pv, 0); 605 pagevec_init(&pv);
613 606
614 do { 607 do {
615 _debug("done %lx-%lx", first, last); 608 _debug("done %lx-%lx", first, last);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index adbbc017191c..16045ea86fc1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3797,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping,
3797 int scanned = 0; 3797 int scanned = 0;
3798 int tag; 3798 int tag;
3799 3799
3800 pagevec_init(&pvec, 0); 3800 pagevec_init(&pvec);
3801 if (wbc->range_cyclic) { 3801 if (wbc->range_cyclic) {
3802 index = mapping->writeback_index; /* Start from prev offset */ 3802 index = mapping->writeback_index; /* Start from prev offset */
3803 end = -1; 3803 end = -1;
@@ -3814,8 +3814,8 @@ retry:
3814 if (wbc->sync_mode == WB_SYNC_ALL) 3814 if (wbc->sync_mode == WB_SYNC_ALL)
3815 tag_pages_for_writeback(mapping, index, end); 3815 tag_pages_for_writeback(mapping, index, end);
3816 while (!done && !nr_to_write_done && (index <= end) && 3816 while (!done && !nr_to_write_done && (index <= end) &&
3817 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3817 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
3818 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3818 tag))) {
3819 unsigned i; 3819 unsigned i;
3820 3820
3821 scanned = 1; 3821 scanned = 1;
@@ -3825,11 +3825,6 @@ retry:
3825 if (!PagePrivate(page)) 3825 if (!PagePrivate(page))
3826 continue; 3826 continue;
3827 3827
3828 if (!wbc->range_cyclic && page->index > end) {
3829 done = 1;
3830 break;
3831 }
3832
3833 spin_lock(&mapping->private_lock); 3828 spin_lock(&mapping->private_lock);
3834 if (!PagePrivate(page)) { 3829 if (!PagePrivate(page)) {
3835 spin_unlock(&mapping->private_lock); 3830 spin_unlock(&mapping->private_lock);
@@ -3941,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
3941 if (!igrab(inode)) 3936 if (!igrab(inode))
3942 return 0; 3937 return 0;
3943 3938
3944 pagevec_init(&pvec, 0); 3939 pagevec_init(&pvec);
3945 if (wbc->range_cyclic) { 3940 if (wbc->range_cyclic) {
3946 index = mapping->writeback_index; /* Start from prev offset */ 3941 index = mapping->writeback_index; /* Start from prev offset */
3947 end = -1; 3942 end = -1;
@@ -3961,8 +3956,8 @@ retry:
3961 tag_pages_for_writeback(mapping, index, end); 3956 tag_pages_for_writeback(mapping, index, end);
3962 done_index = index; 3957 done_index = index;
3963 while (!done && !nr_to_write_done && (index <= end) && 3958 while (!done && !nr_to_write_done && (index <= end) &&
3964 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3959 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
3965 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3960 &index, end, tag))) {
3966 unsigned i; 3961 unsigned i;
3967 3962
3968 scanned = 1; 3963 scanned = 1;
@@ -3987,12 +3982,6 @@ retry:
3987 continue; 3982 continue;
3988 } 3983 }
3989 3984
3990 if (!wbc->range_cyclic && page->index > end) {
3991 done = 1;
3992 unlock_page(page);
3993 continue;
3994 }
3995
3996 if (wbc->sync_mode != WB_SYNC_NONE) { 3985 if (wbc->sync_mode != WB_SYNC_NONE) {
3997 if (PageWriteback(page)) 3986 if (PageWriteback(page))
3998 flush_fn(data); 3987 flush_fn(data);
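The same conversion in loop form: pagevec_lookup_range_tag() stops at the end index itself, so callers no longer clamp the request to PAGEVEC_SIZE or re-check page->index inside the loop. A sketch of the resulting writeback loop (mapping, start and end assumed; writeback_one_page() is a placeholder):

	struct pagevec pvec;
	pgoff_t index = start;
	unsigned int i, nr;

	pagevec_init(&pvec);
	while ((nr = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					      PAGECACHE_TAG_DIRTY))) {
		for (i = 0; i < nr; i++)
			writeback_one_page(pvec.pages[i]);	/* hypothetical */
		pagevec_release(&pvec);
		cond_resched();
	}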
diff --git a/fs/buffer.c b/fs/buffer.c
index 1c18a22a6013..0736a6a2e2f0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1592,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1592 struct buffer_head *head; 1592 struct buffer_head *head;
1593 1593
1594 end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); 1594 end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1595 pagevec_init(&pvec, 0); 1595 pagevec_init(&pvec);
1596 while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { 1596 while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
1597 count = pagevec_count(&pvec); 1597 count = pagevec_count(&pvec);
1598 for (i = 0; i < count; i++) { 1598 for (i = 0; i < count; i++) {
@@ -3514,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
3514 if (length <= 0) 3514 if (length <= 0)
3515 return -ENOENT; 3515 return -ENOENT;
3516 3516
3517 pagevec_init(&pvec, 0); 3517 pagevec_init(&pvec);
3518 3518
3519 do { 3519 do {
3520 unsigned nr_pages, i; 3520 unsigned nr_pages, i;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 18d7aa61ef0f..883bc7bb12c5 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
256 goto backing_page_already_present; 256 goto backing_page_already_present;
257 257
258 if (!newpage) { 258 if (!newpage) {
259 newpage = __page_cache_alloc(cachefiles_gfp | 259 newpage = __page_cache_alloc(cachefiles_gfp);
260 __GFP_COLD);
261 if (!newpage) 260 if (!newpage)
262 goto nomem_monitor; 261 goto nomem_monitor;
263 } 262 }
@@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
493 goto backing_page_already_present; 492 goto backing_page_already_present;
494 493
495 if (!newpage) { 494 if (!newpage) {
496 newpage = __page_cache_alloc(cachefiles_gfp | 495 newpage = __page_cache_alloc(cachefiles_gfp);
497 __GFP_COLD);
498 if (!newpage) 496 if (!newpage)
499 goto nomem; 497 goto nomem;
500 } 498 }
@@ -710,7 +708,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
710 /* calculate the shift required to use bmap */ 708 /* calculate the shift required to use bmap */
711 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 709 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
712 710
713 pagevec_init(&pagevec, 0); 711 pagevec_init(&pagevec);
714 712
715 op->op.flags &= FSCACHE_OP_KEEP_FLAGS; 713 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
716 op->op.flags |= FSCACHE_OP_ASYNC; 714 op->op.flags |= FSCACHE_OP_ASYNC;
@@ -844,7 +842,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
844 842
845 ret = cachefiles_has_space(cache, 0, *nr_pages); 843 ret = cachefiles_has_space(cache, 0, *nr_pages);
846 if (ret == 0) { 844 if (ret == 0) {
847 pagevec_init(&pagevec, 0); 845 pagevec_init(&pagevec);
848 846
849 list_for_each_entry(page, pages, lru) { 847 list_for_each_entry(page, pages, lru) {
850 if (pagevec_add(&pagevec, page) == 0) 848 if (pagevec_add(&pagevec, page) == 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4d622654bfbc..dbf07051aacd 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -680,7 +680,7 @@ static void ceph_release_pages(struct page **pages, int num)
680 struct pagevec pvec; 680 struct pagevec pvec;
681 int i; 681 int i;
682 682
683 pagevec_init(&pvec, 0); 683 pagevec_init(&pvec);
684 for (i = 0; i < num; i++) { 684 for (i = 0; i < num; i++) {
685 if (pagevec_add(&pvec, pages[i]) == 0) 685 if (pagevec_add(&pvec, pages[i]) == 0)
686 pagevec_release(&pvec); 686 pagevec_release(&pvec);
@@ -811,7 +811,7 @@ static int ceph_writepages_start(struct address_space *mapping,
811 if (fsc->mount_options->wsize < wsize) 811 if (fsc->mount_options->wsize < wsize)
812 wsize = fsc->mount_options->wsize; 812 wsize = fsc->mount_options->wsize;
813 813
814 pagevec_init(&pvec, 0); 814 pagevec_init(&pvec);
815 815
816 start_index = wbc->range_cyclic ? mapping->writeback_index : 0; 816 start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
817 index = start_index; 817 index = start_index;
@@ -870,15 +870,10 @@ retry:
870 max_pages = wsize >> PAGE_SHIFT; 870 max_pages = wsize >> PAGE_SHIFT;
871 871
872get_more_pages: 872get_more_pages:
873 pvec_pages = min_t(unsigned, PAGEVEC_SIZE, 873 pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
874 max_pages - locked_pages); 874 end, PAGECACHE_TAG_DIRTY,
875 if (end - index < (u64)(pvec_pages - 1)) 875 max_pages - locked_pages);
876 pvec_pages = (unsigned)(end - index) + 1; 876 dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
877
878 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
879 PAGECACHE_TAG_DIRTY,
880 pvec_pages);
881 dout("pagevec_lookup_tag got %d\n", pvec_pages);
882 if (!pvec_pages && !locked_pages) 877 if (!pvec_pages && !locked_pages)
883 break; 878 break;
884 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { 879 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
@@ -896,16 +891,6 @@ get_more_pages:
896 unlock_page(page); 891 unlock_page(page);
897 continue; 892 continue;
898 } 893 }
899 if (page->index > end) {
900 dout("end of range %p\n", page);
901 /* can't be range_cyclic (1st pass) because
902 * end == -1 in that case. */
903 stop = true;
904 if (ceph_wbc.head_snapc)
905 done = true;
906 unlock_page(page);
907 break;
908 }
909 if (strip_unit_end && (page->index > strip_unit_end)) { 894 if (strip_unit_end && (page->index > strip_unit_end)) {
910 dout("end of strip unit %p\n", page); 895 dout("end of strip unit %p\n", page);
911 unlock_page(page); 896 unlock_page(page);
@@ -1177,8 +1162,7 @@ release_pvec_pages:
1177 index = 0; 1162 index = 0;
1178 while ((index <= end) && 1163 while ((index <= end) &&
1179 (nr = pagevec_lookup_tag(&pvec, mapping, &index, 1164 (nr = pagevec_lookup_tag(&pvec, mapping, &index,
1180 PAGECACHE_TAG_WRITEBACK, 1165 PAGECACHE_TAG_WRITEBACK))) {
1181 PAGEVEC_SIZE))) {
1182 for (i = 0; i < nr; i++) { 1166 for (i = 0; i < nr; i++) {
1183 page = pvec.pages[i]; 1167 page = pvec.pages[i];
1184 if (page_snap_context(page) != snapc) 1168 if (page_snap_context(page) != snapc)
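ceph additionally needs to bound how many pages one lookup may return, so it uses the _nr_ variant, which takes both an end index and a maximum count; the manual min_t()/end clamping above becomes unnecessary. Sketch (assumed writeback context):

	pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, end,
						 PAGECACHE_TAG_DIRTY,
						 max_pages - locked_pages);
	if (!pvec_pages && !locked_pages)
		break;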
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 92fdf9c35de2..df9f682708c6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1963,8 +1963,6 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
1963 pgoff_t end, pgoff_t *index, 1963 pgoff_t end, pgoff_t *index,
1964 unsigned int *found_pages) 1964 unsigned int *found_pages)
1965{ 1965{
1966 unsigned int nr_pages;
1967 struct page **pages;
1968 struct cifs_writedata *wdata; 1966 struct cifs_writedata *wdata;
1969 1967
1970 wdata = cifs_writedata_alloc((unsigned int)tofind, 1968 wdata = cifs_writedata_alloc((unsigned int)tofind,
@@ -1972,23 +1970,8 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
1972 if (!wdata) 1970 if (!wdata)
1973 return NULL; 1971 return NULL;
1974 1972
1975 /* 1973 *found_pages = find_get_pages_range_tag(mapping, index, end,
1976 * find_get_pages_tag seems to return a max of 256 on each 1974 PAGECACHE_TAG_DIRTY, tofind, wdata->pages);
1977 * iteration, so we must call it several times in order to
1978 * fill the array or the wsize is effectively limited to
1979 * 256 * PAGE_SIZE.
1980 */
1981 *found_pages = 0;
1982 pages = wdata->pages;
1983 do {
1984 nr_pages = find_get_pages_tag(mapping, index,
1985 PAGECACHE_TAG_DIRTY, tofind,
1986 pages);
1987 *found_pages += nr_pages;
1988 tofind -= nr_pages;
1989 pages += nr_pages;
1990 } while (nr_pages && tofind && *index <= end);
1991
1992 return wdata; 1975 return wdata;
1993} 1976}
1994 1977
diff --git a/fs/dax.c b/fs/dax.c
index f3a44a7c14b3..3652b26a0048 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -565,7 +565,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
565 ret = __radix_tree_lookup(page_tree, index, &node, &slot); 565 ret = __radix_tree_lookup(page_tree, index, &node, &slot);
566 WARN_ON_ONCE(ret != entry); 566 WARN_ON_ONCE(ret != entry);
567 __radix_tree_replace(page_tree, node, slot, 567 __radix_tree_replace(page_tree, node, slot,
568 new_entry, NULL, NULL); 568 new_entry, NULL);
569 entry = new_entry; 569 entry = new_entry;
570 } 570 }
571 571
@@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
614 if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) 614 if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
615 continue; 615 continue;
616 616
617 /*
618 * No need to call mmu_notifier_invalidate_range() as we are
619 * downgrading page table protection not changing it to point
620 * to a new page.
621 *
622 * See Documentation/vm/mmu_notifier.txt
623 */
617 if (pmdp) { 624 if (pmdp) {
618#ifdef CONFIG_FS_DAX_PMD 625#ifdef CONFIG_FS_DAX_PMD
619 pmd_t pmd; 626 pmd_t pmd;
@@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
628 pmd = pmd_wrprotect(pmd); 635 pmd = pmd_wrprotect(pmd);
629 pmd = pmd_mkclean(pmd); 636 pmd = pmd_mkclean(pmd);
630 set_pmd_at(vma->vm_mm, address, pmdp, pmd); 637 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
631 mmu_notifier_invalidate_range(vma->vm_mm, start, end);
632unlock_pmd: 638unlock_pmd:
633 spin_unlock(ptl); 639 spin_unlock(ptl);
634#endif 640#endif
@@ -643,7 +649,6 @@ unlock_pmd:
643 pte = pte_wrprotect(pte); 649 pte = pte_wrprotect(pte);
644 pte = pte_mkclean(pte); 650 pte = pte_mkclean(pte);
645 set_pte_at(vma->vm_mm, address, ptep, pte); 651 set_pte_at(vma->vm_mm, address, ptep, pte);
646 mmu_notifier_invalidate_range(vma->vm_mm, start, end);
647unlock_pte: 652unlock_pte:
648 pte_unmap_unlock(ptep, ptl); 653 pte_unmap_unlock(ptep, ptl);
649 } 654 }
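The comment added above carries the whole argument: the PTE/PMD only loses write permission, so a TLB flush suffices and the extra mmu_notifier_invalidate_range() calls can go. A condensed sketch of the PTE side (vma, address and ptep assumed from the surrounding walk):

	flush_cache_page(vma, address, pte_pfn(*ptep));
	pte = ptep_clear_flush(vma, address, ptep);
	pte = pte_wrprotect(pte);
	pte = pte_mkclean(pte);
	set_pte_at(vma->vm_mm, address, ptep, pte);
	/* no mmu_notifier_invalidate_range(): protection is only downgraded */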
@@ -789,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
789 794
790 tag_pages_for_writeback(mapping, start_index, end_index); 795 tag_pages_for_writeback(mapping, start_index, end_index);
791 796
792 pagevec_init(&pvec, 0); 797 pagevec_init(&pvec);
793 while (!done) { 798 while (!done) {
794 pvec.nr = find_get_entries_tag(mapping, start_index, 799 pvec.nr = find_get_entries_tag(mapping, start_index,
795 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, 800 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
diff --git a/fs/dcache.c b/fs/dcache.c
index bcc9f6981569..5c7df1df81ff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
2705 */ 2705 */
2706 unsigned int i; 2706 unsigned int i;
2707 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); 2707 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
2708 kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
2709 kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
2710 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { 2708 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
2711 swap(((long *) &dentry->d_iname)[i], 2709 swap(((long *) &dentry->d_iname)[i],
2712 ((long *) &target->d_iname)[i]); 2710 ((long *) &target->d_iname)[i]);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 6b801186baa5..25aeaa7328ba 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -660,7 +660,7 @@ static struct ecryptfs_cache_info {
660 struct kmem_cache **cache; 660 struct kmem_cache **cache;
661 const char *name; 661 const char *name;
662 size_t size; 662 size_t size;
663 unsigned long flags; 663 slab_flags_t flags;
664 void (*ctor)(void *obj); 664 void (*ctor)(void *obj);
665} ecryptfs_cache_infos[] = { 665} ecryptfs_cache_infos[] = {
666 { 666 {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2633150e41b9..8d2b582fb141 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1719,7 +1719,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1719 ext4_es_remove_extent(inode, start, last - start + 1); 1719 ext4_es_remove_extent(inode, start, last - start + 1);
1720 } 1720 }
1721 1721
1722 pagevec_init(&pvec, 0); 1722 pagevec_init(&pvec);
1723 while (index <= end) { 1723 while (index <= end) {
1724 nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); 1724 nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
1725 if (nr_pages == 0) 1725 if (nr_pages == 0)
@@ -2345,7 +2345,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2345 lblk = start << bpp_bits; 2345 lblk = start << bpp_bits;
2346 pblock = mpd->map.m_pblk; 2346 pblock = mpd->map.m_pblk;
2347 2347
2348 pagevec_init(&pvec, 0); 2348 pagevec_init(&pvec);
2349 while (start <= end) { 2349 while (start <= end) {
2350 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, 2350 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
2351 &start, end); 2351 &start, end);
@@ -2616,12 +2616,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2616 else 2616 else
2617 tag = PAGECACHE_TAG_DIRTY; 2617 tag = PAGECACHE_TAG_DIRTY;
2618 2618
2619 pagevec_init(&pvec, 0); 2619 pagevec_init(&pvec);
2620 mpd->map.m_len = 0; 2620 mpd->map.m_len = 0;
2621 mpd->next_page = index; 2621 mpd->next_page = index;
2622 while (index <= end) { 2622 while (index <= end) {
2623 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2623 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
2624 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2624 tag);
2625 if (nr_pages == 0) 2625 if (nr_pages == 0)
2626 goto out; 2626 goto out;
2627 2627
@@ -2629,16 +2629,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2629 struct page *page = pvec.pages[i]; 2629 struct page *page = pvec.pages[i];
2630 2630
2631 /* 2631 /*
2632 * At this point, the page may be truncated or
2633 * invalidated (changing page->mapping to NULL), or
2634 * even swizzled back from swapper_space to tmpfs file
2635 * mapping. However, page->index will not change
2636 * because we have a reference on the page.
2637 */
2638 if (page->index > end)
2639 goto out;
2640
2641 /*
2642 * Accumulated enough dirty pages? This doesn't apply 2632 * Accumulated enough dirty pages? This doesn't apply
2643 * to WB_SYNC_ALL mode. For integrity sync we have to 2633 * to WB_SYNC_ALL mode. For integrity sync we have to
2644 * keep going because someone may be concurrently 2634 * keep going because someone may be concurrently
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 04fe1df052b2..0bb8e2c022d3 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -305,25 +305,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
305 long nr_to_write, enum iostat_type io_type) 305 long nr_to_write, enum iostat_type io_type)
306{ 306{
307 struct address_space *mapping = META_MAPPING(sbi); 307 struct address_space *mapping = META_MAPPING(sbi);
308 pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; 308 pgoff_t index = 0, prev = ULONG_MAX;
309 struct pagevec pvec; 309 struct pagevec pvec;
310 long nwritten = 0; 310 long nwritten = 0;
311 int nr_pages;
311 struct writeback_control wbc = { 312 struct writeback_control wbc = {
312 .for_reclaim = 0, 313 .for_reclaim = 0,
313 }; 314 };
314 struct blk_plug plug; 315 struct blk_plug plug;
315 316
316 pagevec_init(&pvec, 0); 317 pagevec_init(&pvec);
317 318
318 blk_start_plug(&plug); 319 blk_start_plug(&plug);
319 320
320 while (index <= end) { 321 while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
321 int i, nr_pages; 322 PAGECACHE_TAG_DIRTY))) {
322 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 323 int i;
323 PAGECACHE_TAG_DIRTY,
324 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
325 if (unlikely(nr_pages == 0))
326 break;
327 324
328 for (i = 0; i < nr_pages; i++) { 325 for (i = 0; i < nr_pages; i++) {
329 struct page *page = pvec.pages[i]; 326 struct page *page = pvec.pages[i];
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 36b535207c88..7b3ad5d8e2e9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1635,7 +1635,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
1635 int range_whole = 0; 1635 int range_whole = 0;
1636 int tag; 1636 int tag;
1637 1637
1638 pagevec_init(&pvec, 0); 1638 pagevec_init(&pvec);
1639 1639
1640 if (get_dirty_pages(mapping->host) <= 1640 if (get_dirty_pages(mapping->host) <=
1641 SM_I(F2FS_M_SB(mapping))->min_hot_blocks) 1641 SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
@@ -1669,8 +1669,8 @@ retry:
1669 while (!done && (index <= end)) { 1669 while (!done && (index <= end)) {
1670 int i; 1670 int i;
1671 1671
1672 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 1672 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
1673 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); 1673 tag);
1674 if (nr_pages == 0) 1674 if (nr_pages == 0)
1675 break; 1675 break;
1676 1676
@@ -1678,11 +1678,6 @@ retry:
1678 struct page *page = pvec.pages[i]; 1678 struct page *page = pvec.pages[i];
1679 bool submitted = false; 1679 bool submitted = false;
1680 1680
1681 if (page->index > end) {
1682 done = 1;
1683 break;
1684 }
1685
1686 done_index = page->index; 1681 done_index = page->index;
1687retry_write: 1682retry_write:
1688 lock_page(page); 1683 lock_page(page);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 517e112c8a9a..f78b76ec4707 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -313,18 +313,19 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
313static pgoff_t __get_first_dirty_index(struct address_space *mapping, 313static pgoff_t __get_first_dirty_index(struct address_space *mapping,
314 pgoff_t pgofs, int whence) 314 pgoff_t pgofs, int whence)
315{ 315{
316 struct pagevec pvec; 316 struct page *page;
317 int nr_pages; 317 int nr_pages;
318 318
319 if (whence != SEEK_DATA) 319 if (whence != SEEK_DATA)
320 return 0; 320 return 0;
321 321
322 /* find first dirty page index */ 322 /* find first dirty page index */
323 pagevec_init(&pvec, 0); 323 nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY,
324 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, 324 1, &page);
325 PAGECACHE_TAG_DIRTY, 1); 325 if (!nr_pages)
326 pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; 326 return ULONG_MAX;
327 pagevec_release(&pvec); 327 pgofs = page->index;
328 put_page(page);
328 return pgofs; 329 return pgofs;
329} 330}
330 331
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index fca87835a1da..b33dac9592ca 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1277,21 +1277,17 @@ release_page:
1277 1277
1278static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) 1278static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
1279{ 1279{
1280 pgoff_t index, end; 1280 pgoff_t index;
1281 struct pagevec pvec; 1281 struct pagevec pvec;
1282 struct page *last_page = NULL; 1282 struct page *last_page = NULL;
1283 int nr_pages;
1283 1284
1284 pagevec_init(&pvec, 0); 1285 pagevec_init(&pvec);
1285 index = 0; 1286 index = 0;
1286 end = ULONG_MAX; 1287
1287 1288 while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1288 while (index <= end) { 1289 PAGECACHE_TAG_DIRTY))) {
1289 int i, nr_pages; 1290 int i;
1290 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1291 PAGECACHE_TAG_DIRTY,
1292 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1293 if (nr_pages == 0)
1294 break;
1295 1291
1296 for (i = 0; i < nr_pages; i++) { 1292 for (i = 0; i < nr_pages; i++) {
1297 struct page *page = pvec.pages[i]; 1293 struct page *page = pvec.pages[i];
@@ -1425,13 +1421,14 @@ static int f2fs_write_node_page(struct page *page,
1425int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, 1421int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
1426 struct writeback_control *wbc, bool atomic) 1422 struct writeback_control *wbc, bool atomic)
1427{ 1423{
1428 pgoff_t index, end; 1424 pgoff_t index;
1429 pgoff_t last_idx = ULONG_MAX; 1425 pgoff_t last_idx = ULONG_MAX;
1430 struct pagevec pvec; 1426 struct pagevec pvec;
1431 int ret = 0; 1427 int ret = 0;
1432 struct page *last_page = NULL; 1428 struct page *last_page = NULL;
1433 bool marked = false; 1429 bool marked = false;
1434 nid_t ino = inode->i_ino; 1430 nid_t ino = inode->i_ino;
1431 int nr_pages;
1435 1432
1436 if (atomic) { 1433 if (atomic) {
1437 last_page = last_fsync_dnode(sbi, ino); 1434 last_page = last_fsync_dnode(sbi, ino);
@@ -1439,17 +1436,12 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
1439 return PTR_ERR_OR_ZERO(last_page); 1436 return PTR_ERR_OR_ZERO(last_page);
1440 } 1437 }
1441retry: 1438retry:
1442 pagevec_init(&pvec, 0); 1439 pagevec_init(&pvec);
1443 index = 0; 1440 index = 0;
1444 end = ULONG_MAX; 1441
1445 1442 while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1446 while (index <= end) { 1443 PAGECACHE_TAG_DIRTY))) {
1447 int i, nr_pages; 1444 int i;
1448 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1449 PAGECACHE_TAG_DIRTY,
1450 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1451 if (nr_pages == 0)
1452 break;
1453 1445
1454 for (i = 0; i < nr_pages; i++) { 1446 for (i = 0; i < nr_pages; i++) {
1455 struct page *page = pvec.pages[i]; 1447 struct page *page = pvec.pages[i];
@@ -1548,25 +1540,21 @@ out:
1548int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, 1540int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc,
1549 bool do_balance, enum iostat_type io_type) 1541 bool do_balance, enum iostat_type io_type)
1550{ 1542{
1551 pgoff_t index, end; 1543 pgoff_t index;
1552 struct pagevec pvec; 1544 struct pagevec pvec;
1553 int step = 0; 1545 int step = 0;
1554 int nwritten = 0; 1546 int nwritten = 0;
1555 int ret = 0; 1547 int ret = 0;
1548 int nr_pages;
1556 1549
1557 pagevec_init(&pvec, 0); 1550 pagevec_init(&pvec);
1558 1551
1559next_step: 1552next_step:
1560 index = 0; 1553 index = 0;
1561 end = ULONG_MAX; 1554
1562 1555 while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1563 while (index <= end) { 1556 PAGECACHE_TAG_DIRTY))) {
1564 int i, nr_pages; 1557 int i;
1565 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1566 PAGECACHE_TAG_DIRTY,
1567 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1568 if (nr_pages == 0)
1569 break;
1570 1558
1571 for (i = 0; i < nr_pages; i++) { 1559 for (i = 0; i < nr_pages; i++) {
1572 struct page *page = pvec.pages[i]; 1560 struct page *page = pvec.pages[i];
@@ -1655,27 +1643,20 @@ out:
1655 1643
1656int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) 1644int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1657{ 1645{
1658 pgoff_t index = 0, end = ULONG_MAX; 1646 pgoff_t index = 0;
1659 struct pagevec pvec; 1647 struct pagevec pvec;
1660 int ret2, ret = 0; 1648 int ret2, ret = 0;
1649 int nr_pages;
1661 1650
1662 pagevec_init(&pvec, 0); 1651 pagevec_init(&pvec);
1663 1652
1664 while (index <= end) { 1653 while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1665 int i, nr_pages; 1654 PAGECACHE_TAG_WRITEBACK))) {
1666 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, 1655 int i;
1667 PAGECACHE_TAG_WRITEBACK,
1668 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1669 if (nr_pages == 0)
1670 break;
1671 1656
1672 for (i = 0; i < nr_pages; i++) { 1657 for (i = 0; i < nr_pages; i++) {
1673 struct page *page = pvec.pages[i]; 1658 struct page *page = pvec.pages[i];
1674 1659
1675 /* until radix tree lookup accepts end_index */
1676 if (unlikely(page->index > end))
1677 continue;
1678
1679 if (ino && ino_of_node(page) == ino) { 1660 if (ino && ino_of_node(page) == ino) {
1680 f2fs_wait_on_page_writeback(page, NODE, true); 1661 f2fs_wait_on_page_writeback(page, NODE, true);
1681 if (TestClearPageError(page)) 1662 if (TestClearPageError(page))
diff --git a/fs/file_table.c b/fs/file_table.c
index 49e1f2f1a4cb..2dc9f38bd195 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -312,7 +312,7 @@ void put_filp(struct file *file)
312void __init files_init(void) 312void __init files_init(void)
313{ 313{
314 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 314 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
315 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 315 SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
316 percpu_counter_init(&nr_files, 0, GFP_KERNEL); 316 percpu_counter_init(&nr_files, 0, GFP_KERNEL);
317} 317}
318 318
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 0ad3fd3ad0b4..961029e04027 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1175,7 +1175,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
1175 return; 1175 return;
1176 } 1176 }
1177 1177
1178 pagevec_init(&pvec, 0); 1178 pagevec_init(&pvec);
1179 next = 0; 1179 next = 0;
1180 do { 1180 do {
1181 if (!pagevec_lookup(&pvec, mapping, &next)) 1181 if (!pagevec_lookup(&pvec, mapping, &next))
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a42d89371748..17f0d05bfd4c 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1636,7 +1636,7 @@ out_finish:
1636 1636
1637static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) 1637static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1638{ 1638{
1639 release_pages(req->pages, req->num_pages, false); 1639 release_pages(req->pages, req->num_pages);
1640} 1640}
1641 1641
1642static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, 1642static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a79e320349cd..2f504d615d92 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1273,9 +1273,9 @@ static int __init fuse_fs_init(void)
1273 int err; 1273 int err;
1274 1274
1275 fuse_inode_cachep = kmem_cache_create("fuse_inode", 1275 fuse_inode_cachep = kmem_cache_create("fuse_inode",
1276 sizeof(struct fuse_inode), 0, 1276 sizeof(struct fuse_inode), 0,
1277 SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, 1277 SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
1278 fuse_inode_init_once); 1278 fuse_inode_init_once);
1279 err = -ENOMEM; 1279 err = -ENOMEM;
1280 if (!fuse_inode_cachep) 1280 if (!fuse_inode_cachep)
1281 goto out; 1281 goto out;
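Three slab-related tweaks sit in the hunks above: the filp cache gains SLAB_ACCOUNT so struct file allocations are charged to the opening task's memcg; fuse_inode gains SLAB_RECLAIM_ACCOUNT so its slabs count as reclaimable memory; and the ecryptfs hunk switches cache flags to the dedicated slab_flags_t type. A hypothetical cache combining both accounting flags, for illustration only:

	example_cachep = kmem_cache_create("example_objs",
					   sizeof(struct example_obj), 0,
					   SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					   SLAB_RECLAIM_ACCOUNT,
					   NULL);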
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 68ed06962537..1daf15a1f00c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -280,22 +280,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
280 for(i = 0; i < nr_pages; i++) { 280 for(i = 0; i < nr_pages; i++) {
281 struct page *page = pvec->pages[i]; 281 struct page *page = pvec->pages[i];
282 282
283 /*
284 * At this point, the page may be truncated or
285 * invalidated (changing page->mapping to NULL), or
286 * even swizzled back from swapper_space to tmpfs file
287 * mapping. However, page->index will not change
288 * because we have a reference on the page.
289 */
290 if (page->index > end) {
291 /*
292 * can't be range_cyclic (1st pass) because
293 * end == -1 in that case.
294 */
295 ret = 1;
296 break;
297 }
298
299 *done_index = page->index; 283 *done_index = page->index;
300 284
301 lock_page(page); 285 lock_page(page);
@@ -387,7 +371,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
387 int range_whole = 0; 371 int range_whole = 0;
388 int tag; 372 int tag;
389 373
390 pagevec_init(&pvec, 0); 374 pagevec_init(&pvec);
391 if (wbc->range_cyclic) { 375 if (wbc->range_cyclic) {
392 writeback_index = mapping->writeback_index; /* prev offset */ 376 writeback_index = mapping->writeback_index; /* prev offset */
393 index = writeback_index; 377 index = writeback_index;
@@ -413,8 +397,8 @@ retry:
413 tag_pages_for_writeback(mapping, index, end); 397 tag_pages_for_writeback(mapping, index, end);
414 done_index = index; 398 done_index = index;
415 while (!done && (index <= end)) { 399 while (!done && (index <= end)) {
416 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 400 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 401 tag);
418 if (nr_pages == 0) 402 if (nr_pages == 0)
419 break; 403 break;
420 404
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ed113ea17aff..1e76730aac0d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -407,7 +407,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
407 407
408 memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); 408 memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
409 pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); 409 pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
410 pagevec_init(&pvec, 0); 410 pagevec_init(&pvec);
411 next = start; 411 next = start;
412 while (next < end) { 412 while (next < end) {
413 /* 413 /*
@@ -668,7 +668,6 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
668 return error; 668 return error;
669 669
670 if (ia_valid & ATTR_SIZE) { 670 if (ia_valid & ATTR_SIZE) {
671 error = -EINVAL;
672 if (attr->ia_size & ~huge_page_mask(h)) 671 if (attr->ia_size & ~huge_page_mask(h))
673 return -EINVAL; 672 return -EINVAL;
674 error = hugetlb_vmtruncate(inode, attr->ia_size); 673 error = hugetlb_vmtruncate(inode, attr->ia_size);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 06ffa135dfa6..16a7a67a11c9 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -2156,10 +2156,10 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
2156 level++) 2156 level++)
2157 INIT_LIST_HEAD(&lists[level]); 2157 INIT_LIST_HEAD(&lists[level]);
2158 2158
2159 pagevec_init(&pvec, 0); 2159 pagevec_init(&pvec);
2160 2160
2161 while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, 2161 while (pagevec_lookup_tag(&pvec, btcache, &index,
2162 PAGEVEC_SIZE)) { 2162 PAGECACHE_TAG_DIRTY)) {
2163 for (i = 0; i < pagevec_count(&pvec); i++) { 2163 for (i = 0; i < pagevec_count(&pvec); i++) {
2164 bh = head = page_buffers(pvec.pages[i]); 2164 bh = head = page_buffers(pvec.pages[i]);
2165 do { 2165 do {
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8616c46d33da..68241512d7c1 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -255,10 +255,9 @@ int nilfs_copy_dirty_pages(struct address_space *dmap,
255 pgoff_t index = 0; 255 pgoff_t index = 0;
256 int err = 0; 256 int err = 0;
257 257
258 pagevec_init(&pvec, 0); 258 pagevec_init(&pvec);
259repeat: 259repeat:
260 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, 260 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY))
261 PAGEVEC_SIZE))
262 return 0; 261 return 0;
263 262
264 for (i = 0; i < pagevec_count(&pvec); i++) { 263 for (i = 0; i < pagevec_count(&pvec); i++) {
@@ -310,7 +309,7 @@ void nilfs_copy_back_pages(struct address_space *dmap,
310 pgoff_t index = 0; 309 pgoff_t index = 0;
311 int err; 310 int err;
312 311
313 pagevec_init(&pvec, 0); 312 pagevec_init(&pvec);
314repeat: 313repeat:
315 n = pagevec_lookup(&pvec, smap, &index); 314 n = pagevec_lookup(&pvec, smap, &index);
316 if (!n) 315 if (!n)
@@ -374,10 +373,10 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
374 unsigned int i; 373 unsigned int i;
375 pgoff_t index = 0; 374 pgoff_t index = 0;
376 375
377 pagevec_init(&pvec, 0); 376 pagevec_init(&pvec);
378 377
379 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, 378 while (pagevec_lookup_tag(&pvec, mapping, &index,
380 PAGEVEC_SIZE)) { 379 PAGECACHE_TAG_DIRTY)) {
381 for (i = 0; i < pagevec_count(&pvec); i++) { 380 for (i = 0; i < pagevec_count(&pvec); i++) {
382 struct page *page = pvec.pages[i]; 381 struct page *page = pvec.pages[i];
383 382
@@ -519,7 +518,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
519 index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); 518 index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
520 nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); 519 nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
521 520
522 pagevec_init(&pvec, 0); 521 pagevec_init(&pvec);
523 522
524repeat: 523repeat:
525 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, 524 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 70ded52dc1dd..f65392fecb5c 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -708,21 +708,17 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
708 index = start >> PAGE_SHIFT; 708 index = start >> PAGE_SHIFT;
709 last = end >> PAGE_SHIFT; 709 last = end >> PAGE_SHIFT;
710 } 710 }
711 pagevec_init(&pvec, 0); 711 pagevec_init(&pvec);
712 repeat: 712 repeat:
713 if (unlikely(index > last) || 713 if (unlikely(index > last) ||
714 !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, 714 !pagevec_lookup_range_tag(&pvec, mapping, &index, last,
715 min_t(pgoff_t, last - index, 715 PAGECACHE_TAG_DIRTY))
716 PAGEVEC_SIZE - 1) + 1))
717 return ndirties; 716 return ndirties;
718 717
719 for (i = 0; i < pagevec_count(&pvec); i++) { 718 for (i = 0; i < pagevec_count(&pvec); i++) {
720 struct buffer_head *bh, *head; 719 struct buffer_head *bh, *head;
721 struct page *page = pvec.pages[i]; 720 struct page *page = pvec.pages[i];
722 721
723 if (unlikely(page->index > last))
724 break;
725
726 lock_page(page); 722 lock_page(page);
727 if (!page_has_buffers(page)) 723 if (!page_has_buffers(page))
728 create_empty_buffers(page, i_blocksize(inode), 0); 724 create_empty_buffers(page, i_blocksize(inode), 0);
@@ -757,10 +753,10 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
757 unsigned int i; 753 unsigned int i;
758 pgoff_t index = 0; 754 pgoff_t index = 0;
759 755
760 pagevec_init(&pvec, 0); 756 pagevec_init(&pvec);
761 757
762 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, 758 while (pagevec_lookup_tag(&pvec, mapping, &index,
763 PAGEVEC_SIZE)) { 759 PAGECACHE_TAG_DIRTY)) {
764 for (i = 0; i < pagevec_count(&pvec); i++) { 760 for (i = 0; i < pagevec_count(&pvec); i++) {
765 bh = head = page_buffers(pvec.pages[i]); 761 bh = head = page_buffers(pvec.pages[i]);
766 do { 762 do {
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index addd7c5f2d3e..ab5105f9767e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3585,8 +3585,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3585 * The easy case - we can just plop the record right in. 3585 * The easy case - we can just plop the record right in.
3586 */ 3586 */
3587 *left_rec = *split_rec; 3587 *left_rec = *split_rec;
3588
3589 has_empty_extent = 0;
3590 } else 3588 } else
3591 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3589 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3592 3590
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 88a31e9340a0..d1516327b787 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -134,6 +134,19 @@ bail:
134 return err; 134 return err;
135} 135}
136 136
137static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock,
138 struct buffer_head *bh_result, int create)
139{
140 int ret = 0;
141 struct ocfs2_inode_info *oi = OCFS2_I(inode);
142
143 down_read(&oi->ip_alloc_sem);
144 ret = ocfs2_get_block(inode, iblock, bh_result, create);
145 up_read(&oi->ip_alloc_sem);
146
147 return ret;
148}
149
137int ocfs2_get_block(struct inode *inode, sector_t iblock, 150int ocfs2_get_block(struct inode *inode, sector_t iblock,
138 struct buffer_head *bh_result, int create) 151 struct buffer_head *bh_result, int create)
139{ 152{
@@ -2128,7 +2141,7 @@ static void ocfs2_dio_free_write_ctx(struct inode *inode,
2128 * called like this: dio->get_blocks(dio->inode, fs_startblk, 2141 * called like this: dio->get_blocks(dio->inode, fs_startblk,
2129 * fs_count, map_bh, dio->rw == WRITE); 2142 * fs_count, map_bh, dio->rw == WRITE);
2130 */ 2143 */
2131static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, 2144static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
2132 struct buffer_head *bh_result, int create) 2145 struct buffer_head *bh_result, int create)
2133{ 2146{
2134 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2147 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2154,12 +2167,9 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
2154 * while file size will be changed. 2167 * while file size will be changed.
2155 */ 2168 */
2156 if (pos + total_len <= i_size_read(inode)) { 2169 if (pos + total_len <= i_size_read(inode)) {
2157 down_read(&oi->ip_alloc_sem);
2158 /* This is the fast path for re-write. */
2159 ret = ocfs2_get_block(inode, iblock, bh_result, create);
2160
2161 up_read(&oi->ip_alloc_sem);
2162 2170
2171 /* This is the fast path for re-write. */
2172 ret = ocfs2_lock_get_block(inode, iblock, bh_result, create);
2163 if (buffer_mapped(bh_result) && 2173 if (buffer_mapped(bh_result) &&
2164 !buffer_new(bh_result) && 2174 !buffer_new(bh_result) &&
2165 ret == 0) 2175 ret == 0)
@@ -2424,9 +2434,9 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
2424 return 0; 2434 return 0;
2425 2435
2426 if (iov_iter_rw(iter) == READ) 2436 if (iov_iter_rw(iter) == READ)
2427 get_block = ocfs2_get_block; 2437 get_block = ocfs2_lock_get_block;
2428 else 2438 else
2429 get_block = ocfs2_dio_get_block; 2439 get_block = ocfs2_dio_wr_get_block;
2430 2440
2431 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, 2441 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
2432 iter, get_block, 2442 iter, get_block,
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index b97bcc6dde7c..b1bb70c8ca4d 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -28,9 +28,6 @@
28 28
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30 30
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate);
33
34int ocfs2_write_block(struct ocfs2_super *osb, 31int ocfs2_write_block(struct ocfs2_super *osb,
35 struct buffer_head *bh, 32 struct buffer_head *bh,
36 struct ocfs2_caching_info *ci); 33 struct ocfs2_caching_info *ci);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 3ef5137dc362..a9e67efc0004 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -79,10 +79,8 @@ void o2hb_fill_node_map(unsigned long *map,
79 unsigned bytes); 79 unsigned bytes);
80void o2hb_exit(void); 80void o2hb_exit(void);
81int o2hb_init(void); 81int o2hb_init(void);
82int o2hb_check_node_heartbeating(u8 node_num);
83int o2hb_check_node_heartbeating_no_sem(u8 node_num); 82int o2hb_check_node_heartbeating_no_sem(u8 node_num);
84int o2hb_check_node_heartbeating_from_callback(u8 node_num); 83int o2hb_check_node_heartbeating_from_callback(u8 node_num);
85int o2hb_check_local_node_heartbeating(void);
86void o2hb_stop_all_regions(void); 84void o2hb_stop_all_regions(void);
87int o2hb_get_all_regions(char *region_uuids, u8 numregions); 85int o2hb_get_all_regions(char *region_uuids, u8 numregions);
88int o2hb_global_heartbeat_active(void); 86int o2hb_global_heartbeat_active(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index a51200ece93d..da64c3a20eeb 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -40,6 +40,9 @@ char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
40 "panic", /* O2NM_FENCE_PANIC */ 40 "panic", /* O2NM_FENCE_PANIC */
41}; 41};
42 42
43static inline void o2nm_lock_subsystem(void);
44static inline void o2nm_unlock_subsystem(void);
45
43struct o2nm_node *o2nm_get_node_by_num(u8 node_num) 46struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
44{ 47{
45 struct o2nm_node *node = NULL; 48 struct o2nm_node *node = NULL;
@@ -181,7 +184,10 @@ static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
181{ 184{
182 /* through the first node_set .parent 185 /* through the first node_set .parent
183 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ 186 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
184 return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); 187 if (node->nd_item.ci_parent)
188 return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
189 else
190 return NULL;
185} 191}
186 192
187enum { 193enum {
@@ -194,7 +200,7 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
194 size_t count) 200 size_t count)
195{ 201{
196 struct o2nm_node *node = to_o2nm_node(item); 202 struct o2nm_node *node = to_o2nm_node(item);
197 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); 203 struct o2nm_cluster *cluster;
198 unsigned long tmp; 204 unsigned long tmp;
199 char *p = (char *)page; 205 char *p = (char *)page;
200 int ret = 0; 206 int ret = 0;
@@ -214,6 +220,13 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
214 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) 220 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
215 return -EINVAL; /* XXX */ 221 return -EINVAL; /* XXX */
216 222
223 o2nm_lock_subsystem();
224 cluster = to_o2nm_cluster_from_node(node);
225 if (!cluster) {
226 o2nm_unlock_subsystem();
227 return -EINVAL;
228 }
229
217 write_lock(&cluster->cl_nodes_lock); 230 write_lock(&cluster->cl_nodes_lock);
218 if (cluster->cl_nodes[tmp]) 231 if (cluster->cl_nodes[tmp])
219 ret = -EEXIST; 232 ret = -EEXIST;
@@ -226,6 +239,8 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
226 set_bit(tmp, cluster->cl_nodes_bitmap); 239 set_bit(tmp, cluster->cl_nodes_bitmap);
227 } 240 }
228 write_unlock(&cluster->cl_nodes_lock); 241 write_unlock(&cluster->cl_nodes_lock);
242 o2nm_unlock_subsystem();
243
229 if (ret) 244 if (ret)
230 return ret; 245 return ret;
231 246
@@ -269,7 +284,7 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
269 size_t count) 284 size_t count)
270{ 285{
271 struct o2nm_node *node = to_o2nm_node(item); 286 struct o2nm_node *node = to_o2nm_node(item);
272 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); 287 struct o2nm_cluster *cluster;
273 int ret, i; 288 int ret, i;
274 struct rb_node **p, *parent; 289 struct rb_node **p, *parent;
275 unsigned int octets[4]; 290 unsigned int octets[4];
@@ -286,6 +301,13 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
286 be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); 301 be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
287 } 302 }
288 303
304 o2nm_lock_subsystem();
305 cluster = to_o2nm_cluster_from_node(node);
306 if (!cluster) {
307 o2nm_unlock_subsystem();
308 return -EINVAL;
309 }
310
289 ret = 0; 311 ret = 0;
290 write_lock(&cluster->cl_nodes_lock); 312 write_lock(&cluster->cl_nodes_lock);
291 if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) 313 if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
@@ -298,6 +320,8 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
298 rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); 320 rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
299 } 321 }
300 write_unlock(&cluster->cl_nodes_lock); 322 write_unlock(&cluster->cl_nodes_lock);
323 o2nm_unlock_subsystem();
324
301 if (ret) 325 if (ret)
302 return ret; 326 return ret;
303 327
@@ -315,7 +339,7 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
315 size_t count) 339 size_t count)
316{ 340{
317 struct o2nm_node *node = to_o2nm_node(item); 341 struct o2nm_node *node = to_o2nm_node(item);
318 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); 342 struct o2nm_cluster *cluster;
319 unsigned long tmp; 343 unsigned long tmp;
320 char *p = (char *)page; 344 char *p = (char *)page;
321 ssize_t ret; 345 ssize_t ret;
@@ -333,17 +357,26 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
333 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) 357 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
334 return -EINVAL; /* XXX */ 358 return -EINVAL; /* XXX */
335 359
360 o2nm_lock_subsystem();
361 cluster = to_o2nm_cluster_from_node(node);
362 if (!cluster) {
363 ret = -EINVAL;
364 goto out;
365 }
366
336 /* the only failure case is trying to set a new local node 367 /* the only failure case is trying to set a new local node
337 * when a different one is already set */ 368 * when a different one is already set */
338 if (tmp && tmp == cluster->cl_has_local && 369 if (tmp && tmp == cluster->cl_has_local &&
339 cluster->cl_local_node != node->nd_num) 370 cluster->cl_local_node != node->nd_num) {
340 return -EBUSY; 371 ret = -EBUSY;
372 goto out;
373 }
341 374
342 /* bring up the rx thread if we're setting the new local node. */ 375 /* bring up the rx thread if we're setting the new local node. */
343 if (tmp && !cluster->cl_has_local) { 376 if (tmp && !cluster->cl_has_local) {
344 ret = o2net_start_listening(node); 377 ret = o2net_start_listening(node);
345 if (ret) 378 if (ret)
346 return ret; 379 goto out;
347 } 380 }
348 381
349 if (!tmp && cluster->cl_has_local && 382 if (!tmp && cluster->cl_has_local &&
@@ -358,7 +391,11 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
358 cluster->cl_local_node = node->nd_num; 391 cluster->cl_local_node = node->nd_num;
359 } 392 }
360 393
361 return count; 394 ret = count;
395
396out:
397 o2nm_unlock_subsystem();
398 return ret;
362} 399}
363 400
364CONFIGFS_ATTR(o2nm_node_, num); 401CONFIGFS_ATTR(o2nm_node_, num);
@@ -738,6 +775,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
738 }, 775 },
739}; 776};
740 777
778static inline void o2nm_lock_subsystem(void)
779{
780 mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex);
781}
782
783static inline void o2nm_unlock_subsystem(void)
784{
785 mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex);
786}
787
741int o2nm_depend_item(struct config_item *item) 788int o2nm_depend_item(struct config_item *item)
742{ 789{
743 return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); 790 return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
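The nodemanager changes all guard the same race: a node item can be unlinked from its cluster while a configfs attribute store is running, leaving nd_item.ci_parent NULL. Holding the subsystem mutex for the duration of the store pins the hierarchy. The resulting handler shape, simplified:

	o2nm_lock_subsystem();
	cluster = to_o2nm_cluster_from_node(node);
	if (!cluster) {
		o2nm_unlock_subsystem();
		return -EINVAL;		/* node is being removed from the cluster */
	}
	write_lock(&cluster->cl_nodes_lock);
	/* ... update the cluster's view of this node ... */
	write_unlock(&cluster->cl_nodes_lock);
	o2nm_unlock_subsystem();
	return count;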
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index a2b19fbdcf46..e1fea149f50b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -394,7 +394,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
394static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) 394static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
395{ 395{
396 if (dlm->dlm_worker) { 396 if (dlm->dlm_worker) {
397 flush_workqueue(dlm->dlm_worker);
398 destroy_workqueue(dlm->dlm_worker); 397 destroy_workqueue(dlm->dlm_worker);
399 dlm->dlm_worker = NULL; 398 dlm->dlm_worker = NULL;
400 } 399 }
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3e04279446e8..9c3e0f13ca87 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2616,7 +2616,9 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2616 * otherwise the assert_master from the new 2616 * otherwise the assert_master from the new
2617 * master will destroy this. 2617 * master will destroy this.
2618 */ 2618 */
2619 dlm_get_mle_inuse(mle); 2619 if (ret != -EEXIST)
2620 dlm_get_mle_inuse(mle);
2621
2620 spin_unlock(&dlm->master_lock); 2622 spin_unlock(&dlm->master_lock);
2621 spin_unlock(&dlm->spinlock); 2623 spin_unlock(&dlm->spinlock);
2622 2624
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 74407c6dd592..ec8f75813beb 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2419 dlm_lockres_put(res); 2419 dlm_lockres_put(res);
2420 continue; 2420 continue;
2421 } 2421 }
2422 dlm_move_lockres_to_recovery_list(dlm, res);
2422 } else if (res->owner == dlm->node_num) { 2423 } else if (res->owner == dlm->node_num) {
2423 dlm_free_dead_locks(dlm, res, dead_node); 2424 dlm_free_dead_locks(dlm, res, dead_node);
2424 __dlm_lockres_calc_usage(dlm, res); 2425 __dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 988137de08f5..9c7c18c0e129 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -670,7 +670,6 @@ static void __exit exit_dlmfs_fs(void)
670{ 670{
671 unregister_filesystem(&dlmfs_fs_type); 671 unregister_filesystem(&dlmfs_fs_type);
672 672
673 flush_workqueue(user_dlm_worker);
674 destroy_workqueue(user_dlm_worker); 673 destroy_workqueue(user_dlm_worker);
675 674
676 /* 675 /*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6e41fc8fabbe..dc455d45a66a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1161,6 +1161,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1161 } 1161 }
1162 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1162 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1163 if (size_change) { 1163 if (size_change) {
1164 /*
1165 * Here we should wait dio to finish before inode lock
1166 * to avoid a deadlock between ocfs2_setattr() and
1167 * ocfs2_dio_end_io_write()
1168 */
1169 inode_dio_wait(inode);
1170
1164 status = ocfs2_rw_lock(inode, 1); 1171 status = ocfs2_rw_lock(inode, 1);
1165 if (status < 0) { 1172 if (status < 0) {
1166 mlog_errno(status); 1173 mlog_errno(status);
@@ -1200,8 +1207,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1200 if (status) 1207 if (status)
1201 goto bail_unlock; 1208 goto bail_unlock;
1202 1209
1203 inode_dio_wait(inode);
1204
1205 if (i_size_read(inode) >= attr->ia_size) { 1210 if (i_size_read(inode) >= attr->ia_size) {
1206 if (ocfs2_should_order_data(inode)) { 1211 if (ocfs2_should_order_data(inode)) {
1207 status = ocfs2_begin_ordered_truncate(inode, 1212 status = ocfs2_begin_ordered_truncate(inode,
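
A condensed sketch of the reordered size-change path in ocfs2_setattr(): outstanding direct I/O is now drained before any locks are taken rather than after, so a blocked dio completion in ocfs2_dio_end_io_write() can no longer deadlock against the lock acquisition. The bail label is illustrative.

	if (size_change) {
		/* drain in-flight direct I/O before taking rw/inode locks */
		inode_dio_wait(inode);

		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}
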
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 71f22c8fbffd..9f0b95abc09f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1147,12 +1147,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1147 GLOBAL_BITMAP_SYSTEM_INODE, 1147 GLOBAL_BITMAP_SYSTEM_INODE,
1148 OCFS2_INVALID_SLOT, NULL, 1148 OCFS2_INVALID_SLOT, NULL,
1149 ALLOC_NEW_GROUP); 1149 ALLOC_NEW_GROUP);
1150 if (status < 0 && status != -ENOSPC) { 1150 if (status < 0 && status != -ENOSPC)
1151 mlog_errno(status); 1151 mlog_errno(status);
1152 goto bail;
1153 }
1154 1152
1155bail:
1156 return status; 1153 return status;
1157} 1154}
1158 1155
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 80733496b22a..040bbb6a6e4b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2521,10 +2521,8 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2521 /* This function assumes that the caller has the main osb resource */ 2521 /* This function assumes that the caller has the main osb resource */
2522 2522
2523 /* ocfs2_initializer_super have already created this workqueue */ 2523 /* ocfs2_initializer_super have already created this workqueue */
2524 if (osb->ocfs2_wq) { 2524 if (osb->ocfs2_wq)
2525 flush_workqueue(osb->ocfs2_wq);
2526 destroy_workqueue(osb->ocfs2_wq); 2525 destroy_workqueue(osb->ocfs2_wq);
2527 }
2528 2526
2529 ocfs2_free_slot_info(osb); 2527 ocfs2_free_slot_info(osb);
2530 2528
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b023e4f3d740..d4550c8bbc41 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,9 +26,6 @@
26#ifndef OCFS2_SUPER_H 26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H 27#define OCFS2_SUPER_H
28 28
29int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
30 int node_num);
31
32__printf(3, 4) 29__printf(3, 4)
33int __ocfs2_error(struct super_block *sb, const char *function, 30int __ocfs2_error(struct super_block *sb, const char *function,
34 const char *fmt, ...); 31 const char *fmt, ...);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6744bd706ecf..875231c36cb3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
26 26
27void task_mem(struct seq_file *m, struct mm_struct *mm) 27void task_mem(struct seq_file *m, struct mm_struct *mm)
28{ 28{
29 unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; 29 unsigned long text, lib, swap, anon, file, shmem;
30 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 30 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
31 31
32 anon = get_mm_counter(mm, MM_ANONPAGES); 32 anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -50,8 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
50 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 50 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
51 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 51 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
52 swap = get_mm_counter(mm, MM_SWAPENTS); 52 swap = get_mm_counter(mm, MM_SWAPENTS);
53 ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
54 pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
55 seq_printf(m, 53 seq_printf(m,
56 "VmPeak:\t%8lu kB\n" 54 "VmPeak:\t%8lu kB\n"
57 "VmSize:\t%8lu kB\n" 55 "VmSize:\t%8lu kB\n"
@@ -67,7 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
67 "VmExe:\t%8lu kB\n" 65 "VmExe:\t%8lu kB\n"
68 "VmLib:\t%8lu kB\n" 66 "VmLib:\t%8lu kB\n"
69 "VmPTE:\t%8lu kB\n" 67 "VmPTE:\t%8lu kB\n"
70 "VmPMD:\t%8lu kB\n"
71 "VmSwap:\t%8lu kB\n", 68 "VmSwap:\t%8lu kB\n",
72 hiwater_vm << (PAGE_SHIFT-10), 69 hiwater_vm << (PAGE_SHIFT-10),
73 total_vm << (PAGE_SHIFT-10), 70 total_vm << (PAGE_SHIFT-10),
@@ -80,8 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
80 shmem << (PAGE_SHIFT-10), 77 shmem << (PAGE_SHIFT-10),
81 mm->data_vm << (PAGE_SHIFT-10), 78 mm->data_vm << (PAGE_SHIFT-10),
82 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 79 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
83 ptes >> 10, 80 mm_pgtables_bytes(mm) >> 10,
84 pmds >> 10,
85 swap << (PAGE_SHIFT-10)); 81 swap << (PAGE_SHIFT-10));
86 hugetlb_report_usage(m, mm); 82 hugetlb_report_usage(m, mm);
87} 83}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f46d133c0949..ac9a4e65ca49 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -668,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
668 ctx->features = octx->features; 668 ctx->features = octx->features;
669 ctx->released = false; 669 ctx->released = false;
670 ctx->mm = vma->vm_mm; 670 ctx->mm = vma->vm_mm;
671 atomic_inc(&ctx->mm->mm_count); 671 mmgrab(ctx->mm);
672 672
673 userfaultfd_ctx_get(octx); 673 userfaultfd_ctx_get(octx);
674 fctx->orig = octx; 674 fctx->orig = octx;
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 758f37ac5ad3..4b87472f35bc 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name)
104} 104}
105 105
106static inline kmem_zone_t * 106static inline kmem_zone_t *
107kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, 107kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags,
108 void (*construct)(void *)) 108 void (*construct)(void *))
109{ 109{
110 return kmem_cache_create(zone_name, size, 0, flags, construct); 110 return kmem_cache_create(zone_name, size, 0, flags, construct);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f41ca8486e02..e54e7e0033eb 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -93,7 +93,7 @@ extern void wb_writeout_inc(struct bdi_writeback *wb);
93/* 93/*
94 * maximal error of a stat counter. 94 * maximal error of a stat counter.
95 */ 95 */
96static inline unsigned long wb_stat_error(struct bdi_writeback *wb) 96static inline unsigned long wb_stat_error(void)
97{ 97{
98#ifdef CONFIG_SMP 98#ifdef CONFIG_SMP
99 return nr_cpu_ids * WB_STAT_BATCH; 99 return nr_cpu_ids * WB_STAT_BATCH;
@@ -122,6 +122,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
122 * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. 122 * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold.
123 * 123 *
124 * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. 124 * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
125 * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be
126 * inefficient.
125 */ 127 */
126#define BDI_CAP_NO_ACCT_DIRTY 0x00000001 128#define BDI_CAP_NO_ACCT_DIRTY 0x00000001
127#define BDI_CAP_NO_WRITEBACK 0x00000002 129#define BDI_CAP_NO_WRITEBACK 0x00000002
@@ -129,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
129#define BDI_CAP_STABLE_WRITES 0x00000008 131#define BDI_CAP_STABLE_WRITES 0x00000008
130#define BDI_CAP_STRICTLIMIT 0x00000010 132#define BDI_CAP_STRICTLIMIT 0x00000010
131#define BDI_CAP_CGROUP_WRITEBACK 0x00000020 133#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
134#define BDI_CAP_SYNCHRONOUS_IO 0x00000040
132 135
133#define BDI_CAP_NO_ACCT_AND_WRITEBACK \ 136#define BDI_CAP_NO_ACCT_AND_WRITEBACK \
134 (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) 137 (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
@@ -174,6 +177,11 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
174long congestion_wait(int sync, long timeout); 177long congestion_wait(int sync, long timeout);
175long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); 178long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
176 179
180static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
181{
182 return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO;
183}
184
177static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) 185static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi)
178{ 186{
179 return bdi->capabilities & BDI_CAP_STABLE_WRITES; 187 return bdi->capabilities & BDI_CAP_STABLE_WRITES;
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index fdf40ca04b3c..a53063e9d7d8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -161,6 +161,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
161#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) 161#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
162 162
163/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ 163/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
164void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
165 phys_addr_t min_addr,
166 phys_addr_t max_addr, int nid);
164void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, 167void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
165 phys_addr_t align, phys_addr_t min_addr, 168 phys_addr_t align, phys_addr_t min_addr,
166 phys_addr_t max_addr, int nid); 169 phys_addr_t max_addr, int nid);
@@ -177,6 +180,14 @@ static inline void * __init memblock_virt_alloc(
177 NUMA_NO_NODE); 180 NUMA_NO_NODE);
178} 181}
179 182
183static inline void * __init memblock_virt_alloc_raw(
184 phys_addr_t size, phys_addr_t align)
185{
186 return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT,
187 BOOTMEM_ALLOC_ACCESSIBLE,
188 NUMA_NO_NODE);
189}
190
180static inline void * __init memblock_virt_alloc_nopanic( 191static inline void * __init memblock_virt_alloc_nopanic(
181 phys_addr_t size, phys_addr_t align) 192 phys_addr_t size, phys_addr_t align)
182{ 193{
@@ -258,6 +269,14 @@ static inline void * __init memblock_virt_alloc(
258 return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); 269 return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT);
259} 270}
260 271
272static inline void * __init memblock_virt_alloc_raw(
273 phys_addr_t size, phys_addr_t align)
274{
275 if (!align)
276 align = SMP_CACHE_BYTES;
277 return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT);
278}
279
261static inline void * __init memblock_virt_alloc_nopanic( 280static inline void * __init memblock_virt_alloc_nopanic(
262 phys_addr_t size, phys_addr_t align) 281 phys_addr_t size, phys_addr_t align)
263{ 282{
@@ -310,6 +329,14 @@ static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size,
310 min_addr); 329 min_addr);
311} 330}
312 331
332static inline void * __init memblock_virt_alloc_try_nid_raw(
333 phys_addr_t size, phys_addr_t align,
334 phys_addr_t min_addr, phys_addr_t max_addr, int nid)
335{
336 return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align,
337 min_addr, max_addr);
338}
339
313static inline void * __init memblock_virt_alloc_try_nid_nopanic( 340static inline void * __init memblock_virt_alloc_try_nid_nopanic(
314 phys_addr_t size, phys_addr_t align, 341 phys_addr_t size, phys_addr_t align,
315 phys_addr_t min_addr, phys_addr_t max_addr, int nid) 342 phys_addr_t min_addr, phys_addr_t max_addr, int nid)
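
For context on the new _raw allocators, a sketch of a caller under the assumption (from the naming and the nopanic fallback above) that the returned range is not zeroed for the caller, so any required initialization is the caller's responsibility:

	void *map;

	map = memblock_virt_alloc_try_nid_raw(size, SMP_CACHE_BYTES,
					      BOOTMEM_LOW_LIMIT,
					      BOOTMEM_ALLOC_ACCESSIBLE, nid);
	if (!map)
		panic("failed to allocate %zu bytes\n", (size_t)size);
	/* assumption: unlike the non-raw variants, contents are uninitialized */
	memset(map, 0, size);
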
diff --git a/include/linux/c2port.h b/include/linux/c2port.h
index 4efabcb51347..f2736348ca26 100644
--- a/include/linux/c2port.h
+++ b/include/linux/c2port.h
@@ -9,8 +9,6 @@
9 * the Free Software Foundation 9 * the Free Software Foundation
10 */ 10 */
11 11
12#include <linux/kmemcheck.h>
13
14#define C2PORT_NAME_LEN 32 12#define C2PORT_NAME_LEN 32
15 13
16struct device; 14struct device;
@@ -22,10 +20,8 @@ struct device;
22/* Main struct */ 20/* Main struct */
23struct c2port_ops; 21struct c2port_ops;
24struct c2port_device { 22struct c2port_device {
25 kmemcheck_bitfield_begin(flags);
26 unsigned int access:1; 23 unsigned int access:1;
27 unsigned int flash_access:1; 24 unsigned int flash_access:1;
28 kmemcheck_bitfield_end(flags);
29 25
30 int id; 26 int id;
31 char name[C2PORT_NAME_LEN]; 27 char name[C2PORT_NAME_LEN];
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index eee1499db396..e8f8e8fb244d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -9,7 +9,6 @@
9#include <linux/dma-debug.h> 9#include <linux/dma-debug.h>
10#include <linux/dma-direction.h> 10#include <linux/dma-direction.h>
11#include <linux/scatterlist.h> 11#include <linux/scatterlist.h>
12#include <linux/kmemcheck.h>
13#include <linux/bug.h> 12#include <linux/bug.h>
14#include <linux/mem_encrypt.h> 13#include <linux/mem_encrypt.h>
15 14
@@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
232 const struct dma_map_ops *ops = get_dma_ops(dev); 231 const struct dma_map_ops *ops = get_dma_ops(dev);
233 dma_addr_t addr; 232 dma_addr_t addr;
234 233
235 kmemcheck_mark_initialized(ptr, size);
236 BUG_ON(!valid_dma_direction(dir)); 234 BUG_ON(!valid_dma_direction(dir));
237 addr = ops->map_page(dev, virt_to_page(ptr), 235 addr = ops->map_page(dev, virt_to_page(ptr),
238 offset_in_page(ptr), size, 236 offset_in_page(ptr), size,
@@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
265 unsigned long attrs) 263 unsigned long attrs)
266{ 264{
267 const struct dma_map_ops *ops = get_dma_ops(dev); 265 const struct dma_map_ops *ops = get_dma_ops(dev);
268 int i, ents; 266 int ents;
269 struct scatterlist *s;
270 267
271 for_each_sg(sg, s, nents, i)
272 kmemcheck_mark_initialized(sg_virt(s), s->length);
273 BUG_ON(!valid_dma_direction(dir)); 268 BUG_ON(!valid_dma_direction(dir));
274 ents = ops->map_sg(dev, sg, nents, dir, attrs); 269 ents = ops->map_sg(dev, sg, nents, dir, attrs);
275 BUG_ON(ents < 0); 270 BUG_ON(ents < 0);
@@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
299 const struct dma_map_ops *ops = get_dma_ops(dev); 294 const struct dma_map_ops *ops = get_dma_ops(dev);
300 dma_addr_t addr; 295 dma_addr_t addr;
301 296
302 kmemcheck_mark_initialized(page_address(page) + offset, size);
303 BUG_ON(!valid_dma_direction(dir)); 297 BUG_ON(!valid_dma_direction(dir));
304 addr = ops->map_page(dev, page, offset, size, dir, attrs); 298 addr = ops->map_page(dev, page, offset, size, dir, attrs);
305 debug_dma_map_page(dev, page, offset, size, dir, addr, false); 299 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0cd02ff4ae30..80b5b482cb46 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -454,13 +454,11 @@ struct bpf_binary_header {
454 454
455struct bpf_prog { 455struct bpf_prog {
456 u16 pages; /* Number of allocated pages */ 456 u16 pages; /* Number of allocated pages */
457 kmemcheck_bitfield_begin(meta);
458 u16 jited:1, /* Is our filter JIT'ed? */ 457 u16 jited:1, /* Is our filter JIT'ed? */
459 locked:1, /* Program image locked? */ 458 locked:1, /* Program image locked? */
460 gpl_compatible:1, /* Is filter GPL compatible? */ 459 gpl_compatible:1, /* Is filter GPL compatible? */
461 cb_access:1, /* Is control block accessed? */ 460 cb_access:1, /* Is control block accessed? */
462 dst_needed:1; /* Do we need dst entry? */ 461 dst_needed:1; /* Do we need dst entry? */
463 kmemcheck_bitfield_end(meta);
464 enum bpf_prog_type type; /* Type of BPF program */ 462 enum bpf_prog_type type; /* Type of BPF program */
465 u32 len; /* Number of filter blocks */ 463 u32 len; /* Number of filter blocks */
466 u32 jited_len; /* Size of jited insns in bytes */ 464 u32 jited_len; /* Size of jited insns in bytes */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 710143741eb5..1a4582b44d32 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -24,7 +24,6 @@ struct vm_area_struct;
24#define ___GFP_HIGH 0x20u 24#define ___GFP_HIGH 0x20u
25#define ___GFP_IO 0x40u 25#define ___GFP_IO 0x40u
26#define ___GFP_FS 0x80u 26#define ___GFP_FS 0x80u
27#define ___GFP_COLD 0x100u
28#define ___GFP_NOWARN 0x200u 27#define ___GFP_NOWARN 0x200u
29#define ___GFP_RETRY_MAYFAIL 0x400u 28#define ___GFP_RETRY_MAYFAIL 0x400u
30#define ___GFP_NOFAIL 0x800u 29#define ___GFP_NOFAIL 0x800u
@@ -37,7 +36,6 @@ struct vm_area_struct;
37#define ___GFP_THISNODE 0x40000u 36#define ___GFP_THISNODE 0x40000u
38#define ___GFP_ATOMIC 0x80000u 37#define ___GFP_ATOMIC 0x80000u
39#define ___GFP_ACCOUNT 0x100000u 38#define ___GFP_ACCOUNT 0x100000u
40#define ___GFP_NOTRACK 0x200000u
41#define ___GFP_DIRECT_RECLAIM 0x400000u 39#define ___GFP_DIRECT_RECLAIM 0x400000u
42#define ___GFP_WRITE 0x800000u 40#define ___GFP_WRITE 0x800000u
43#define ___GFP_KSWAPD_RECLAIM 0x1000000u 41#define ___GFP_KSWAPD_RECLAIM 0x1000000u
@@ -193,27 +191,15 @@ struct vm_area_struct;
193/* 191/*
194 * Action modifiers 192 * Action modifiers
195 * 193 *
196 * __GFP_COLD indicates that the caller does not expect to be used in the near
197 * future. Where possible, a cache-cold page will be returned.
198 *
199 * __GFP_NOWARN suppresses allocation failure reports. 194 * __GFP_NOWARN suppresses allocation failure reports.
200 * 195 *
201 * __GFP_COMP address compound page metadata. 196 * __GFP_COMP address compound page metadata.
202 * 197 *
203 * __GFP_ZERO returns a zeroed page on success. 198 * __GFP_ZERO returns a zeroed page on success.
204 *
205 * __GFP_NOTRACK avoids tracking with kmemcheck.
206 *
207 * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of
208 * distinguishing in the source between false positives and allocations that
209 * cannot be supported (e.g. page tables).
210 */ 199 */
211#define __GFP_COLD ((__force gfp_t)___GFP_COLD)
212#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) 200#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
213#define __GFP_COMP ((__force gfp_t)___GFP_COMP) 201#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
214#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) 202#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
215#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK)
216#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
217 203
218/* Disable lockdep for GFP context tracking */ 204/* Disable lockdep for GFP context tracking */
219#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) 205#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
@@ -539,8 +525,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
539 525
540extern void __free_pages(struct page *page, unsigned int order); 526extern void __free_pages(struct page *page, unsigned int order);
541extern void free_pages(unsigned long addr, unsigned int order); 527extern void free_pages(unsigned long addr, unsigned int order);
542extern void free_hot_cold_page(struct page *page, bool cold); 528extern void free_unref_page(struct page *page);
543extern void free_hot_cold_page_list(struct list_head *list, bool cold); 529extern void free_unref_page_list(struct list_head *list);
544 530
545struct page_frag_cache; 531struct page_frag_cache;
546extern void __page_frag_cache_drain(struct page *page, unsigned int count); 532extern void __page_frag_cache_drain(struct page *page, unsigned int count);
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 96e69979f84d..325017ad9311 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page,
471 * @page: pointer to struct page 471 * @page: pointer to struct page
472 * Return: driver data value 472 * Return: driver data value
473 */ 473 */
474static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) 474static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
475{ 475{
476 unsigned long *drvdata = (unsigned long *)&page->pgmap; 476 const unsigned long *drvdata = (const unsigned long *)&page->pgmap;
477 477
478 return drvdata[1]; 478 return drvdata[1];
479} 479}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index baeb872283d9..69c238210325 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t)
594 __tasklet_hi_schedule(t); 594 __tasklet_hi_schedule(t);
595} 595}
596 596
597extern void __tasklet_hi_schedule_first(struct tasklet_struct *t);
598
599/*
600 * This version avoids touching any other tasklets. Needed for kmemcheck
601 * in order not to take any page faults while enqueueing this tasklet;
602 * consider VERY carefully whether you really need this or
603 * tasklet_hi_schedule()...
604 */
605static inline void tasklet_hi_schedule_first(struct tasklet_struct *t)
606{
607 if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
608 __tasklet_hi_schedule_first(t);
609}
610
611
612static inline void tasklet_disable_nosync(struct tasklet_struct *t) 597static inline void tasklet_disable_nosync(struct tasklet_struct *t)
613{ 598{
614 atomic_inc(&t->count); 599 atomic_inc(&t->count);
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 5017269e3f04..e3eb834c9a35 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -46,7 +46,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order);
46void kasan_free_pages(struct page *page, unsigned int order); 46void kasan_free_pages(struct page *page, unsigned int order);
47 47
48void kasan_cache_create(struct kmem_cache *cache, size_t *size, 48void kasan_cache_create(struct kmem_cache *cache, size_t *size,
49 unsigned long *flags); 49 slab_flags_t *flags);
50void kasan_cache_shrink(struct kmem_cache *cache); 50void kasan_cache_shrink(struct kmem_cache *cache);
51void kasan_cache_shutdown(struct kmem_cache *cache); 51void kasan_cache_shutdown(struct kmem_cache *cache);
52 52
@@ -95,7 +95,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
95 95
96static inline void kasan_cache_create(struct kmem_cache *cache, 96static inline void kasan_cache_create(struct kmem_cache *cache,
97 size_t *size, 97 size_t *size,
98 unsigned long *flags) {} 98 slab_flags_t *flags) {}
99static inline void kasan_cache_shrink(struct kmem_cache *cache) {} 99static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
100static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} 100static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
101 101
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
index 7b1d7bead7d9..ea32a7d3cf1b 100644
--- a/include/linux/kmemcheck.h
+++ b/include/linux/kmemcheck.h
@@ -1,172 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef LINUX_KMEMCHECK_H
3#define LINUX_KMEMCHECK_H
4
5#include <linux/mm_types.h>
6#include <linux/types.h>
7
8#ifdef CONFIG_KMEMCHECK
9extern int kmemcheck_enabled;
10
11/* The slab-related functions. */
12void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node);
13void kmemcheck_free_shadow(struct page *page, int order);
14void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
15 size_t size);
16void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size);
17
18void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order,
19 gfp_t gfpflags);
20
21void kmemcheck_show_pages(struct page *p, unsigned int n);
22void kmemcheck_hide_pages(struct page *p, unsigned int n);
23
24bool kmemcheck_page_is_tracked(struct page *p);
25
26void kmemcheck_mark_unallocated(void *address, unsigned int n);
27void kmemcheck_mark_uninitialized(void *address, unsigned int n);
28void kmemcheck_mark_initialized(void *address, unsigned int n);
29void kmemcheck_mark_freed(void *address, unsigned int n);
30
31void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n);
32void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n);
33void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n);
34
35int kmemcheck_show_addr(unsigned long address);
36int kmemcheck_hide_addr(unsigned long address);
37
38bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size);
39
40/*
41 * Bitfield annotations
42 *
43 * How to use: If you have a struct using bitfields, for example
44 *
45 * struct a {
46 * int x:8, y:8;
47 * };
48 *
49 * then this should be rewritten as
50 *
51 * struct a {
52 * kmemcheck_bitfield_begin(flags);
53 * int x:8, y:8;
54 * kmemcheck_bitfield_end(flags);
55 * };
56 *
57 * Now the "flags_begin" and "flags_end" members may be used to refer to the
58 * beginning and end, respectively, of the bitfield (and things like
59 * &x.flags_begin is allowed). As soon as the struct is allocated, the bit-
60 * fields should be annotated:
61 *
62 * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL);
63 * kmemcheck_annotate_bitfield(a, flags);
64 */
65#define kmemcheck_bitfield_begin(name) \
66 int name##_begin[0];
67
68#define kmemcheck_bitfield_end(name) \
69 int name##_end[0];
70
71#define kmemcheck_annotate_bitfield(ptr, name) \
72 do { \
73 int _n; \
74 \
75 if (!ptr) \
76 break; \
77 \
78 _n = (long) &((ptr)->name##_end) \
79 - (long) &((ptr)->name##_begin); \
80 BUILD_BUG_ON(_n < 0); \
81 \
82 kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \
83 } while (0)
84
85#define kmemcheck_annotate_variable(var) \
86 do { \
87 kmemcheck_mark_initialized(&(var), sizeof(var)); \
88 } while (0) \
89
90#else
91#define kmemcheck_enabled 0
92
93static inline void
94kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
95{
96}
97
98static inline void
99kmemcheck_free_shadow(struct page *page, int order)
100{
101}
102
103static inline void
104kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
105 size_t size)
106{
107}
108
109static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object,
110 size_t size)
111{
112}
113
114static inline void kmemcheck_pagealloc_alloc(struct page *p,
115 unsigned int order, gfp_t gfpflags)
116{
117}
118
119static inline bool kmemcheck_page_is_tracked(struct page *p)
120{
121 return false;
122}
123
124static inline void kmemcheck_mark_unallocated(void *address, unsigned int n)
125{
126}
127
128static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n)
129{
130}
131
132static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
133{
134}
135
136static inline void kmemcheck_mark_freed(void *address, unsigned int n)
137{
138}
139
140static inline void kmemcheck_mark_unallocated_pages(struct page *p,
141 unsigned int n)
142{
143}
144
145static inline void kmemcheck_mark_uninitialized_pages(struct page *p,
146 unsigned int n)
147{
148}
149
150static inline void kmemcheck_mark_initialized_pages(struct page *p,
151 unsigned int n)
152{
153}
154
155static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
156{
157 return true;
158}
159
160#define kmemcheck_bitfield_begin(name)
161#define kmemcheck_bitfield_end(name)
162#define kmemcheck_annotate_bitfield(ptr, name) \
163 do { \
164 } while (0)
165
166#define kmemcheck_annotate_variable(var) \
167 do { \
168 } while (0)
169
170#endif /* CONFIG_KMEMCHECK */
171
172#endif /* LINUX_KMEMCHECK_H */
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 590343f6c1b1..5ac416e2d339 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -48,14 +48,14 @@ extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref;
48extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; 48extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
49 49
50static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, 50static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
51 int min_count, unsigned long flags, 51 int min_count, slab_flags_t flags,
52 gfp_t gfp) 52 gfp_t gfp)
53{ 53{
54 if (!(flags & SLAB_NOLEAKTRACE)) 54 if (!(flags & SLAB_NOLEAKTRACE))
55 kmemleak_alloc(ptr, size, min_count, gfp); 55 kmemleak_alloc(ptr, size, min_count, gfp);
56} 56}
57 57
58static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) 58static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
59{ 59{
60 if (!(flags & SLAB_NOLEAKTRACE)) 60 if (!(flags & SLAB_NOLEAKTRACE))
61 kmemleak_free(ptr); 61 kmemleak_free(ptr);
@@ -76,7 +76,7 @@ static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
76{ 76{
77} 77}
78static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, 78static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
79 int min_count, unsigned long flags, 79 int min_count, slab_flags_t flags,
80 gfp_t gfp) 80 gfp_t gfp)
81{ 81{
82} 82}
@@ -94,7 +94,7 @@ static inline void kmemleak_free(const void *ptr)
94static inline void kmemleak_free_part(const void *ptr, size_t size) 94static inline void kmemleak_free_part(const void *ptr, size_t size)
95{ 95{
96} 96}
97static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) 97static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
98{ 98{
99} 99}
100static inline void kmemleak_free_percpu(const void __percpu *ptr) 100static inline void kmemleak_free_percpu(const void __percpu *ptr)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index bae11c7e7bf3..7ed0f7782d16 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
237 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ 237 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
238 nid, flags, p_start, p_end, p_nid) 238 nid, flags, p_start, p_end, p_nid)
239 239
240/**
241 * for_each_resv_unavail_range - iterate through reserved and unavailable memory
242 * @i: u64 used as loop variable
243 * @flags: pick from blocks based on memory attributes
244 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
245 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
246 *
247 * Walks over unavailable but reserved (reserved && !memory) areas of memblock.
248 * Available as soon as memblock is initialized.
249 * Note: because this memory does not belong to any physical node, flags and
250 * nid arguments do not make sense and thus not exported as arguments.
251 */
252#define for_each_resv_unavail_range(i, p_start, p_end) \
253 for_each_mem_range(i, &memblock.reserved, &memblock.memory, \
254 NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
255
240static inline void memblock_set_region_flags(struct memblock_region *r, 256static inline void memblock_set_region_flags(struct memblock_region *r,
241 unsigned long flags) 257 unsigned long flags)
242{ 258{
@@ -389,10 +405,10 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
389 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ 405 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
390 region++) 406 region++)
391 407
392#define for_each_memblock_type(memblock_type, rgn) \ 408#define for_each_memblock_type(i, memblock_type, rgn) \
393 for (idx = 0, rgn = &memblock_type->regions[0]; \ 409 for (i = 0, rgn = &memblock_type->regions[0]; \
394 idx < memblock_type->cnt; \ 410 i < memblock_type->cnt; \
395 idx++, rgn = &memblock_type->regions[idx]) 411 i++, rgn = &memblock_type->regions[i])
396 412
397#ifdef CONFIG_MEMTEST 413#ifdef CONFIG_MEMTEST
398extern void early_memtest(phys_addr_t start, phys_addr_t end); 414extern void early_memtest(phys_addr_t start, phys_addr_t end);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 91b46f99b4d2..c7b1d617dff6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -96,6 +96,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
96#endif 96#endif
97 97
98/* 98/*
99 * On some architectures it is expensive to call memset() for small sizes.
100 * Those architectures should provide their own implementation of "struct page"
101 * zeroing by defining this macro in <asm/pgtable.h>.
102 */
103#ifndef mm_zero_struct_page
104#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page)))
105#endif
106
107/*
99 * Default maximum number of active map areas, this limits the number of vmas 108 * Default maximum number of active map areas, this limits the number of vmas
100 * per mm struct. Users can overwrite this number by sysctl but there is a 109 * per mm struct. Users can overwrite this number by sysctl but there is a
101 * problem. 110 * problem.
@@ -1431,7 +1440,13 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
1431 struct bdi_writeback *wb); 1440 struct bdi_writeback *wb);
1432int set_page_dirty(struct page *page); 1441int set_page_dirty(struct page *page);
1433int set_page_dirty_lock(struct page *page); 1442int set_page_dirty_lock(struct page *page);
1434void cancel_dirty_page(struct page *page); 1443void __cancel_dirty_page(struct page *page);
1444static inline void cancel_dirty_page(struct page *page)
1445{
1446 /* Avoid atomic ops, locking, etc. when not actually needed. */
1447 if (PageDirty(page))
1448 __cancel_dirty_page(page);
1449}
1435int clear_page_dirty_for_io(struct page *page); 1450int clear_page_dirty_for_io(struct page *page);
1436 1451
1437int get_cmdline(struct task_struct *task, char *buffer, int buflen); 1452int get_cmdline(struct task_struct *task, char *buffer, int buflen);
@@ -1599,26 +1614,32 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
1599int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); 1614int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
1600#endif 1615#endif
1601 1616
1602#ifdef __PAGETABLE_PUD_FOLDED 1617#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
1603static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, 1618static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
1604 unsigned long address) 1619 unsigned long address)
1605{ 1620{
1606 return 0; 1621 return 0;
1607} 1622}
1623static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
1624static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
1625
1608#else 1626#else
1609int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); 1627int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
1610#endif
1611 1628
1612#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) 1629static inline void mm_inc_nr_puds(struct mm_struct *mm)
1613static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
1614 unsigned long address)
1615{ 1630{
1616 return 0; 1631 atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
1617} 1632}
1618 1633
1619static inline void mm_nr_pmds_init(struct mm_struct *mm) {} 1634static inline void mm_dec_nr_puds(struct mm_struct *mm)
1635{
1636 atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
1637}
1638#endif
1620 1639
1621static inline unsigned long mm_nr_pmds(struct mm_struct *mm) 1640#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
1641static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
1642 unsigned long address)
1622{ 1643{
1623 return 0; 1644 return 0;
1624} 1645}
@@ -1629,25 +1650,47 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
1629#else 1650#else
1630int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 1651int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
1631 1652
1632static inline void mm_nr_pmds_init(struct mm_struct *mm) 1653static inline void mm_inc_nr_pmds(struct mm_struct *mm)
1633{ 1654{
1634 atomic_long_set(&mm->nr_pmds, 0); 1655 atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
1635} 1656}
1636 1657
1637static inline unsigned long mm_nr_pmds(struct mm_struct *mm) 1658static inline void mm_dec_nr_pmds(struct mm_struct *mm)
1638{ 1659{
1639 return atomic_long_read(&mm->nr_pmds); 1660 atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
1640} 1661}
1662#endif
1641 1663
1642static inline void mm_inc_nr_pmds(struct mm_struct *mm) 1664#ifdef CONFIG_MMU
1665static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
1643{ 1666{
1644 atomic_long_inc(&mm->nr_pmds); 1667 atomic_long_set(&mm->pgtables_bytes, 0);
1645} 1668}
1646 1669
1647static inline void mm_dec_nr_pmds(struct mm_struct *mm) 1670static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
1671{
1672 return atomic_long_read(&mm->pgtables_bytes);
1673}
1674
1675static inline void mm_inc_nr_ptes(struct mm_struct *mm)
1676{
1677 atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
1678}
1679
1680static inline void mm_dec_nr_ptes(struct mm_struct *mm)
1648{ 1681{
1649 atomic_long_dec(&mm->nr_pmds); 1682 atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
1650} 1683}
1684#else
1685
1686static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
1687static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
1688{
1689 return 0;
1690}
1691
1692static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
1693static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
1651#endif 1694#endif
1652 1695
1653int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); 1696int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
@@ -2002,6 +2045,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
2002 struct mminit_pfnnid_cache *state); 2045 struct mminit_pfnnid_cache *state);
2003#endif 2046#endif
2004 2047
2048#ifdef CONFIG_HAVE_MEMBLOCK
2049void zero_resv_unavail(void);
2050#else
2051static inline void zero_resv_unavail(void) {}
2052#endif
2053
2005extern void set_dma_reserve(unsigned long new_dma_reserve); 2054extern void set_dma_reserve(unsigned long new_dma_reserve);
2006extern void memmap_init_zone(unsigned long, int, unsigned long, 2055extern void memmap_init_zone(unsigned long, int, unsigned long,
2007 unsigned long, enum memmap_context); 2056 unsigned long, enum memmap_context);
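
The separate nr_ptes/nr_pmds counters are folded into a single pgtables_bytes counter here; a sketch of how page-table setup and teardown paths are expected to account a PTE page under the new helpers (the surrounding calls are illustrative, the VmPTE expression matches the task_mmu.c hunk above):

	/* a new PTE page was installed for this mm */
	mm_inc_nr_ptes(mm);	/* adds PTRS_PER_PTE * sizeof(pte_t) bytes */

	/* ... */

	/* the PTE page is being freed again */
	mm_dec_nr_ptes(mm);

	/* what /proc/<pid>/status now reports as VmPTE */
	pr_debug("VmPTE: %lu kB\n", mm_pgtables_bytes(mm) >> 10);
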
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c85f11dafd56..cfd0ac4e5e0e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -48,8 +48,10 @@ struct page {
48 * inode address_space, or NULL. 48 * inode address_space, or NULL.
49 * If page mapped as anonymous 49 * If page mapped as anonymous
50 * memory, low bit is set, and 50 * memory, low bit is set, and
51 * it points to anon_vma object: 51 * it points to anon_vma object
52 * see PAGE_MAPPING_ANON below. 52 * or KSM private structure. See
53 * PAGE_MAPPING_ANON and
54 * PAGE_MAPPING_KSM.
53 */ 55 */
54 void *s_mem; /* slab first object */ 56 void *s_mem; /* slab first object */
55 atomic_t compound_mapcount; /* first tail page */ 57 atomic_t compound_mapcount; /* first tail page */
@@ -207,14 +209,6 @@ struct page {
207 not kmapped, ie. highmem) */ 209 not kmapped, ie. highmem) */
208#endif /* WANT_PAGE_VIRTUAL */ 210#endif /* WANT_PAGE_VIRTUAL */
209 211
210#ifdef CONFIG_KMEMCHECK
211 /*
212 * kmemcheck wants to track the status of each byte in a page; this
213 * is a pointer to such a status block. NULL if not tracked.
214 */
215 void *shadow;
216#endif
217
218#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS 212#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
219 int _last_cpupid; 213 int _last_cpupid;
220#endif 214#endif
@@ -399,9 +393,8 @@ struct mm_struct {
399 */ 393 */
400 atomic_t mm_count; 394 atomic_t mm_count;
401 395
402 atomic_long_t nr_ptes; /* PTE page table pages */ 396#ifdef CONFIG_MMU
403#if CONFIG_PGTABLE_LEVELS > 2 397 atomic_long_t pgtables_bytes; /* PTE page table pages */
404 atomic_long_t nr_pmds; /* PMD page table pages */
405#endif 398#endif
406 int map_count; /* number of VMAs */ 399 int map_count; /* number of VMAs */
407 400
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 2cf1c3c807f6..b25dc9db19fc 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -156,7 +156,8 @@ struct mmu_notifier_ops {
156 * shared page-tables, it not necessary to implement the 156 * shared page-tables, it not necessary to implement the
157 * invalidate_range_start()/end() notifiers, as 157 * invalidate_range_start()/end() notifiers, as
158 * invalidate_range() alread catches the points in time when an 158 * invalidate_range() alread catches the points in time when an
159 * external TLB range needs to be flushed. 159 * external TLB range needs to be flushed. For more in depth
160 * discussion on this see Documentation/vm/mmu_notifier.txt
160 * 161 *
161 * The invalidate_range() function is called under the ptl 162 * The invalidate_range() function is called under the ptl
162 * spin-lock and not allowed to sleep. 163 * spin-lock and not allowed to sleep.
@@ -213,7 +214,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm,
213extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 214extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
214 unsigned long start, unsigned long end); 215 unsigned long start, unsigned long end);
215extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 216extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
216 unsigned long start, unsigned long end); 217 unsigned long start, unsigned long end,
218 bool only_end);
217extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, 219extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
218 unsigned long start, unsigned long end); 220 unsigned long start, unsigned long end);
219 221
@@ -267,7 +269,14 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
267 unsigned long start, unsigned long end) 269 unsigned long start, unsigned long end)
268{ 270{
269 if (mm_has_notifiers(mm)) 271 if (mm_has_notifiers(mm))
270 __mmu_notifier_invalidate_range_end(mm, start, end); 272 __mmu_notifier_invalidate_range_end(mm, start, end, false);
273}
274
275static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm,
276 unsigned long start, unsigned long end)
277{
278 if (mm_has_notifiers(mm))
279 __mmu_notifier_invalidate_range_end(mm, start, end, true);
271} 280}
272 281
273static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, 282static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
@@ -438,6 +447,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
438{ 447{
439} 448}
440 449
450static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm,
451 unsigned long start, unsigned long end)
452{
453}
454
441static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, 455static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
442 unsigned long start, unsigned long end) 456 unsigned long start, unsigned long end)
443{ 457{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a507f43ad221..67f2e3c38939 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -700,7 +700,8 @@ typedef struct pglist_data {
700 * is the first PFN that needs to be initialised. 700 * is the first PFN that needs to be initialised.
701 */ 701 */
702 unsigned long first_deferred_pfn; 702 unsigned long first_deferred_pfn;
703 unsigned long static_init_size; 703 /* Number of non-deferred pages */
704 unsigned long static_init_pgcnt;
704#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 705#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
705 706
706#ifdef CONFIG_TRANSPARENT_HUGEPAGE 707#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -712,12 +713,6 @@ typedef struct pglist_data {
712 /* Fields commonly accessed by the page reclaim scanner */ 713 /* Fields commonly accessed by the page reclaim scanner */
713 struct lruvec lruvec; 714 struct lruvec lruvec;
714 715
715 /*
716 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
717 * this node's LRU. Maintained by the pageout code.
718 */
719 unsigned int inactive_ratio;
720
721 unsigned long flags; 716 unsigned long flags;
722 717
723 ZONE_PADDING(_pad2_) 718 ZONE_PADDING(_pad2_)
diff --git a/include/linux/net.h b/include/linux/net.h
index d97d80d7fdf8..caeb159abda5 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -22,7 +22,6 @@
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/wait.h> 23#include <linux/wait.h>
24#include <linux/fcntl.h> /* For O_CLOEXEC and O_NONBLOCK */ 24#include <linux/fcntl.h> /* For O_CLOEXEC and O_NONBLOCK */
25#include <linux/kmemcheck.h>
26#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
27#include <linux/once.h> 26#include <linux/once.h>
28#include <linux/fs.h> 27#include <linux/fs.h>
@@ -111,9 +110,7 @@ struct socket_wq {
111struct socket { 110struct socket {
112 socket_state state; 111 socket_state state;
113 112
114 kmemcheck_bitfield_begin(type);
115 short type; 113 short type;
116 kmemcheck_bitfield_end(type);
117 114
118 unsigned long flags; 115 unsigned long flags;
119 116
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index de1c50b93c61..15cab3967d6d 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -104,7 +104,9 @@ extern nodemask_t _unused_nodemask_arg_;
104 * 104 *
105 * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. 105 * Can be used to provide arguments for '%*pb[l]' when printing a nodemask.
106 */ 106 */
107#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits 107#define nodemask_pr_args(maskp) \
108 ((maskp) != NULL) ? MAX_NUMNODES : 0, \
109 ((maskp) != NULL) ? (maskp)->bits : NULL
108 110
109/* 111/*
110 * The inline keyword gives the compiler room to decide to inline, or 112 * The inline keyword gives the compiler room to decide to inline, or
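
With the NULL check folded into nodemask_pr_args(), printing a nodemask no longer requires the caller to guard against a missing mask; a small usage sketch:

	nodemask_t *nodes = NULL;	/* e.g. no node constraint recorded */

	/* expands to a zero-width field and a NULL pointer: empty list */
	pr_info("allowed nodes: %*pbl\n", nodemask_pr_args(nodes));
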
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 584b14c774c1..3ec44e27aa9d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -18,7 +18,7 @@
18 * Various page->flags bits: 18 * Various page->flags bits:
19 * 19 *
20 * PG_reserved is set for special pages, which can never be swapped out. Some 20 * PG_reserved is set for special pages, which can never be swapped out. Some
21 * of them might not even exist (eg empty_bad_page)... 21 * of them might not even exist...
22 * 22 *
23 * The PG_private bitflag is set on pagecache pages if they contain filesystem 23 * The PG_private bitflag is set on pagecache pages if they contain filesystem
24 * specific data (which is normally at page->private). It can be used by 24 * specific data (which is normally at page->private). It can be used by
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 05a04e603686..cdad58bbfd8b 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -31,7 +31,7 @@ static inline bool is_migrate_isolate(int migratetype)
31#endif 31#endif
32 32
33bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 33bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
34 bool skip_hwpoisoned_pages); 34 int migratetype, bool skip_hwpoisoned_pages);
35void set_pageblock_migratetype(struct page *page, int migratetype); 35void set_pageblock_migratetype(struct page *page, int migratetype);
36int move_freepages_block(struct zone *zone, struct page *page, 36int move_freepages_block(struct zone *zone, struct page *page,
37 int migratetype, int *num_movable); 37 int migratetype, int *num_movable);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e08b5339023c..34ce3ebf97d5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -16,6 +16,8 @@
16#include <linux/hardirq.h> /* for in_interrupt() */ 16#include <linux/hardirq.h> /* for in_interrupt() */
17#include <linux/hugetlb_inline.h> 17#include <linux/hugetlb_inline.h>
18 18
19struct pagevec;
20
19/* 21/*
20 * Bits in mapping->flags. 22 * Bits in mapping->flags.
21 */ 23 */
@@ -116,7 +118,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
116 m->gfp_mask = mask; 118 m->gfp_mask = mask;
117} 119}
118 120
119void release_pages(struct page **pages, int nr, bool cold); 121void release_pages(struct page **pages, int nr);
120 122
121/* 123/*
122 * speculatively take a reference to a page. 124 * speculatively take a reference to a page.
@@ -232,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x)
232 return __page_cache_alloc(mapping_gfp_mask(x)); 234 return __page_cache_alloc(mapping_gfp_mask(x));
233} 235}
234 236
235static inline struct page *page_cache_alloc_cold(struct address_space *x)
236{
237 return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
238}
239
240static inline gfp_t readahead_gfp_mask(struct address_space *x) 237static inline gfp_t readahead_gfp_mask(struct address_space *x)
241{ 238{
242 return mapping_gfp_mask(x) | 239 return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
243 __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
244} 240}
245 241
246typedef int filler_t(void *, struct page *); 242typedef int filler_t(void *, struct page *);
@@ -366,8 +362,16 @@ static inline unsigned find_get_pages(struct address_space *mapping,
366} 362}
367unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, 363unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
368 unsigned int nr_pages, struct page **pages); 364 unsigned int nr_pages, struct page **pages);
369unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 365unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
370 int tag, unsigned int nr_pages, struct page **pages); 366 pgoff_t end, int tag, unsigned int nr_pages,
367 struct page **pages);
368static inline unsigned find_get_pages_tag(struct address_space *mapping,
369 pgoff_t *index, int tag, unsigned int nr_pages,
370 struct page **pages)
371{
372 return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
373 nr_pages, pages);
374}
371unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 375unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
372 int tag, unsigned int nr_entries, 376 int tag, unsigned int nr_entries,
373 struct page **entries, pgoff_t *indices); 377 struct page **entries, pgoff_t *indices);
@@ -616,6 +620,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
616extern void delete_from_page_cache(struct page *page); 620extern void delete_from_page_cache(struct page *page);
617extern void __delete_from_page_cache(struct page *page, void *shadow); 621extern void __delete_from_page_cache(struct page *page, void *shadow);
618int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); 622int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
623void delete_from_page_cache_batch(struct address_space *mapping,
624 struct pagevec *pvec);
619 625
620/* 626/*
621 * Like add_to_page_cache_locked, but used to add newly allocated pages: 627 * Like add_to_page_cache_locked, but used to add newly allocated pages:
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 2636c0c0f279..5fb6580f7f23 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -17,7 +17,7 @@ struct address_space;
17 17
18struct pagevec { 18struct pagevec {
19 unsigned long nr; 19 unsigned long nr;
20 unsigned long cold; 20 bool percpu_pvec_drained;
21 struct page *pages[PAGEVEC_SIZE]; 21 struct page *pages[PAGEVEC_SIZE];
22}; 22};
23 23
@@ -38,14 +38,22 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec,
38 return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); 38 return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
39} 39}
40 40
41unsigned pagevec_lookup_tag(struct pagevec *pvec, 41unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
42 struct address_space *mapping, pgoff_t *index, int tag, 42 struct address_space *mapping, pgoff_t *index, pgoff_t end,
43 unsigned nr_pages); 43 int tag);
44unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
45 struct address_space *mapping, pgoff_t *index, pgoff_t end,
46 int tag, unsigned max_pages);
47static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
48 struct address_space *mapping, pgoff_t *index, int tag)
49{
50 return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
51}
44 52
45static inline void pagevec_init(struct pagevec *pvec, int cold) 53static inline void pagevec_init(struct pagevec *pvec)
46{ 54{
47 pvec->nr = 0; 55 pvec->nr = 0;
48 pvec->cold = cold; 56 pvec->percpu_pvec_drained = false;
49} 57}
50 58
51static inline void pagevec_reinit(struct pagevec *pvec) 59static inline void pagevec_reinit(struct pagevec *pvec)
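
A sketch of the updated pagevec usage implied by these changes: pagevec_init() no longer takes a cold argument, and tagged lookups go through the ranged helper; mapping, start and end are assumed to come from the caller.

	struct pagevec pvec;
	pgoff_t index = start;

	pagevec_init(&pvec);
	while (pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					PAGECACHE_TAG_DIRTY)) {
		unsigned i;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* ... write back or otherwise process page ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
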
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 567ebb5eaab0..0ca448c1cb42 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -301,18 +301,17 @@ void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
301void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); 301void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
302void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, 302void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
303 unsigned long index); 303 unsigned long index);
304typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); 304typedef void (*radix_tree_update_node_t)(struct radix_tree_node *);
305void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, 305void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
306 void __rcu **slot, void *entry, 306 void __rcu **slot, void *entry,
307 radix_tree_update_node_t update_node, void *private); 307 radix_tree_update_node_t update_node);
308void radix_tree_iter_replace(struct radix_tree_root *, 308void radix_tree_iter_replace(struct radix_tree_root *,
309 const struct radix_tree_iter *, void __rcu **slot, void *entry); 309 const struct radix_tree_iter *, void __rcu **slot, void *entry);
310void radix_tree_replace_slot(struct radix_tree_root *, 310void radix_tree_replace_slot(struct radix_tree_root *,
311 void __rcu **slot, void *entry); 311 void __rcu **slot, void *entry);
312void __radix_tree_delete_node(struct radix_tree_root *, 312void __radix_tree_delete_node(struct radix_tree_root *,
313 struct radix_tree_node *, 313 struct radix_tree_node *,
314 radix_tree_update_node_t update_node, 314 radix_tree_update_node_t update_node);
315 void *private);
316void radix_tree_iter_delete(struct radix_tree_root *, 315void radix_tree_iter_delete(struct radix_tree_root *,
317 struct radix_tree_iter *iter, void __rcu **slot); 316 struct radix_tree_iter *iter, void __rcu **slot);
318void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); 317void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
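A hedged sketch of the callback shape after this hunk: radix_tree_update_node_t callbacks now receive only the node, and the private pointer is gone from __radix_tree_replace() and __radix_tree_delete_node(). The callback body and example_replace() are placeholders; only the prototypes come from the patch.

#include <linux/radix-tree.h>

/* The update callback no longer gets a per-call private pointer. */
static void example_update_node(struct radix_tree_node *node)
{
	/* node-local bookkeeping only, e.g. node->private_list */
}

static void example_replace(struct radix_tree_root *root,
			    struct radix_tree_node *node,
			    void __rcu **slot, void *item)
{
	__radix_tree_replace(root, node, slot, item, example_update_node);
}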
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index fa6ace66fea5..289e4d54e3e0 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -2,7 +2,6 @@
2#ifndef _LINUX_RING_BUFFER_H 2#ifndef _LINUX_RING_BUFFER_H
3#define _LINUX_RING_BUFFER_H 3#define _LINUX_RING_BUFFER_H
4 4
5#include <linux/kmemcheck.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
7#include <linux/seq_file.h> 6#include <linux/seq_file.h>
8#include <linux/poll.h> 7#include <linux/poll.h>
@@ -14,9 +13,7 @@ struct ring_buffer_iter;
14 * Don't refer to this struct directly, use functions below. 13 * Don't refer to this struct directly, use functions below.
15 */ 14 */
16struct ring_buffer_event { 15struct ring_buffer_event {
17 kmemcheck_bitfield_begin(bitfield);
18 u32 type_len:5, time_delta:27; 16 u32 type_len:5, time_delta:27;
19 kmemcheck_bitfield_end(bitfield);
20 17
21 u32 array[]; 18 u32 array[];
22}; 19};
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 54fe91183a8e..ed06e1c28fc7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -15,7 +15,6 @@
15#define _LINUX_SKBUFF_H 15#define _LINUX_SKBUFF_H
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/kmemcheck.h>
19#include <linux/compiler.h> 18#include <linux/compiler.h>
20#include <linux/time.h> 19#include <linux/time.h>
21#include <linux/bug.h> 20#include <linux/bug.h>
@@ -711,7 +710,6 @@ struct sk_buff {
711 /* Following fields are _not_ copied in __copy_skb_header() 710 /* Following fields are _not_ copied in __copy_skb_header()
712 * Note that queue_mapping is here mostly to fill a hole. 711 * Note that queue_mapping is here mostly to fill a hole.
713 */ 712 */
714 kmemcheck_bitfield_begin(flags1);
715 __u16 queue_mapping; 713 __u16 queue_mapping;
716 714
717/* if you move cloned around you also must adapt those constants */ 715/* if you move cloned around you also must adapt those constants */
@@ -730,7 +728,6 @@ struct sk_buff {
730 head_frag:1, 728 head_frag:1,
731 xmit_more:1, 729 xmit_more:1,
732 __unused:1; /* one bit hole */ 730 __unused:1; /* one bit hole */
733 kmemcheck_bitfield_end(flags1);
734 731
735 /* fields enclosed in headers_start/headers_end are copied 732 /* fields enclosed in headers_start/headers_end are copied
736 * using a single memcpy() in __copy_skb_header() 733 * using a single memcpy() in __copy_skb_header()
@@ -2664,7 +2661,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
2664 * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to 2661 * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
2665 * code in gfp_to_alloc_flags that should be enforcing this. 2662 * code in gfp_to_alloc_flags that should be enforcing this.
2666 */ 2663 */
2667 gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC; 2664 gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
2668 2665
2669 return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); 2666 return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
2670} 2667}
diff --git a/include/linux/slab.h b/include/linux/slab.h
index af5aa65c7c18..50697a1d6621 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -21,13 +21,20 @@
21 * Flags to pass to kmem_cache_create(). 21 * Flags to pass to kmem_cache_create().
22 * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. 22 * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
23 */ 23 */
24#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */ 24/* DEBUG: Perform (expensive) checks on alloc/free */
25#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ 25#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U)
26#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ 26/* DEBUG: Red zone objs in a cache */
27#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ 27#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U)
28#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ 28/* DEBUG: Poison objects */
29#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ 29#define SLAB_POISON ((slab_flags_t __force)0x00000800U)
30#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ 30/* Align objs on cache lines */
31#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U)
32/* Use GFP_DMA memory */
33#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U)
34/* DEBUG: Store the last owner for bug hunting */
35#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U)
36/* Panic if kmem_cache_create() fails */
37#define SLAB_PANIC ((slab_flags_t __force)0x00040000U)
31/* 38/*
32 * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! 39 * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
33 * 40 *
@@ -65,44 +72,45 @@
65 * 72 *
66 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. 73 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
67 */ 74 */
68#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ 75/* Defer freeing slabs to RCU */
69#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ 76#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U)
70#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ 77/* Spread some memory over cpuset */
78#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U)
79/* Trace allocations and frees */
80#define SLAB_TRACE ((slab_flags_t __force)0x00200000U)
71 81
72/* Flag to prevent checks on free */ 82/* Flag to prevent checks on free */
73#ifdef CONFIG_DEBUG_OBJECTS 83#ifdef CONFIG_DEBUG_OBJECTS
74# define SLAB_DEBUG_OBJECTS 0x00400000UL 84# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U)
75#else 85#else
76# define SLAB_DEBUG_OBJECTS 0x00000000UL 86# define SLAB_DEBUG_OBJECTS 0
77#endif 87#endif
78 88
79#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ 89/* Avoid kmemleak tracing */
90#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U)
80 91
81/* Don't track use of uninitialized memory */ 92/* Fault injection mark */
82#ifdef CONFIG_KMEMCHECK
83# define SLAB_NOTRACK 0x01000000UL
84#else
85# define SLAB_NOTRACK 0x00000000UL
86#endif
87#ifdef CONFIG_FAILSLAB 93#ifdef CONFIG_FAILSLAB
88# define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ 94# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U)
89#else 95#else
90# define SLAB_FAILSLAB 0x00000000UL 96# define SLAB_FAILSLAB 0
91#endif 97#endif
98/* Account to memcg */
92#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) 99#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
93# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ 100# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U)
94#else 101#else
95# define SLAB_ACCOUNT 0x00000000UL 102# define SLAB_ACCOUNT 0
96#endif 103#endif
97 104
98#ifdef CONFIG_KASAN 105#ifdef CONFIG_KASAN
99#define SLAB_KASAN 0x08000000UL 106#define SLAB_KASAN ((slab_flags_t __force)0x08000000U)
100#else 107#else
101#define SLAB_KASAN 0x00000000UL 108#define SLAB_KASAN 0
102#endif 109#endif
103 110
104/* The following flags affect the page allocator grouping pages by mobility */ 111/* The following flags affect the page allocator grouping pages by mobility */
105#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 112/* Objects are reclaimable */
113#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
106#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 114#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
107/* 115/*
108 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 116 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
@@ -128,7 +136,7 @@ void __init kmem_cache_init(void);
128bool slab_is_available(void); 136bool slab_is_available(void);
129 137
130struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, 138struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
131 unsigned long, 139 slab_flags_t,
132 void (*)(void *)); 140 void (*)(void *));
133void kmem_cache_destroy(struct kmem_cache *); 141void kmem_cache_destroy(struct kmem_cache *);
134int kmem_cache_shrink(struct kmem_cache *); 142int kmem_cache_shrink(struct kmem_cache *);
@@ -459,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
459 * Also it is possible to set different flags by OR'ing 467 * Also it is possible to set different flags by OR'ing
460 * in one or more of the following additional @flags: 468 * in one or more of the following additional @flags:
461 * 469 *
462 * %__GFP_COLD - Request cache-cold pages instead of
463 * trying to return cache-warm pages.
464 *
465 * %__GFP_HIGH - This allocation has high priority and may use emergency pools. 470 * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
466 * 471 *
467 * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail 472 * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
@@ -636,6 +641,22 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
636#define kmalloc_track_caller(size, flags) \ 641#define kmalloc_track_caller(size, flags) \
637 __kmalloc_track_caller(size, flags, _RET_IP_) 642 __kmalloc_track_caller(size, flags, _RET_IP_)
638 643
644static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
645 int node)
646{
647 if (size != 0 && n > SIZE_MAX / size)
648 return NULL;
649 if (__builtin_constant_p(n) && __builtin_constant_p(size))
650 return kmalloc_node(n * size, flags, node);
651 return __kmalloc_node(n * size, flags, node);
652}
653
654static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
655{
656 return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
657}
658
659
639#ifdef CONFIG_NUMA 660#ifdef CONFIG_NUMA
640extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); 661extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
641#define kmalloc_node_track_caller(size, flags, node) \ 662#define kmalloc_node_track_caller(size, flags, node) \
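A short usage sketch for the kmalloc_array_node()/kcalloc_node() helpers added above. The helper names and signatures are from the hunk; struct example_item and example_alloc_table() are invented for illustration.

#include <linux/types.h>
#include <linux/slab.h>

struct example_item {
	u64	key;
	void	*val;
};

/* Overflow-checked, zeroed, node-local array allocation. */
static struct example_item *example_alloc_table(size_t n, int node)
{
	/* returns NULL if n * sizeof(struct example_item) would overflow */
	return kcalloc_node(n, sizeof(struct example_item), GFP_KERNEL, node);
}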
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 8f7d2b1656d2..072e46e9e1d5 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -20,7 +20,7 @@ struct kmem_cache {
20 struct reciprocal_value reciprocal_buffer_size; 20 struct reciprocal_value reciprocal_buffer_size;
21/* 2) touched by every alloc & free from the backend */ 21/* 2) touched by every alloc & free from the backend */
22 22
23 unsigned int flags; /* constant flags */ 23 slab_flags_t flags; /* constant flags */
24 unsigned int num; /* # of objs per slab */ 24 unsigned int num; /* # of objs per slab */
25 25
26/* 3) cache_grow/shrink */ 26/* 3) cache_grow/shrink */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 39fa09bcde23..0adae162dc8f 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -82,7 +82,7 @@ struct kmem_cache_order_objects {
82struct kmem_cache { 82struct kmem_cache {
83 struct kmem_cache_cpu __percpu *cpu_slab; 83 struct kmem_cache_cpu __percpu *cpu_slab;
84 /* Used for retriving partial slabs etc */ 84 /* Used for retriving partial slabs etc */
85 unsigned long flags; 85 slab_flags_t flags;
86 unsigned long min_partial; 86 unsigned long min_partial;
87 int size; /* The size of an object including meta data */ 87 int size; /* The size of an object including meta data */
88 int object_size; /* The size of an object without meta data */ 88 int object_size; /* The size of an object without meta data */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f02fb5db8914..c2b8128799c1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -171,8 +171,9 @@ enum {
171 SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ 171 SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */
172 SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ 172 SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
173 SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ 173 SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
174 SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
174 /* add others here before... */ 175 /* add others here before... */
175 SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ 176 SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */
176}; 177};
177 178
178#define SWAP_CLUSTER_MAX 32UL 179#define SWAP_CLUSTER_MAX 32UL
@@ -297,7 +298,18 @@ struct vma_swap_readahead {
297void *workingset_eviction(struct address_space *mapping, struct page *page); 298void *workingset_eviction(struct address_space *mapping, struct page *page);
298bool workingset_refault(void *shadow); 299bool workingset_refault(void *shadow);
299void workingset_activation(struct page *page); 300void workingset_activation(struct page *page);
300void workingset_update_node(struct radix_tree_node *node, void *private); 301
302/* Do not use directly, use workingset_lookup_update */
303void workingset_update_node(struct radix_tree_node *node);
304
305/* Returns workingset_update_node() if the mapping has shadow entries. */
306#define workingset_lookup_update(mapping) \
307({ \
308 radix_tree_update_node_t __helper = workingset_update_node; \
309 if (dax_mapping(mapping) || shmem_mapping(mapping)) \
310 __helper = NULL; \
311 __helper; \
312})
301 313
302/* linux/mm/page_alloc.c */ 314/* linux/mm/page_alloc.c */
303extern unsigned long totalram_pages; 315extern unsigned long totalram_pages;
@@ -462,9 +474,11 @@ extern unsigned int count_swap_pages(int, int);
462extern sector_t map_swap_page(struct page *, struct block_device **); 474extern sector_t map_swap_page(struct page *, struct block_device **);
463extern sector_t swapdev_block(int, pgoff_t); 475extern sector_t swapdev_block(int, pgoff_t);
464extern int page_swapcount(struct page *); 476extern int page_swapcount(struct page *);
477extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry);
465extern int __swp_swapcount(swp_entry_t entry); 478extern int __swp_swapcount(swp_entry_t entry);
466extern int swp_swapcount(swp_entry_t entry); 479extern int swp_swapcount(swp_entry_t entry);
467extern struct swap_info_struct *page_swap_info(struct page *); 480extern struct swap_info_struct *page_swap_info(struct page *);
481extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
468extern bool reuse_swap_page(struct page *, int *); 482extern bool reuse_swap_page(struct page *, int *);
469extern int try_to_free_swap(struct page *); 483extern int try_to_free_swap(struct page *);
470struct backing_dev_info; 484struct backing_dev_info;
@@ -473,6 +487,16 @@ extern void exit_swap_address_space(unsigned int type);
473 487
474#else /* CONFIG_SWAP */ 488#else /* CONFIG_SWAP */
475 489
490static inline int swap_readpage(struct page *page, bool do_poll)
491{
492 return 0;
493}
494
495static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
496{
497 return NULL;
498}
499
476#define swap_address_space(entry) (NULL) 500#define swap_address_space(entry) (NULL)
477#define get_nr_swap_pages() 0L 501#define get_nr_swap_pages() 0L
478#define total_swap_pages 0L 502#define total_swap_pages 0L
@@ -486,7 +510,7 @@ extern void exit_swap_address_space(unsigned int type);
486#define free_page_and_swap_cache(page) \ 510#define free_page_and_swap_cache(page) \
487 put_page(page) 511 put_page(page)
488#define free_pages_and_swap_cache(pages, nr) \ 512#define free_pages_and_swap_cache(pages, nr) \
489 release_pages((pages), (nr), false); 513 release_pages((pages), (nr));
490 514
491static inline void show_swap_cache_info(void) 515static inline void show_swap_cache_info(void)
492{ 516{
@@ -577,6 +601,11 @@ static inline int page_swapcount(struct page *page)
577 return 0; 601 return 0;
578} 602}
579 603
604static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
605{
606 return 0;
607}
608
580static inline int __swp_swapcount(swp_entry_t entry) 609static inline int __swp_swapcount(swp_entry_t entry)
581{ 610{
582 return 0; 611 return 0;
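How the new workingset_lookup_update() helper above is intended to be threaded through page-tree updates, sketched from the macro definition (it yields workingset_update_node unless the mapping is DAX or shmem). The wrapper function is illustrative; the call shape mirrors the mm/filemap.c hunk further down, and the caller is assumed to hold the mapping's tree_lock.

#include <linux/fs.h>
#include <linux/swap.h>
#include <linux/radix-tree.h>

static void example_store_entry(struct address_space *mapping,
				struct radix_tree_node *node,
				void __rcu **slot, void *entry)
{
	/* only mappings that keep shadow entries get the callback */
	__radix_tree_replace(&mapping->page_tree, node, slot, entry,
			     workingset_lookup_update(mapping));
}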
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 4bcdf00c110f..34f053a150a9 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -44,10 +44,9 @@ enum {
44#endif 44#endif
45 45
46#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) 46#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
47# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ 47# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
48 __GFP_ZERO)
49#else 48#else
50# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) 49# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT)
51#endif 50#endif
52 51
53/* 52/*
diff --git a/include/linux/types.h b/include/linux/types.h
index 34fce54e4f1b..c94d59ef96cc 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -156,6 +156,7 @@ typedef u32 dma_addr_t;
156#endif 156#endif
157 157
158typedef unsigned __bitwise gfp_t; 158typedef unsigned __bitwise gfp_t;
159typedef unsigned __bitwise slab_flags_t;
159typedef unsigned __bitwise fmode_t; 160typedef unsigned __bitwise fmode_t;
160 161
161#ifdef CONFIG_PHYS_ADDR_T_64BIT 162#ifdef CONFIG_PHYS_ADDR_T_64BIT
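slab_flags_t is a __bitwise type like gfp_t, so sparse now flags code that mixes slab flags with plain integers. A minimal sketch of a cache created with the typed flags; the cache name, object type and example_cache_init() are invented, while the kmem_cache_create() prototype is the one changed in slab.h above.

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct example_obj {
	int a;
};

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	slab_flags_t flags = SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT;

	example_cachep = kmem_cache_create("example_obj",
					   sizeof(struct example_obj), 0,
					   flags, NULL);
	return example_cachep ? 0 : -ENOMEM;
}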
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1e0cb72e0598..1779c9817b39 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -7,9 +7,19 @@
7#include <linux/mmzone.h> 7#include <linux/mmzone.h>
8#include <linux/vm_event_item.h> 8#include <linux/vm_event_item.h>
9#include <linux/atomic.h> 9#include <linux/atomic.h>
10#include <linux/static_key.h>
10 11
11extern int sysctl_stat_interval; 12extern int sysctl_stat_interval;
12 13
14#ifdef CONFIG_NUMA
15#define ENABLE_NUMA_STAT 1
16#define DISABLE_NUMA_STAT 0
17extern int sysctl_vm_numa_stat;
18DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
19extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
20 int write, void __user *buffer, size_t *length, loff_t *ppos);
21#endif
22
13#ifdef CONFIG_VM_EVENT_COUNTERS 23#ifdef CONFIG_VM_EVENT_COUNTERS
14/* 24/*
15 * Light weight per cpu counter implementation. 25 * Light weight per cpu counter implementation.
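The static key declared above lets the NUMA statistics update be skipped at run time when vm.numa_stat is disabled. A hedged sketch of the expected fast-path check; the wrapper and the elided counter update are placeholders, only vm_numa_stat_key and the sysctl come from the patch.

#include <linux/vmstat.h>
#include <linux/static_key.h>

static inline void example_count_numa_event(void)
{
#ifdef CONFIG_NUMA
	/* NUMA stats become a nop-patched branch when disabled */
	if (!static_branch_likely(&vm_numa_stat_key))
		return;
	/* ... update the per-cpu NUMA event counters here ... */
#endif
}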
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 2135c9ba6ac3..39efb968b7a4 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -17,7 +17,6 @@
17#define _INET_SOCK_H 17#define _INET_SOCK_H
18 18
19#include <linux/bitops.h> 19#include <linux/bitops.h>
20#include <linux/kmemcheck.h>
21#include <linux/string.h> 20#include <linux/string.h>
22#include <linux/types.h> 21#include <linux/types.h>
23#include <linux/jhash.h> 22#include <linux/jhash.h>
@@ -84,7 +83,6 @@ struct inet_request_sock {
84#define ireq_state req.__req_common.skc_state 83#define ireq_state req.__req_common.skc_state
85#define ireq_family req.__req_common.skc_family 84#define ireq_family req.__req_common.skc_family
86 85
87 kmemcheck_bitfield_begin(flags);
88 u16 snd_wscale : 4, 86 u16 snd_wscale : 4,
89 rcv_wscale : 4, 87 rcv_wscale : 4,
90 tstamp_ok : 1, 88 tstamp_ok : 1,
@@ -94,7 +92,6 @@ struct inet_request_sock {
94 acked : 1, 92 acked : 1,
95 no_srccheck: 1, 93 no_srccheck: 1,
96 smc_ok : 1; 94 smc_ok : 1;
97 kmemcheck_bitfield_end(flags);
98 u32 ir_mark; 95 u32 ir_mark;
99 union { 96 union {
100 struct ip_options_rcu __rcu *ireq_opt; 97 struct ip_options_rcu __rcu *ireq_opt;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 6a75d67a30fd..1356fa6a7566 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -15,8 +15,6 @@
15#ifndef _INET_TIMEWAIT_SOCK_ 15#ifndef _INET_TIMEWAIT_SOCK_
16#define _INET_TIMEWAIT_SOCK_ 16#define _INET_TIMEWAIT_SOCK_
17 17
18
19#include <linux/kmemcheck.h>
20#include <linux/list.h> 18#include <linux/list.h>
21#include <linux/timer.h> 19#include <linux/timer.h>
22#include <linux/types.h> 20#include <linux/types.h>
@@ -69,14 +67,12 @@ struct inet_timewait_sock {
69 /* Socket demultiplex comparisons on incoming packets. */ 67 /* Socket demultiplex comparisons on incoming packets. */
70 /* these three are in inet_sock */ 68 /* these three are in inet_sock */
71 __be16 tw_sport; 69 __be16 tw_sport;
72 kmemcheck_bitfield_begin(flags);
73 /* And these are ours. */ 70 /* And these are ours. */
74 unsigned int tw_kill : 1, 71 unsigned int tw_kill : 1,
75 tw_transparent : 1, 72 tw_transparent : 1,
76 tw_flowlabel : 20, 73 tw_flowlabel : 20,
77 tw_pad : 2, /* 2 bits hole */ 74 tw_pad : 2, /* 2 bits hole */
78 tw_tos : 8; 75 tw_tos : 8;
79 kmemcheck_bitfield_end(flags);
80 struct timer_list tw_timer; 76 struct timer_list tw_timer;
81 struct inet_bind_bucket *tw_tb; 77 struct inet_bind_bucket *tw_tb;
82}; 78};
diff --git a/include/net/sock.h b/include/net/sock.h
index f8715c5af37d..79e1a2c7912c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -440,7 +440,6 @@ struct sock {
440#define SK_FL_TYPE_MASK 0xffff0000 440#define SK_FL_TYPE_MASK 0xffff0000
441#endif 441#endif
442 442
443 kmemcheck_bitfield_begin(flags);
444 unsigned int sk_padding : 1, 443 unsigned int sk_padding : 1,
445 sk_kern_sock : 1, 444 sk_kern_sock : 1,
446 sk_no_check_tx : 1, 445 sk_no_check_tx : 1,
@@ -449,8 +448,6 @@ struct sock {
449 sk_protocol : 8, 448 sk_protocol : 8,
450 sk_type : 16; 449 sk_type : 16;
451#define SK_PROTOCOL_MAX U8_MAX 450#define SK_PROTOCOL_MAX U8_MAX
452 kmemcheck_bitfield_end(flags);
453
454 u16 sk_gso_max_segs; 451 u16 sk_gso_max_segs;
455 u8 sk_pacing_shift; 452 u8 sk_pacing_shift;
456 unsigned long sk_lingertime; 453 unsigned long sk_lingertime;
@@ -1114,7 +1111,7 @@ struct proto {
1114 1111
1115 struct kmem_cache *slab; 1112 struct kmem_cache *slab;
1116 unsigned int obj_size; 1113 unsigned int obj_size;
1117 int slab_flags; 1114 slab_flags_t slab_flags;
1118 1115
1119 struct percpu_counter *orphan_count; 1116 struct percpu_counter *orphan_count;
1120 1117
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 285feeadac39..eb57e3037deb 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -172,24 +172,21 @@ TRACE_EVENT(mm_page_free,
172 172
173TRACE_EVENT(mm_page_free_batched, 173TRACE_EVENT(mm_page_free_batched,
174 174
175 TP_PROTO(struct page *page, int cold), 175 TP_PROTO(struct page *page),
176 176
177 TP_ARGS(page, cold), 177 TP_ARGS(page),
178 178
179 TP_STRUCT__entry( 179 TP_STRUCT__entry(
180 __field( unsigned long, pfn ) 180 __field( unsigned long, pfn )
181 __field( int, cold )
182 ), 181 ),
183 182
184 TP_fast_assign( 183 TP_fast_assign(
185 __entry->pfn = page_to_pfn(page); 184 __entry->pfn = page_to_pfn(page);
186 __entry->cold = cold;
187 ), 185 ),
188 186
189 TP_printk("page=%p pfn=%lu order=0 cold=%d", 187 TP_printk("page=%p pfn=%lu order=0",
190 pfn_to_page(__entry->pfn), 188 pfn_to_page(__entry->pfn),
191 __entry->pfn, 189 __entry->pfn)
192 __entry->cold)
193); 190);
194 191
195TRACE_EVENT(mm_page_alloc, 192TRACE_EVENT(mm_page_alloc,
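After this hunk the batched-free tracepoint carries only the page. A sketch of the call-site shape; the loop and example_trace_batched_free() are illustrative, while the tracepoint name and its single-argument form come from the TRACE_EVENT above.

#include <linux/mm.h>
#include <trace/events/kmem.h>

static void example_trace_batched_free(struct page **pages, unsigned int nr)
{
	unsigned int i;

	for (i = 0; i < nr; i++)
		trace_mm_page_free_batched(pages[i]);	/* no cold argument */
}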
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 648cbf603736..dbe1bb058c09 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -32,7 +32,6 @@
32 {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ 32 {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \
33 {(unsigned long)__GFP_IO, "__GFP_IO"}, \ 33 {(unsigned long)__GFP_IO, "__GFP_IO"}, \
34 {(unsigned long)__GFP_FS, "__GFP_FS"}, \ 34 {(unsigned long)__GFP_FS, "__GFP_FS"}, \
35 {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \
36 {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ 35 {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \
37 {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ 36 {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \
38 {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ 37 {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \
@@ -46,7 +45,6 @@
46 {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ 45 {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \
47 {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ 46 {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \
48 {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ 47 {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \
49 {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \
50 {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ 48 {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \
51 {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ 49 {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \
52 {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ 50 {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\
diff --git a/init/Kconfig b/init/Kconfig
index 5327146db9b5..7d5a6fbac56a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1655,12 +1655,6 @@ config HAVE_GENERIC_DMA_COHERENT
1655 bool 1655 bool
1656 default n 1656 default n
1657 1657
1658config SLABINFO
1659 bool
1660 depends on PROC_FS
1661 depends on SLAB || SLUB_DEBUG
1662 default y
1663
1664config RT_MUTEXES 1658config RT_MUTEXES
1665 bool 1659 bool
1666 1660
diff --git a/init/do_mounts.c b/init/do_mounts.c
index f6d4dd764a52..7cf4f6dafd5f 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data)
380 380
381void __init mount_block_root(char *name, int flags) 381void __init mount_block_root(char *name, int flags)
382{ 382{
383 struct page *page = alloc_page(GFP_KERNEL | 383 struct page *page = alloc_page(GFP_KERNEL);
384 __GFP_NOTRACK_FALSE_POSITIVE);
385 char *fs_names = page_address(page); 384 char *fs_names = page_address(page);
386 char *p; 385 char *p;
387#ifdef CONFIG_BLOCK 386#ifdef CONFIG_BLOCK
diff --git a/init/main.c b/init/main.c
index 3bdd8da90f69..859a786f7c0a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -70,7 +70,6 @@
70#include <linux/kgdb.h> 70#include <linux/kgdb.h>
71#include <linux/ftrace.h> 71#include <linux/ftrace.h>
72#include <linux/async.h> 72#include <linux/async.h>
73#include <linux/kmemcheck.h>
74#include <linux/sfi.h> 73#include <linux/sfi.h>
75#include <linux/shmem_fs.h> 74#include <linux/shmem_fs.h>
76#include <linux/slab.h> 75#include <linux/slab.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8a6c37762330..b9f8686a84cf 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
85 if (fp == NULL) 85 if (fp == NULL)
86 return NULL; 86 return NULL;
87 87
88 kmemcheck_annotate_bitfield(fp, meta);
89
90 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); 88 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
91 if (aux == NULL) { 89 if (aux == NULL) {
92 vfree(fp); 90 vfree(fp);
@@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
127 if (fp == NULL) { 125 if (fp == NULL) {
128 __bpf_prog_uncharge(fp_old->aux->user, delta); 126 __bpf_prog_uncharge(fp_old->aux->user, delta);
129 } else { 127 } else {
130 kmemcheck_annotate_bitfield(fp, meta);
131
132 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); 128 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
133 fp->pages = pages; 129 fp->pages = pages;
134 fp->aux->prog = fp; 130 fp->aux->prog = fp;
@@ -675,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
675 671
676 fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); 672 fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
677 if (fp != NULL) { 673 if (fp != NULL) {
678 kmemcheck_annotate_bitfield(fp, meta);
679
680 /* aux->prog still points to the fp_other one, so 674 /* aux->prog still points to the fp_other one, so
681 * when promoting the clone to the real program, 675 * when promoting the clone to the real program,
682 * this still needs to be adapted. 676 * this still needs to be adapted.
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..4e55eedba8d6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -469,7 +469,7 @@ void __init fork_init(void)
469 /* create a slab on which task_structs can be allocated */ 469 /* create a slab on which task_structs can be allocated */
470 task_struct_cachep = kmem_cache_create("task_struct", 470 task_struct_cachep = kmem_cache_create("task_struct",
471 arch_task_struct_size, align, 471 arch_task_struct_size, align,
472 SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); 472 SLAB_PANIC|SLAB_ACCOUNT, NULL);
473#endif 473#endif
474 474
475 /* do the arch specific task caches init */ 475 /* do the arch specific task caches init */
@@ -817,8 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
817 init_rwsem(&mm->mmap_sem); 817 init_rwsem(&mm->mmap_sem);
818 INIT_LIST_HEAD(&mm->mmlist); 818 INIT_LIST_HEAD(&mm->mmlist);
819 mm->core_state = NULL; 819 mm->core_state = NULL;
820 atomic_long_set(&mm->nr_ptes, 0); 820 mm_pgtables_bytes_init(mm);
821 mm_nr_pmds_init(mm);
822 mm->map_count = 0; 821 mm->map_count = 0;
823 mm->locked_vm = 0; 822 mm->locked_vm = 0;
824 mm->pinned_vm = 0; 823 mm->pinned_vm = 0;
@@ -872,12 +871,9 @@ static void check_mm(struct mm_struct *mm)
872 "mm:%p idx:%d val:%ld\n", mm, i, x); 871 "mm:%p idx:%d val:%ld\n", mm, i, x);
873 } 872 }
874 873
875 if (atomic_long_read(&mm->nr_ptes)) 874 if (mm_pgtables_bytes(mm))
876 pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", 875 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
877 atomic_long_read(&mm->nr_ptes)); 876 mm_pgtables_bytes(mm));
878 if (mm_nr_pmds(mm))
879 pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
880 mm_nr_pmds(mm));
881 877
882#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 878#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
883 VM_BUG_ON_MM(mm->pmd_huge_pte, mm); 879 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
@@ -2209,18 +2205,18 @@ void __init proc_caches_init(void)
2209 sighand_cachep = kmem_cache_create("sighand_cache", 2205 sighand_cachep = kmem_cache_create("sighand_cache",
2210 sizeof(struct sighand_struct), 0, 2206 sizeof(struct sighand_struct), 0,
2211 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| 2207 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
2212 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); 2208 SLAB_ACCOUNT, sighand_ctor);
2213 signal_cachep = kmem_cache_create("signal_cache", 2209 signal_cachep = kmem_cache_create("signal_cache",
2214 sizeof(struct signal_struct), 0, 2210 sizeof(struct signal_struct), 0,
2215 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, 2211 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2216 NULL); 2212 NULL);
2217 files_cachep = kmem_cache_create("files_cache", 2213 files_cachep = kmem_cache_create("files_cache",
2218 sizeof(struct files_struct), 0, 2214 sizeof(struct files_struct), 0,
2219 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, 2215 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2220 NULL); 2216 NULL);
2221 fs_cachep = kmem_cache_create("fs_cache", 2217 fs_cachep = kmem_cache_create("fs_cache",
2222 sizeof(struct fs_struct), 0, 2218 sizeof(struct fs_struct), 0,
2223 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, 2219 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2224 NULL); 2220 NULL);
2225 /* 2221 /*
2226 * FIXME! The "sizeof(struct mm_struct)" currently includes the 2222 * FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2231,7 +2227,7 @@ void __init proc_caches_init(void)
2231 */ 2227 */
2232 mm_cachep = kmem_cache_create("mm_struct", 2228 mm_cachep = kmem_cache_create("mm_struct",
2233 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 2229 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
2234 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, 2230 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2235 NULL); 2231 NULL);
2236 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); 2232 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
2237 mmap_init(); 2233 mmap_init();
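The separate nr_ptes/nr_pmds counters used in check_mm() are folded into a single page-table byte count. A sketch of the resulting leak check, mirroring the hunk above; example_check_page_tables() itself is illustrative and mm_pgtables_bytes() is assumed to be the accessor introduced elsewhere in this series.

#include <linux/mm.h>
#include <linux/printk.h>

static void example_check_page_tables(struct mm_struct *mm)
{
	/* any leftover page-table pages show up as a non-zero byte count */
	if (mm_pgtables_bytes(mm))
		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
			 mm_pgtables_bytes(mm));
}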
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index db933d063bfc..9776da8db180 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -47,7 +47,6 @@
47#include <linux/stringify.h> 47#include <linux/stringify.h>
48#include <linux/bitops.h> 48#include <linux/bitops.h>
49#include <linux/gfp.h> 49#include <linux/gfp.h>
50#include <linux/kmemcheck.h>
51#include <linux/random.h> 50#include <linux/random.h>
52#include <linux/jhash.h> 51#include <linux/jhash.h>
53 52
@@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
3238{ 3237{
3239 int i; 3238 int i;
3240 3239
3241 kmemcheck_mark_initialized(lock, sizeof(*lock));
3242
3243 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) 3240 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
3244 lock->class_cache[i] = NULL; 3241 lock->class_cache[i] = NULL;
3245 3242
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a917a301e201..bce0464524d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1884 */ 1884 */
1885static inline int get_highmem_buffer(int safe_needed) 1885static inline int get_highmem_buffer(int safe_needed)
1886{ 1886{
1887 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); 1887 buffer = get_image_page(GFP_ATOMIC, safe_needed);
1888 return buffer ? 0 : -ENOMEM; 1888 return buffer ? 0 : -ENOMEM;
1889} 1889}
1890 1890
@@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
1945 while (nr_pages-- > 0) { 1945 while (nr_pages-- > 0) {
1946 struct page *page; 1946 struct page *page;
1947 1947
1948 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1948 page = alloc_image_page(GFP_ATOMIC);
1949 if (!page) 1949 if (!page)
1950 goto err_out; 1950 goto err_out;
1951 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1951 memory_bm_set_bit(copy_bm, page_to_pfn(page));
diff --git a/kernel/signal.c b/kernel/signal.c
index 8dcd8825b2de..aa1fb9f905db 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1036 else 1036 else
1037 override_rlimit = 0; 1037 override_rlimit = 0;
1038 1038
1039 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, 1039 q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
1040 override_rlimit);
1041 if (q) { 1040 if (q) {
1042 list_add_tail(&q->list, &pending->list); 1041 list_add_tail(&q->list, &pending->list);
1043 switch ((unsigned long) info) { 1042 switch ((unsigned long) info) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 662f7b1b7a78..2f5e87f1bae2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
486} 486}
487EXPORT_SYMBOL(__tasklet_hi_schedule); 487EXPORT_SYMBOL(__tasklet_hi_schedule);
488 488
489void __tasklet_hi_schedule_first(struct tasklet_struct *t)
490{
491 lockdep_assert_irqs_disabled();
492
493 t->next = __this_cpu_read(tasklet_hi_vec.head);
494 __this_cpu_write(tasklet_hi_vec.head, t);
495 __raise_softirq_irqoff(HI_SOFTIRQ);
496}
497EXPORT_SYMBOL(__tasklet_hi_schedule_first);
498
499static __latent_entropy void tasklet_action(struct softirq_action *a) 489static __latent_entropy void tasklet_action(struct softirq_action *a)
500{ 490{
501 struct tasklet_struct *list; 491 struct tasklet_struct *list;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9576bd582d4a..4a13a389e99b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,7 +30,6 @@
30#include <linux/proc_fs.h> 30#include <linux/proc_fs.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#include <linux/kmemcheck.h>
34#include <linux/kmemleak.h> 33#include <linux/kmemleak.h>
35#include <linux/fs.h> 34#include <linux/fs.h>
36#include <linux/init.h> 35#include <linux/init.h>
@@ -1174,15 +1173,6 @@ static struct ctl_table kern_table[] = {
1174 .extra2 = &one_thousand, 1173 .extra2 = &one_thousand,
1175 }, 1174 },
1176#endif 1175#endif
1177#ifdef CONFIG_KMEMCHECK
1178 {
1179 .procname = "kmemcheck",
1180 .data = &kmemcheck_enabled,
1181 .maxlen = sizeof(int),
1182 .mode = 0644,
1183 .proc_handler = proc_dointvec,
1184 },
1185#endif
1186 { 1176 {
1187 .procname = "panic_on_warn", 1177 .procname = "panic_on_warn",
1188 .data = &panic_on_warn, 1178 .data = &panic_on_warn,
@@ -1366,6 +1356,15 @@ static struct ctl_table vm_table[] = {
1366 .mode = 0644, 1356 .mode = 0644,
1367 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1357 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1368 }, 1358 },
1359 {
1360 .procname = "numa_stat",
1361 .data = &sysctl_vm_numa_stat,
1362 .maxlen = sizeof(int),
1363 .mode = 0644,
1364 .proc_handler = sysctl_vm_numa_stat_handler,
1365 .extra1 = &zero,
1366 .extra2 = &one,
1367 },
1369#endif 1368#endif
1370 { 1369 {
1371 .procname = "hugetlb_shm_group", 1370 .procname = "hugetlb_shm_group",
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 845f3805c73d..d57fede84b38 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -13,7 +13,6 @@
13#include <linux/uaccess.h> 13#include <linux/uaccess.h>
14#include <linux/hardirq.h> 14#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */ 15#include <linux/kthread.h> /* for self test */
16#include <linux/kmemcheck.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/percpu.h> 17#include <linux/percpu.h>
19#include <linux/mutex.h> 18#include <linux/mutex.h>
@@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
2055 } 2054 }
2056 2055
2057 event = __rb_page_index(tail_page, tail); 2056 event = __rb_page_index(tail_page, tail);
2058 kmemcheck_annotate_bitfield(event, bitfield);
2059 2057
2060 /* account for padding bytes */ 2058 /* account for padding bytes */
2061 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); 2059 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
@@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2686 /* We reserved something on the buffer */ 2684 /* We reserved something on the buffer */
2687 2685
2688 event = __rb_page_index(tail_page, tail); 2686 event = __rb_page_index(tail_page, tail);
2689 kmemcheck_annotate_bitfield(event, bitfield);
2690 rb_update_event(cpu_buffer, event, info); 2687 rb_update_event(cpu_buffer, event, info);
2691 2688
2692 local_inc(&tail_page->entries); 2689 local_inc(&tail_page->entries);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 07ce7449765a..5402e3954659 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -504,7 +504,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
504 504
505config DEBUG_SLAB 505config DEBUG_SLAB
506 bool "Debug slab memory allocations" 506 bool "Debug slab memory allocations"
507 depends on DEBUG_KERNEL && SLAB && !KMEMCHECK 507 depends on DEBUG_KERNEL && SLAB
508 help 508 help
509 Say Y here to have the kernel do limited verification on memory 509 Say Y here to have the kernel do limited verification on memory
510 allocation as well as poisoning memory on free to catch use of freed 510 allocation as well as poisoning memory on free to catch use of freed
@@ -516,7 +516,7 @@ config DEBUG_SLAB_LEAK
516 516
517config SLUB_DEBUG_ON 517config SLUB_DEBUG_ON
518 bool "SLUB debugging on by default" 518 bool "SLUB debugging on by default"
519 depends on SLUB && SLUB_DEBUG && !KMEMCHECK 519 depends on SLUB && SLUB_DEBUG
520 default n 520 default n
521 help 521 help
522 Boot with debugging on by default. SLUB boots by default with 522 Boot with debugging on by default. SLUB boots by default with
@@ -730,8 +730,6 @@ config DEBUG_STACKOVERFLOW
730 730
731 If in doubt, say "N". 731 If in doubt, say "N".
732 732
733source "lib/Kconfig.kmemcheck"
734
735source "lib/Kconfig.kasan" 733source "lib/Kconfig.kasan"
736 734
737endmenu # "Memory Debugging" 735endmenu # "Memory Debugging"
diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck
deleted file mode 100644
index 846e039a86b4..000000000000
--- a/lib/Kconfig.kmemcheck
+++ /dev/null
@@ -1,94 +0,0 @@
1config HAVE_ARCH_KMEMCHECK
2 bool
3
4if HAVE_ARCH_KMEMCHECK
5
6menuconfig KMEMCHECK
7 bool "kmemcheck: trap use of uninitialized memory"
8 depends on DEBUG_KERNEL
9 depends on !X86_USE_3DNOW
10 depends on SLUB || SLAB
11 depends on !CC_OPTIMIZE_FOR_SIZE
12 depends on !FUNCTION_TRACER
13 select FRAME_POINTER
14 select STACKTRACE
15 default n
16 help
17 This option enables tracing of dynamically allocated kernel memory
18 to see if memory is used before it has been given an initial value.
19 Be aware that this requires half of your memory for bookkeeping and
20 will insert extra code at *every* read and write to tracked memory
21 thus slow down the kernel code (but user code is unaffected).
22
23 The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable
24 or enable kmemcheck at boot-time. If the kernel is started with
25 kmemcheck=0, the large memory and CPU overhead is not incurred.
26
27choice
28 prompt "kmemcheck: default mode at boot"
29 depends on KMEMCHECK
30 default KMEMCHECK_ONESHOT_BY_DEFAULT
31 help
32 This option controls the default behaviour of kmemcheck when the
33 kernel boots and no kmemcheck= parameter is given.
34
35config KMEMCHECK_DISABLED_BY_DEFAULT
36 bool "disabled"
37 depends on KMEMCHECK
38
39config KMEMCHECK_ENABLED_BY_DEFAULT
40 bool "enabled"
41 depends on KMEMCHECK
42
43config KMEMCHECK_ONESHOT_BY_DEFAULT
44 bool "one-shot"
45 depends on KMEMCHECK
46 help
47 In one-shot mode, only the first error detected is reported before
48 kmemcheck is disabled.
49
50endchoice
51
52config KMEMCHECK_QUEUE_SIZE
53 int "kmemcheck: error queue size"
54 depends on KMEMCHECK
55 default 64
56 help
57 Select the maximum number of errors to store in the queue. Since
58 errors can occur virtually anywhere and in any context, we need a
59 temporary storage area which is guarantueed not to generate any
60 other faults. The queue will be emptied as soon as a tasklet may
61 be scheduled. If the queue is full, new error reports will be
62 lost.
63
64config KMEMCHECK_SHADOW_COPY_SHIFT
65 int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)"
66 depends on KMEMCHECK
67 range 2 8
68 default 5
69 help
70 Select the number of shadow bytes to save along with each entry of
71 the queue. These bytes indicate what parts of an allocation are
72 initialized, uninitialized, etc. and will be displayed when an
73 error is detected to help the debugging of a particular problem.
74
75config KMEMCHECK_PARTIAL_OK
76 bool "kmemcheck: allow partially uninitialized memory"
77 depends on KMEMCHECK
78 default y
79 help
80 This option works around certain GCC optimizations that produce
81 32-bit reads from 16-bit variables where the upper 16 bits are
82 thrown away afterwards. This may of course also hide some real
83 bugs.
84
85config KMEMCHECK_BITOPS_OK
86 bool "kmemcheck: allow bit-field manipulation"
87 depends on KMEMCHECK
88 default n
89 help
90 This option silences warnings that would be generated for bit-field
91 accesses where not all the bits are initialized at the same time.
92 This may also hide some real bugs.
93
94endif
diff --git a/lib/idr.c b/lib/idr.c
index edd9b2be1651..2593ce513a18 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -171,7 +171,7 @@ void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id)
171 if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) 171 if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
172 return ERR_PTR(-ENOENT); 172 return ERR_PTR(-ENOENT);
173 173
174 __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL, NULL); 174 __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL);
175 175
176 return entry; 176 return entry;
177} 177}
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8b1feca1230a..c8d55565fafa 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -677,8 +677,7 @@ out:
677 * @root radix tree root 677 * @root radix tree root
678 */ 678 */
679static inline bool radix_tree_shrink(struct radix_tree_root *root, 679static inline bool radix_tree_shrink(struct radix_tree_root *root,
680 radix_tree_update_node_t update_node, 680 radix_tree_update_node_t update_node)
681 void *private)
682{ 681{
683 bool shrunk = false; 682 bool shrunk = false;
684 683
@@ -739,7 +738,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
739 if (!radix_tree_is_internal_node(child)) { 738 if (!radix_tree_is_internal_node(child)) {
740 node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; 739 node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
741 if (update_node) 740 if (update_node)
742 update_node(node, private); 741 update_node(node);
743 } 742 }
744 743
745 WARN_ON_ONCE(!list_empty(&node->private_list)); 744 WARN_ON_ONCE(!list_empty(&node->private_list));
@@ -752,7 +751,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
752 751
753static bool delete_node(struct radix_tree_root *root, 752static bool delete_node(struct radix_tree_root *root,
754 struct radix_tree_node *node, 753 struct radix_tree_node *node,
755 radix_tree_update_node_t update_node, void *private) 754 radix_tree_update_node_t update_node)
756{ 755{
757 bool deleted = false; 756 bool deleted = false;
758 757
@@ -762,8 +761,8 @@ static bool delete_node(struct radix_tree_root *root,
762 if (node->count) { 761 if (node->count) {
763 if (node_to_entry(node) == 762 if (node_to_entry(node) ==
764 rcu_dereference_raw(root->rnode)) 763 rcu_dereference_raw(root->rnode))
765 deleted |= radix_tree_shrink(root, update_node, 764 deleted |= radix_tree_shrink(root,
766 private); 765 update_node);
767 return deleted; 766 return deleted;
768 } 767 }
769 768
@@ -1173,7 +1172,6 @@ static int calculate_count(struct radix_tree_root *root,
1173 * @slot: pointer to slot in @node 1172 * @slot: pointer to slot in @node
1174 * @item: new item to store in the slot. 1173 * @item: new item to store in the slot.
1175 * @update_node: callback for changing leaf nodes 1174 * @update_node: callback for changing leaf nodes
1176 * @private: private data to pass to @update_node
1177 * 1175 *
1178 * For use with __radix_tree_lookup(). Caller must hold tree write locked 1176 * For use with __radix_tree_lookup(). Caller must hold tree write locked
1179 * across slot lookup and replacement. 1177 * across slot lookup and replacement.
@@ -1181,7 +1179,7 @@ static int calculate_count(struct radix_tree_root *root,
1181void __radix_tree_replace(struct radix_tree_root *root, 1179void __radix_tree_replace(struct radix_tree_root *root,
1182 struct radix_tree_node *node, 1180 struct radix_tree_node *node,
1183 void __rcu **slot, void *item, 1181 void __rcu **slot, void *item,
1184 radix_tree_update_node_t update_node, void *private) 1182 radix_tree_update_node_t update_node)
1185{ 1183{
1186 void *old = rcu_dereference_raw(*slot); 1184 void *old = rcu_dereference_raw(*slot);
1187 int exceptional = !!radix_tree_exceptional_entry(item) - 1185 int exceptional = !!radix_tree_exceptional_entry(item) -
@@ -1201,9 +1199,9 @@ void __radix_tree_replace(struct radix_tree_root *root,
1201 return; 1199 return;
1202 1200
1203 if (update_node) 1201 if (update_node)
1204 update_node(node, private); 1202 update_node(node);
1205 1203
1206 delete_node(root, node, update_node, private); 1204 delete_node(root, node, update_node);
1207} 1205}
1208 1206
1209/** 1207/**
@@ -1225,7 +1223,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
1225void radix_tree_replace_slot(struct radix_tree_root *root, 1223void radix_tree_replace_slot(struct radix_tree_root *root,
1226 void __rcu **slot, void *item) 1224 void __rcu **slot, void *item)
1227{ 1225{
1228 __radix_tree_replace(root, NULL, slot, item, NULL, NULL); 1226 __radix_tree_replace(root, NULL, slot, item, NULL);
1229} 1227}
1230EXPORT_SYMBOL(radix_tree_replace_slot); 1228EXPORT_SYMBOL(radix_tree_replace_slot);
1231 1229
@@ -1242,7 +1240,7 @@ void radix_tree_iter_replace(struct radix_tree_root *root,
1242 const struct radix_tree_iter *iter, 1240 const struct radix_tree_iter *iter,
1243 void __rcu **slot, void *item) 1241 void __rcu **slot, void *item)
1244{ 1242{
1245 __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); 1243 __radix_tree_replace(root, iter->node, slot, item, NULL);
1246} 1244}
1247 1245
1248#ifdef CONFIG_RADIX_TREE_MULTIORDER 1246#ifdef CONFIG_RADIX_TREE_MULTIORDER
@@ -1972,7 +1970,6 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
1972 * @root: radix tree root 1970 * @root: radix tree root
1973 * @node: node containing @index 1971 * @node: node containing @index
1974 * @update_node: callback for changing leaf nodes 1972 * @update_node: callback for changing leaf nodes
1975 * @private: private data to pass to @update_node
1976 * 1973 *
1977 * After clearing the slot at @index in @node from radix tree 1974 * After clearing the slot at @index in @node from radix tree
1978 * rooted at @root, call this function to attempt freeing the 1975 * rooted at @root, call this function to attempt freeing the
@@ -1980,10 +1977,9 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
1980 */ 1977 */
1981void __radix_tree_delete_node(struct radix_tree_root *root, 1978void __radix_tree_delete_node(struct radix_tree_root *root,
1982 struct radix_tree_node *node, 1979 struct radix_tree_node *node,
1983 radix_tree_update_node_t update_node, 1980 radix_tree_update_node_t update_node)
1984 void *private)
1985{ 1981{
1986 delete_node(root, node, update_node, private); 1982 delete_node(root, node, update_node);
1987} 1983}
1988 1984
1989static bool __radix_tree_delete(struct radix_tree_root *root, 1985static bool __radix_tree_delete(struct radix_tree_root *root,
@@ -2001,7 +1997,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root,
2001 node_tag_clear(root, node, tag, offset); 1997 node_tag_clear(root, node, tag, offset);
2002 1998
2003 replace_slot(slot, NULL, node, -1, exceptional); 1999 replace_slot(slot, NULL, node, -1, exceptional);
2004 return node && delete_node(root, node, NULL, NULL); 2000 return node && delete_node(root, node, NULL);
2005} 2001}
2006 2002
2007/** 2003/**
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5b0adf1435de..e5e606ee5f71 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC
11 bool "Debug page memory allocations" 11 bool "Debug page memory allocations"
12 depends on DEBUG_KERNEL 12 depends on DEBUG_KERNEL
13 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC 13 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
14 depends on !KMEMCHECK
15 select PAGE_EXTENSION 14 select PAGE_EXTENSION
16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 15 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 ---help--- 16 ---help---
diff --git a/mm/Makefile b/mm/Makefile
index 4659b93cba43..e7ebd176fb93 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n
17KCOV_INSTRUMENT_page_alloc.o := n 17KCOV_INSTRUMENT_page_alloc.o := n
18KCOV_INSTRUMENT_debug-pagealloc.o := n 18KCOV_INSTRUMENT_debug-pagealloc.o := n
19KCOV_INSTRUMENT_kmemleak.o := n 19KCOV_INSTRUMENT_kmemleak.o := n
20KCOV_INSTRUMENT_kmemcheck.o := n
21KCOV_INSTRUMENT_memcontrol.o := n 20KCOV_INSTRUMENT_memcontrol.o := n
22KCOV_INSTRUMENT_mmzone.o := n 21KCOV_INSTRUMENT_mmzone.o := n
23KCOV_INSTRUMENT_vmstat.o := n 22KCOV_INSTRUMENT_vmstat.o := n
@@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o
70obj-$(CONFIG_PAGE_POISONING) += page_poison.o 69obj-$(CONFIG_PAGE_POISONING) += page_poison.o
71obj-$(CONFIG_SLAB) += slab.o 70obj-$(CONFIG_SLAB) += slab.o
72obj-$(CONFIG_SLUB) += slub.o 71obj-$(CONFIG_SLUB) += slub.o
73obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
74obj-$(CONFIG_KASAN) += kasan/ 72obj-$(CONFIG_KASAN) += kasan/
75obj-$(CONFIG_FAILSLAB) += failslab.o 73obj-$(CONFIG_FAILSLAB) += failslab.o
76obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 74obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/cma.c b/mm/cma.c
index 022e52bd8370..0607729abf3b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
461 trace_cma_alloc(pfn, page, count, align); 461 trace_cma_alloc(pfn, page, count, align);
462 462
463 if (ret && !(gfp_mask & __GFP_NOWARN)) { 463 if (ret && !(gfp_mask & __GFP_NOWARN)) {
464 pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", 464 pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
465 __func__, count, ret); 465 __func__, count, ret);
466 cma_debug_show_areas(cma); 466 cma_debug_show_areas(cma);
467 } 467 }
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..d947f3e03b0d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,7 +105,7 @@ void dump_mm(const struct mm_struct *mm)
105 "get_unmapped_area %p\n" 105 "get_unmapped_area %p\n"
106#endif 106#endif
107 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" 107 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
108 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" 108 "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
109 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" 109 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
110 "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" 110 "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
111 "start_code %lx end_code %lx start_data %lx end_data %lx\n" 111 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm)
135 mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, 135 mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
136 mm->pgd, atomic_read(&mm->mm_users), 136 mm->pgd, atomic_read(&mm->mm_users),
137 atomic_read(&mm->mm_count), 137 atomic_read(&mm->mm_count),
138 atomic_long_read((atomic_long_t *)&mm->nr_ptes), 138 mm_pgtables_bytes(mm),
139 mm_nr_pmds((struct mm_struct *)mm),
140 mm->map_count, 139 mm->map_count,
141 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, 140 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
142 mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, 141 mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/filemap.c b/mm/filemap.c
index 594d73fef8b4..923fc2ebd74a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,7 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/memcontrol.h> 36#include <linux/memcontrol.h>
37#include <linux/cleancache.h> 37#include <linux/cleancache.h>
38#include <linux/shmem_fs.h>
38#include <linux/rmap.h> 39#include <linux/rmap.h>
39#include "internal.h" 40#include "internal.h"
40 41
@@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
134 *shadowp = p; 135 *shadowp = p;
135 } 136 }
136 __radix_tree_replace(&mapping->page_tree, node, slot, page, 137 __radix_tree_replace(&mapping->page_tree, node, slot, page,
137 workingset_update_node, mapping); 138 workingset_lookup_update(mapping));
138 mapping->nrpages++; 139 mapping->nrpages++;
139 return 0; 140 return 0;
140} 141}
@@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
162 163
163 radix_tree_clear_tags(&mapping->page_tree, node, slot); 164 radix_tree_clear_tags(&mapping->page_tree, node, slot);
164 __radix_tree_replace(&mapping->page_tree, node, slot, shadow, 165 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
165 workingset_update_node, mapping); 166 workingset_lookup_update(mapping));
166 } 167 }
167 168
169 page->mapping = NULL;
170 /* Leave page->index set: truncation lookup relies upon it */
171
168 if (shadow) { 172 if (shadow) {
169 mapping->nrexceptional += nr; 173 mapping->nrexceptional += nr;
170 /* 174 /*
@@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
178 mapping->nrpages -= nr; 182 mapping->nrpages -= nr;
179} 183}
180 184
181/* 185static void unaccount_page_cache_page(struct address_space *mapping,
182 * Delete a page from the page cache and free it. Caller has to make 186 struct page *page)
183 * sure the page is locked and that nobody else uses it - or that usage
184 * is safe. The caller must hold the mapping's tree_lock.
185 */
186void __delete_from_page_cache(struct page *page, void *shadow)
187{ 187{
188 struct address_space *mapping = page->mapping; 188 int nr;
189 int nr = hpage_nr_pages(page);
190 189
191 trace_mm_filemap_delete_from_page_cache(page);
192 /* 190 /*
193 * if we're uptodate, flush out into the cleancache, otherwise 191 * if we're uptodate, flush out into the cleancache, otherwise
194 * invalidate any existing cleancache entries. We can't leave 192 * invalidate any existing cleancache entries. We can't leave
@@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow)
224 } 222 }
225 } 223 }
226 224
227 page_cache_tree_delete(mapping, page, shadow);
228
229 page->mapping = NULL;
230 /* Leave page->index set: truncation lookup relies upon it */
231
232 /* hugetlb pages do not participate in page cache accounting. */ 225 /* hugetlb pages do not participate in page cache accounting. */
233 if (PageHuge(page)) 226 if (PageHuge(page))
234 return; 227 return;
235 228
229 nr = hpage_nr_pages(page);
230
236 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); 231 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
237 if (PageSwapBacked(page)) { 232 if (PageSwapBacked(page)) {
238 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); 233 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
@@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow)
243 } 238 }
244 239
245 /* 240 /*
246 * At this point page must be either written or cleaned by truncate. 241 * At this point page must be either written or cleaned by
247 * Dirty page here signals a bug and loss of unwritten data. 242 * truncate. Dirty page here signals a bug and loss of
243 * unwritten data.
248 * 244 *
249 * This fixes dirty accounting after removing the page entirely but 245 * This fixes dirty accounting after removing the page entirely
250 * leaves PageDirty set: it has no effect for truncated page and 246 * but leaves PageDirty set: it has no effect for truncated
251 * anyway will be cleared before returning page into buddy allocator. 247 * page and anyway will be cleared before returning page into
248 * buddy allocator.
252 */ 249 */
253 if (WARN_ON_ONCE(PageDirty(page))) 250 if (WARN_ON_ONCE(PageDirty(page)))
254 account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); 251 account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
255} 252}
256 253
254/*
255 * Delete a page from the page cache and free it. Caller has to make
256 * sure the page is locked and that nobody else uses it - or that usage
257 * is safe. The caller must hold the mapping's tree_lock.
258 */
259void __delete_from_page_cache(struct page *page, void *shadow)
260{
261 struct address_space *mapping = page->mapping;
262
263 trace_mm_filemap_delete_from_page_cache(page);
264
265 unaccount_page_cache_page(mapping, page);
266 page_cache_tree_delete(mapping, page, shadow);
267}
268
269static void page_cache_free_page(struct address_space *mapping,
270 struct page *page)
271{
272 void (*freepage)(struct page *);
273
274 freepage = mapping->a_ops->freepage;
275 if (freepage)
276 freepage(page);
277
278 if (PageTransHuge(page) && !PageHuge(page)) {
279 page_ref_sub(page, HPAGE_PMD_NR);
280 VM_BUG_ON_PAGE(page_count(page) <= 0, page);
281 } else {
282 put_page(page);
283 }
284}
285
257/** 286/**
258 * delete_from_page_cache - delete page from page cache 287 * delete_from_page_cache - delete page from page cache
259 * @page: the page which the kernel is trying to remove from page cache 288 * @page: the page which the kernel is trying to remove from page cache
@@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page)
266{ 295{
267 struct address_space *mapping = page_mapping(page); 296 struct address_space *mapping = page_mapping(page);
268 unsigned long flags; 297 unsigned long flags;
269 void (*freepage)(struct page *);
270 298
271 BUG_ON(!PageLocked(page)); 299 BUG_ON(!PageLocked(page));
272
273 freepage = mapping->a_ops->freepage;
274
275 spin_lock_irqsave(&mapping->tree_lock, flags); 300 spin_lock_irqsave(&mapping->tree_lock, flags);
276 __delete_from_page_cache(page, NULL); 301 __delete_from_page_cache(page, NULL);
277 spin_unlock_irqrestore(&mapping->tree_lock, flags); 302 spin_unlock_irqrestore(&mapping->tree_lock, flags);
278 303
279 if (freepage) 304 page_cache_free_page(mapping, page);
280 freepage(page); 305}
306EXPORT_SYMBOL(delete_from_page_cache);
281 307
282 if (PageTransHuge(page) && !PageHuge(page)) { 308/*
283 page_ref_sub(page, HPAGE_PMD_NR); 309 * page_cache_tree_delete_batch - delete several pages from page cache
284 VM_BUG_ON_PAGE(page_count(page) <= 0, page); 310 * @mapping: the mapping to which pages belong
285 } else { 311 * @pvec: pagevec with pages to delete
286 put_page(page); 312 *
313 * The function walks over mapping->page_tree and removes pages passed in @pvec
314 * from the radix tree. The function expects @pvec to be sorted by page index.
315 * It tolerates holes in @pvec (radix tree entries at those indices are not
316 * modified). The function expects only THP head pages to be present in the
317 * @pvec and takes care to delete all corresponding tail pages from the radix
318 * tree as well.
319 *
320 * The function expects mapping->tree_lock to be held.
321 */
322static void
323page_cache_tree_delete_batch(struct address_space *mapping,
324 struct pagevec *pvec)
325{
326 struct radix_tree_iter iter;
327 void **slot;
328 int total_pages = 0;
329 int i = 0, tail_pages = 0;
330 struct page *page;
331 pgoff_t start;
332
333 start = pvec->pages[0]->index;
334 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
335 if (i >= pagevec_count(pvec) && !tail_pages)
336 break;
337 page = radix_tree_deref_slot_protected(slot,
338 &mapping->tree_lock);
339 if (radix_tree_exceptional_entry(page))
340 continue;
341 if (!tail_pages) {
342 /*
343 * Some page got inserted in our range? Skip it. We
344 * have our pages locked so they are protected from
345 * being removed.
346 */
347 if (page != pvec->pages[i])
348 continue;
349 WARN_ON_ONCE(!PageLocked(page));
350 if (PageTransHuge(page) && !PageHuge(page))
351 tail_pages = HPAGE_PMD_NR - 1;
352 page->mapping = NULL;
353 /*
354 * Leave page->index set: truncation lookup relies
355 * upon it
356 */
357 i++;
358 } else {
359 tail_pages--;
360 }
361 radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
362 __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
363 workingset_lookup_update(mapping));
364 total_pages++;
287 } 365 }
366 mapping->nrpages -= total_pages;
367}
368
369void delete_from_page_cache_batch(struct address_space *mapping,
370 struct pagevec *pvec)
371{
372 int i;
373 unsigned long flags;
374
375 if (!pagevec_count(pvec))
376 return;
377
378 spin_lock_irqsave(&mapping->tree_lock, flags);
379 for (i = 0; i < pagevec_count(pvec); i++) {
380 trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
381
382 unaccount_page_cache_page(mapping, pvec->pages[i]);
383 }
384 page_cache_tree_delete_batch(mapping, pvec);
385 spin_unlock_irqrestore(&mapping->tree_lock, flags);
386
387 for (i = 0; i < pagevec_count(pvec); i++)
388 page_cache_free_page(mapping, pvec->pages[i]);
288} 389}
289EXPORT_SYMBOL(delete_from_page_cache);
290 390
291int filemap_check_errors(struct address_space *mapping) 391int filemap_check_errors(struct address_space *mapping)
292{ 392{
@@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
419 if (end_byte < start_byte) 519 if (end_byte < start_byte)
420 return; 520 return;
421 521
422 pagevec_init(&pvec, 0); 522 pagevec_init(&pvec);
423 while ((index <= end) && 523 while (index <= end) {
424 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
425 PAGECACHE_TAG_WRITEBACK,
426 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
427 unsigned i; 524 unsigned i;
428 525
526 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
527 end, PAGECACHE_TAG_WRITEBACK);
528 if (!nr_pages)
529 break;
530
429 for (i = 0; i < nr_pages; i++) { 531 for (i = 0; i < nr_pages; i++) {
430 struct page *page = pvec.pages[i]; 532 struct page *page = pvec.pages[i];
431 533
432 /* until radix tree lookup accepts end_index */
433 if (page->index > end)
434 continue;
435
436 wait_on_page_writeback(page); 534 wait_on_page_writeback(page);
437 ClearPageError(page); 535 ClearPageError(page);
438 } 536 }
@@ -1754,9 +1852,10 @@ repeat:
1754EXPORT_SYMBOL(find_get_pages_contig); 1852EXPORT_SYMBOL(find_get_pages_contig);
1755 1853
1756/** 1854/**
1757 * find_get_pages_tag - find and return pages that match @tag 1855 * find_get_pages_range_tag - find and return pages in given range matching @tag
1758 * @mapping: the address_space to search 1856 * @mapping: the address_space to search
1759 * @index: the starting page index 1857 * @index: the starting page index
1858 * @end: The final page index (inclusive)
1760 * @tag: the tag index 1859 * @tag: the tag index
1761 * @nr_pages: the maximum number of pages 1860 * @nr_pages: the maximum number of pages
1762 * @pages: where the resulting pages are placed 1861 * @pages: where the resulting pages are placed
@@ -1764,8 +1863,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
1764 * Like find_get_pages, except we only return pages which are tagged with 1863 * Like find_get_pages, except we only return pages which are tagged with
1765 * @tag. We update @index to index the next page for the traversal. 1864 * @tag. We update @index to index the next page for the traversal.
1766 */ 1865 */
1767unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 1866unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
1768 int tag, unsigned int nr_pages, struct page **pages) 1867 pgoff_t end, int tag, unsigned int nr_pages,
1868 struct page **pages)
1769{ 1869{
1770 struct radix_tree_iter iter; 1870 struct radix_tree_iter iter;
1771 void **slot; 1871 void **slot;
@@ -1778,6 +1878,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
1778 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1878 radix_tree_for_each_tagged(slot, &mapping->page_tree,
1779 &iter, *index, tag) { 1879 &iter, *index, tag) {
1780 struct page *head, *page; 1880 struct page *head, *page;
1881
1882 if (iter.index > end)
1883 break;
1781repeat: 1884repeat:
1782 page = radix_tree_deref_slot(slot); 1885 page = radix_tree_deref_slot(slot);
1783 if (unlikely(!page)) 1886 if (unlikely(!page))
@@ -1819,18 +1922,28 @@ repeat:
1819 } 1922 }
1820 1923
1821 pages[ret] = page; 1924 pages[ret] = page;
1822 if (++ret == nr_pages) 1925 if (++ret == nr_pages) {
1823 break; 1926 *index = pages[ret - 1]->index + 1;
1927 goto out;
1928 }
1824 } 1929 }
1825 1930
1931 /*
1932 * We come here when we have reached @end. We take care not to overflow
1933 * the index @index as it confuses some of the callers. This breaks the
1934 * iteration when there is a page at index -1 but that is already broken
1935 * anyway.
1936 */
1937 if (end == (pgoff_t)-1)
1938 *index = (pgoff_t)-1;
1939 else
1940 *index = end + 1;
1941out:
1826 rcu_read_unlock(); 1942 rcu_read_unlock();
1827 1943
1828 if (ret)
1829 *index = pages[ret - 1]->index + 1;
1830
1831 return ret; 1944 return ret;
1832} 1945}
1833EXPORT_SYMBOL(find_get_pages_tag); 1946EXPORT_SYMBOL(find_get_pages_range_tag);
1834 1947
1835/** 1948/**
1836 * find_get_entries_tag - find and return entries that match @tag 1949 * find_get_entries_tag - find and return entries that match @tag
@@ -2159,7 +2272,7 @@ no_cached_page:
2159 * Ok, it wasn't cached, so we need to create a new 2272 * Ok, it wasn't cached, so we need to create a new
2160 * page.. 2273 * page..
2161 */ 2274 */
2162 page = page_cache_alloc_cold(mapping); 2275 page = page_cache_alloc(mapping);
2163 if (!page) { 2276 if (!page) {
2164 error = -ENOMEM; 2277 error = -ENOMEM;
2165 goto out; 2278 goto out;
@@ -2271,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
2271 int ret; 2384 int ret;
2272 2385
2273 do { 2386 do {
2274 page = __page_cache_alloc(gfp_mask|__GFP_COLD); 2387 page = __page_cache_alloc(gfp_mask);
2275 if (!page) 2388 if (!page)
2276 return -ENOMEM; 2389 return -ENOMEM;
2277 2390
@@ -2675,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
2675repeat: 2788repeat:
2676 page = find_get_page(mapping, index); 2789 page = find_get_page(mapping, index);
2677 if (!page) { 2790 if (!page) {
2678 page = __page_cache_alloc(gfp | __GFP_COLD); 2791 page = __page_cache_alloc(gfp);
2679 if (!page) 2792 if (!page)
2680 return ERR_PTR(-ENOMEM); 2793 return ERR_PTR(-ENOMEM);
2681 err = add_to_page_cache_lru(page, mapping, index, gfp); 2794 err = add_to_page_cache_lru(page, mapping, index, gfp);
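
The new delete_from_page_cache_batch()/page_cache_tree_delete_batch() pair above amortizes tree_lock: per-page unaccounting and radix-tree removal happen under a single lock acquisition, and the freepage callback runs afterwards without the lock held. The following user-space sketch only models that two-phase shape; demo_cache, unaccount_item() and free_item() are invented stand-ins for this sketch, not page-cache APIs.

#include <pthread.h>
#include <stdio.h>

struct demo_cache {
	pthread_mutex_t lock;
	int nr_items;
};

static void unaccount_item(struct demo_cache *c, int item)
{
	/* cheap bookkeeping; must run under c->lock */
	c->nr_items--;
	printf("unaccounted item %d, %d left\n", item, c->nr_items);
}

static void free_item(int item)
{
	/* potentially expensive teardown; runs without the lock held */
	printf("freed item %d\n", item);
}

static void delete_batch(struct demo_cache *c, const int *items, int n)
{
	int i;

	pthread_mutex_lock(&c->lock);
	for (i = 0; i < n; i++)
		unaccount_item(c, items[i]);
	pthread_mutex_unlock(&c->lock);

	/* one lock round-trip for the whole batch, freeing done outside */
	for (i = 0; i < n; i++)
		free_item(items[i]);
}

int main(void)
{
	struct demo_cache c = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr_items = 3 };
	int items[] = { 10, 11, 12 };

	delete_batch(&c, items, 3);
	return 0;
}
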
diff --git a/mm/hmm.c b/mm/hmm.c
index a88a847bccba..ea19742a5d60 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
803 803
804static void hmm_devmem_radix_release(struct resource *resource) 804static void hmm_devmem_radix_release(struct resource *resource)
805{ 805{
806 resource_size_t key, align_start, align_size, align_end; 806 resource_size_t key, align_start, align_size;
807 807
808 align_start = resource->start & ~(PA_SECTION_SIZE - 1); 808 align_start = resource->start & ~(PA_SECTION_SIZE - 1);
809 align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); 809 align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
810 align_end = align_start + align_size - 1;
811 810
812 mutex_lock(&hmm_devmem_lock); 811 mutex_lock(&hmm_devmem_lock);
813 for (key = resource->start; 812 for (key = resource->start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 003f7bcd0952..86fe697e8bfb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
606 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); 606 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
607 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); 607 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
608 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 608 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
609 atomic_long_inc(&vma->vm_mm->nr_ptes); 609 mm_inc_nr_ptes(vma->vm_mm);
610 spin_unlock(vmf->ptl); 610 spin_unlock(vmf->ptl);
611 count_vm_event(THP_FAULT_ALLOC); 611 count_vm_event(THP_FAULT_ALLOC);
612 } 612 }
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
662 if (pgtable) 662 if (pgtable)
663 pgtable_trans_huge_deposit(mm, pmd, pgtable); 663 pgtable_trans_huge_deposit(mm, pmd, pgtable);
664 set_pmd_at(mm, haddr, pmd, entry); 664 set_pmd_at(mm, haddr, pmd, entry);
665 atomic_long_inc(&mm->nr_ptes); 665 mm_inc_nr_ptes(mm);
666 return true; 666 return true;
667} 667}
668 668
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
747 747
748 if (pgtable) { 748 if (pgtable) {
749 pgtable_trans_huge_deposit(mm, pmd, pgtable); 749 pgtable_trans_huge_deposit(mm, pmd, pgtable);
750 atomic_long_inc(&mm->nr_ptes); 750 mm_inc_nr_ptes(mm);
751 } 751 }
752 752
753 set_pmd_at(mm, addr, pmd, entry); 753 set_pmd_at(mm, addr, pmd, entry);
@@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
942 set_pmd_at(src_mm, addr, src_pmd, pmd); 942 set_pmd_at(src_mm, addr, src_pmd, pmd);
943 } 943 }
944 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 944 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
945 atomic_long_inc(&dst_mm->nr_ptes); 945 mm_inc_nr_ptes(dst_mm);
946 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 946 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
947 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 947 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
948 ret = 0; 948 ret = 0;
@@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
978 get_page(src_page); 978 get_page(src_page);
979 page_dup_rmap(src_page, true); 979 page_dup_rmap(src_page, true);
980 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 980 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
981 atomic_long_inc(&dst_mm->nr_ptes); 981 mm_inc_nr_ptes(dst_mm);
982 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 982 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
983 983
984 pmdp_set_wrprotect(src_mm, addr, src_pmd); 984 pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
1189 goto out_free_pages; 1189 goto out_free_pages;
1190 VM_BUG_ON_PAGE(!PageHead(page), page); 1190 VM_BUG_ON_PAGE(!PageHead(page), page);
1191 1191
1192 /*
1193 * Leave pmd empty until pte is filled. Note that we must notify here as
1194 * concurrent CPU thread might write to new page before the call to
1195 * mmu_notifier_invalidate_range_end() happens which can lead to a
1196 * device seeing memory write in different order than CPU.
1197 *
1198 * See Documentation/vm/mmu_notifier.txt
1199 */
1192 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); 1200 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1193 /* leave pmd empty until pte is filled */
1194 1201
1195 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); 1202 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
1196 pmd_populate(vma->vm_mm, &_pmd, pgtable); 1203 pmd_populate(vma->vm_mm, &_pmd, pgtable);
@@ -1216,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
1216 page_remove_rmap(page, true); 1223 page_remove_rmap(page, true);
1217 spin_unlock(vmf->ptl); 1224 spin_unlock(vmf->ptl);
1218 1225
1219 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1226 /*
1227 * No need to double call mmu_notifier->invalidate_range() callback as
1228 * the above pmdp_huge_clear_flush_notify() did already call it.
1229 */
1230 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
1231 mmun_end);
1220 1232
1221 ret |= VM_FAULT_WRITE; 1233 ret |= VM_FAULT_WRITE;
1222 put_page(page); 1234 put_page(page);
@@ -1365,7 +1377,12 @@ alloc:
1365 } 1377 }
1366 spin_unlock(vmf->ptl); 1378 spin_unlock(vmf->ptl);
1367out_mn: 1379out_mn:
1368 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1380 /*
1381 * No need to double call mmu_notifier->invalidate_range() callback as
1382 * the above pmdp_huge_clear_flush_notify() did already call it.
1383 */
1384 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
1385 mmun_end);
1369out: 1386out:
1370 return ret; 1387 return ret;
1371out_unlock: 1388out_unlock:
@@ -1678,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1678 1695
1679 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1696 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1680 pte_free(mm, pgtable); 1697 pte_free(mm, pgtable);
1681 atomic_long_dec(&mm->nr_ptes); 1698 mm_dec_nr_ptes(mm);
1682} 1699}
1683 1700
1684int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1701int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2017,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2017 2034
2018out: 2035out:
2019 spin_unlock(ptl); 2036 spin_unlock(ptl);
2020 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); 2037 /*
2038 * No need to double call mmu_notifier->invalidate_range() callback as
2039 * the above pudp_huge_clear_flush_notify() did already call it.
2040 */
2041 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
2042 HPAGE_PUD_SIZE);
2021} 2043}
2022#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2044#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2023 2045
@@ -2029,8 +2051,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2029 pmd_t _pmd; 2051 pmd_t _pmd;
2030 int i; 2052 int i;
2031 2053
2032 /* leave pmd empty until pte is filled */ 2054 /*
2033 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2055 * Leave pmd empty until pte is filled note that it is fine to delay
2056 * notification until mmu_notifier_invalidate_range_end() as we are
2057 * replacing a zero pmd write protected page with a zero pte write
2058 * protected page.
2059 *
2060 * See Documentation/vm/mmu_notifier.txt
2061 */
2062 pmdp_huge_clear_flush(vma, haddr, pmd);
2034 2063
2035 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2064 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2036 pmd_populate(mm, &_pmd, pgtable); 2065 pmd_populate(mm, &_pmd, pgtable);
@@ -2085,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2085 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); 2114 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
2086 return; 2115 return;
2087 } else if (is_huge_zero_pmd(*pmd)) { 2116 } else if (is_huge_zero_pmd(*pmd)) {
2117 /*
2118 * FIXME: Do we want to invalidate the secondary mmu by calling
2119 * mmu_notifier_invalidate_range()? See the comments below inside
2120 * __split_huge_pmd().
2121 *
2122 * We are going from a write protected zero huge page to a write
2123 * protected zero small page, so it does not seem useful to
2124 * invalidate the secondary mmu at this time.
2125 */
2088 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2126 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2089 } 2127 }
2090 2128
@@ -2220,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2220 __split_huge_pmd_locked(vma, pmd, haddr, freeze); 2258 __split_huge_pmd_locked(vma, pmd, haddr, freeze);
2221out: 2259out:
2222 spin_unlock(ptl); 2260 spin_unlock(ptl);
2223 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); 2261 /*
2262 * No need to double call mmu_notifier->invalidate_range() callback.
2263 * There are 3 cases to consider inside __split_huge_pmd_locked():
2264 * 1) pmdp_huge_clear_flush_notify() calls invalidate_range(); obvious
2265 * 2) __split_huge_zero_page_pmd() splits a read only zero page and any
2266 * write fault will trigger a flush_notify before pointing to a new page
2267 * (it is fine if the secondary mmu keeps pointing to the old zero
2268 * page in the meantime)
2269 * 3) Split a huge pmd into ptes pointing to the same page. No need
2270 * to invalidate secondary tlb entries; they are all still valid.
2271 * Any further changes to individual ptes will notify. So no need
2272 * to call mmu_notifier->invalidate_range()
2273 */
2274 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
2275 HPAGE_PMD_SIZE);
2224} 2276}
2225 2277
2226void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2278void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
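
Several hunks above replace mmu_notifier_invalidate_range_end() with mmu_notifier_invalidate_range_only_end() because the preceding *_clear_flush_notify() call already delivered the range invalidation. A toy model of that "skip the duplicate callback" idea is sketched below; demo_notifier and range_end() are illustrative names and deliberately ignore the real notifier registration and locking.

#include <stdbool.h>
#include <stdio.h>

struct demo_notifier {
	void (*invalidate_range)(unsigned long start, unsigned long end);
};

static void demo_invalidate(unsigned long start, unsigned long end)
{
	printf("invalidate [%#lx, %#lx)\n", start, end);
}

static void range_end(struct demo_notifier *n, unsigned long start,
		      unsigned long end, bool only_end)
{
	/* only_end == true: the flush path already called invalidate_range() */
	if (!only_end)
		n->invalidate_range(start, end);
	printf("range end  [%#lx, %#lx)\n", start, end);
}

int main(void)
{
	struct demo_notifier n = { demo_invalidate };

	/* the clear-and-flush path already notified, so skip the second call */
	n.invalidate_range(0x1000, 0x2000);
	range_end(&n, 0x1000, 0x2000, true);
	return 0;
}
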
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d2ff5e8bf2b..681b300185c0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3256 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 3256 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
3257 } else { 3257 } else {
3258 if (cow) { 3258 if (cow) {
3259 /*
3260 * No need to notify as we are downgrading page
3261 * table protection, not changing it to point
3262 * to a new page.
3263 *
3264 * See Documentation/vm/mmu_notifier.txt
3265 */
3259 huge_ptep_set_wrprotect(src, addr, src_pte); 3266 huge_ptep_set_wrprotect(src, addr, src_pte);
3260 mmu_notifier_invalidate_range(src, mmun_start,
3261 mmun_end);
3262 } 3267 }
3263 entry = huge_ptep_get(src_pte); 3268 entry = huge_ptep_get(src_pte);
3264 ptepage = pte_page(entry); 3269 ptepage = pte_page(entry);
@@ -4318,7 +4323,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4318 * and that page table be reused and filled with junk. 4323 * and that page table be reused and filled with junk.
4319 */ 4324 */
4320 flush_hugetlb_tlb_range(vma, start, end); 4325 flush_hugetlb_tlb_range(vma, start, end);
4321 mmu_notifier_invalidate_range(mm, start, end); 4326 /*
4327 * No need to call mmu_notifier_invalidate_range() we are downgrading
4328 * page table protection not changing it to point to a new page.
4329 *
4330 * See Documentation/vm/mmu_notifier.txt
4331 */
4322 i_mmap_unlock_write(vma->vm_file->f_mapping); 4332 i_mmap_unlock_write(vma->vm_file->f_mapping);
4323 mmu_notifier_invalidate_range_end(mm, start, end); 4333 mmu_notifier_invalidate_range_end(mm, start, end);
4324 4334
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6f319fb81718..405bba487df5 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size)
337} 337}
338 338
339void kasan_cache_create(struct kmem_cache *cache, size_t *size, 339void kasan_cache_create(struct kmem_cache *cache, size_t *size,
340 unsigned long *flags) 340 slab_flags_t *flags)
341{ 341{
342 int redzone_adjust; 342 int redzone_adjust;
343 int orig_size = *size; 343 int orig_size = *size;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43cb3043311b..ea4ff259b671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1270 _pmd = pmdp_collapse_flush(vma, addr, pmd); 1270 _pmd = pmdp_collapse_flush(vma, addr, pmd);
1271 spin_unlock(ptl); 1271 spin_unlock(ptl);
1272 up_write(&vma->vm_mm->mmap_sem); 1272 up_write(&vma->vm_mm->mmap_sem);
1273 atomic_long_dec(&vma->vm_mm->nr_ptes); 1273 mm_dec_nr_ptes(vma->vm_mm);
1274 pte_free(vma->vm_mm, pmd_pgtable(_pmd)); 1274 pte_free(vma->vm_mm, pmd_pgtable(_pmd));
1275 } 1275 }
1276 } 1276 }
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 800d64b854ea..cec594032515 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -1,126 +1 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/gfp.h>
3#include <linux/mm_types.h>
4#include <linux/mm.h>
5#include <linux/slab.h>
6#include "slab.h"
7#include <linux/kmemcheck.h>
8
9void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
10{
11 struct page *shadow;
12 int pages;
13 int i;
14
15 pages = 1 << order;
16
17 /*
18 * With kmemcheck enabled, we need to allocate a memory area for the
19 * shadow bits as well.
20 */
21 shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
22 if (!shadow) {
23 if (printk_ratelimit())
24 pr_err("kmemcheck: failed to allocate shadow bitmap\n");
25 return;
26 }
27
28 for(i = 0; i < pages; ++i)
29 page[i].shadow = page_address(&shadow[i]);
30
31 /*
32 * Mark it as non-present for the MMU so that our accesses to
33 * this memory will trigger a page fault and let us analyze
34 * the memory accesses.
35 */
36 kmemcheck_hide_pages(page, pages);
37}
38
39void kmemcheck_free_shadow(struct page *page, int order)
40{
41 struct page *shadow;
42 int pages;
43 int i;
44
45 if (!kmemcheck_page_is_tracked(page))
46 return;
47
48 pages = 1 << order;
49
50 kmemcheck_show_pages(page, pages);
51
52 shadow = virt_to_page(page[0].shadow);
53
54 for(i = 0; i < pages; ++i)
55 page[i].shadow = NULL;
56
57 __free_pages(shadow, order);
58}
59
60void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
61 size_t size)
62{
63 if (unlikely(!object)) /* Skip object if allocation failed */
64 return;
65
66 /*
67 * Has already been memset(), which initializes the shadow for us
68 * as well.
69 */
70 if (gfpflags & __GFP_ZERO)
71 return;
72
73 /* No need to initialize the shadow of a non-tracked slab. */
74 if (s->flags & SLAB_NOTRACK)
75 return;
76
77 if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
78 /*
79 * Allow notracked objects to be allocated from
80 * tracked caches. Note however that these objects
81 * will still get page faults on access, they just
82 * won't ever be flagged as uninitialized. If page
83 * faults are not acceptable, the slab cache itself
84 * should be marked NOTRACK.
85 */
86 kmemcheck_mark_initialized(object, size);
87 } else if (!s->ctor) {
88 /*
89 * New objects should be marked uninitialized before
90 * they're returned to the called.
91 */
92 kmemcheck_mark_uninitialized(object, size);
93 }
94}
95
96void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
97{
98 /* TODO: RCU freeing is unsupported for now; hide false positives. */
99 if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
100 kmemcheck_mark_freed(object, size);
101}
102
103void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
104 gfp_t gfpflags)
105{
106 int pages;
107
108 if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
109 return;
110
111 pages = 1 << order;
112
113 /*
114 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
115 * can become uninitialized by copying uninitialized memory
116 * into them.
117 */
118
119 /* XXX: Can use zone->node for node? */
120 kmemcheck_alloc_shadow(page, order, gfpflags, -1);
121
122 if (gfpflags & __GFP_ZERO)
123 kmemcheck_mark_initialized_pages(page, pages);
124 else
125 kmemcheck_mark_uninitialized_pages(page, pages);
126}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 7780cd83a495..e4738d5e9b8c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -110,7 +110,6 @@
110#include <linux/atomic.h> 110#include <linux/atomic.h>
111 111
112#include <linux/kasan.h> 112#include <linux/kasan.h>
113#include <linux/kmemcheck.h>
114#include <linux/kmemleak.h> 113#include <linux/kmemleak.h>
115#include <linux/memory_hotplug.h> 114#include <linux/memory_hotplug.h>
116 115
@@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object)
1238{ 1237{
1239 u32 old_csum = object->checksum; 1238 u32 old_csum = object->checksum;
1240 1239
1241 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
1242 return false;
1243
1244 kasan_disable_current(); 1240 kasan_disable_current();
1245 object->checksum = crc32(0, (void *)object->pointer, object->size); 1241 object->checksum = crc32(0, (void *)object->pointer, object->size);
1246 kasan_enable_current(); 1242 kasan_enable_current();
@@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end,
1314 if (scan_should_stop()) 1310 if (scan_should_stop())
1315 break; 1311 break;
1316 1312
1317 /* don't scan uninitialized memory */
1318 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
1319 BYTES_PER_POINTER))
1320 continue;
1321
1322 kasan_disable_current(); 1313 kasan_disable_current();
1323 pointer = *ptr; 1314 pointer = *ptr;
1324 kasan_enable_current(); 1315 kasan_enable_current();
@@ -2104,7 +2095,7 @@ static int __init kmemleak_late_init(void)
2104 return -ENOMEM; 2095 return -ENOMEM;
2105 } 2096 }
2106 2097
2107 dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, 2098 dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL,
2108 &kmemleak_fops); 2099 &kmemleak_fops);
2109 if (!dentry) 2100 if (!dentry)
2110 pr_warn("Failed to create the debugfs kmemleak file\n"); 2101 pr_warn("Failed to create the debugfs kmemleak file\n");
diff --git a/mm/ksm.c b/mm/ksm.c
index 6cb60f46cce5..be8f4576f842 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1052 * So we clear the pte and flush the tlb before the check 1052 * So we clear the pte and flush the tlb before the check
1053 * this assure us that no O_DIRECT can happen after the check 1053 * this assure us that no O_DIRECT can happen after the check
1054 * or in the middle of the check. 1054 * or in the middle of the check.
1055 *
1056 * No need to notify as we are downgrading the page table to read
1057 * only, not changing it to point to a new page.
1058 *
1059 * See Documentation/vm/mmu_notifier.txt
1055 */ 1060 */
1056 entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); 1061 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1057 /* 1062 /*
1058 * Check that no O_DIRECT or similar I/O is in progress on the 1063 * Check that no O_DIRECT or similar I/O is in progress on the
1059 * page 1064 * page
@@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1136 } 1141 }
1137 1142
1138 flush_cache_page(vma, addr, pte_pfn(*ptep)); 1143 flush_cache_page(vma, addr, pte_pfn(*ptep));
1139 ptep_clear_flush_notify(vma, addr, ptep); 1144 /*
1145 * No need to notify as we are replacing a read only page with another
1146 * read only page with the same content.
1147 *
1148 * See Documentation/vm/mmu_notifier.txt
1149 */
1150 ptep_clear_flush(vma, addr, ptep);
1140 set_pte_at_notify(mm, addr, ptep, newpte); 1151 set_pte_at_notify(mm, addr, ptep, newpte);
1141 1152
1142 page_remove_rmap(page, false); 1153 page_remove_rmap(page, false);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f141f0c80ff3..fd41e969ede5 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -221,6 +221,7 @@ restart:
221 switch (ret) { 221 switch (ret) {
222 case LRU_REMOVED_RETRY: 222 case LRU_REMOVED_RETRY:
223 assert_spin_locked(&nlru->lock); 223 assert_spin_locked(&nlru->lock);
224 /* fall through */
224 case LRU_REMOVED: 225 case LRU_REMOVED:
225 isolated++; 226 isolated++;
226 nlru->nr_items--; 227 nlru->nr_items--;
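
The list_lru.c change only adds a comment so that compilers and static checkers that warn about implicit switch fall-through know the missing break is intentional. A stand-alone example of the same annotation, with made-up case values, might look like this:

#include <stdio.h>

static int classify(int ret)
{
	int isolated = 0;

	switch (ret) {
	case 2:			/* e.g. a "removed, retry" result */
		printf("retry requested\n");
		/* fall through */
	case 1:			/* e.g. a plain "removed" result */
		isolated++;
		break;
	default:
		break;
	}
	return isolated;
}

int main(void)
{
	printf("%d %d\n", classify(2), classify(1));
	return 0;
}
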
diff --git a/mm/memblock.c b/mm/memblock.c
index 91205780e6b1..46aacdfa4f4d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -533,7 +533,7 @@ repeat:
533 base = obase; 533 base = obase;
534 nr_new = 0; 534 nr_new = 0;
535 535
536 for_each_memblock_type(type, rgn) { 536 for_each_memblock_type(idx, type, rgn) {
537 phys_addr_t rbase = rgn->base; 537 phys_addr_t rbase = rgn->base;
538 phys_addr_t rend = rbase + rgn->size; 538 phys_addr_t rend = rbase + rgn->size;
539 539
@@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
637 if (memblock_double_array(type, base, size) < 0) 637 if (memblock_double_array(type, base, size) < 0)
638 return -ENOMEM; 638 return -ENOMEM;
639 639
640 for_each_memblock_type(type, rgn) { 640 for_each_memblock_type(idx, type, rgn) {
641 phys_addr_t rbase = rgn->base; 641 phys_addr_t rbase = rgn->base;
642 phys_addr_t rend = rbase + rgn->size; 642 phys_addr_t rend = rbase + rgn->size;
643 643
@@ -1327,7 +1327,6 @@ again:
1327 return NULL; 1327 return NULL;
1328done: 1328done:
1329 ptr = phys_to_virt(alloc); 1329 ptr = phys_to_virt(alloc);
1330 memset(ptr, 0, size);
1331 1330
1332 /* 1331 /*
1333 * The min_count is set to 0 so that bootmem allocated blocks 1332 * The min_count is set to 0 so that bootmem allocated blocks
@@ -1341,6 +1340,45 @@ done:
1341} 1340}
1342 1341
1343/** 1342/**
1343 * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
1344 * memory and without panicking
1345 * @size: size of memory block to be allocated in bytes
1346 * @align: alignment of the region and block's size
1347 * @min_addr: the lower bound of the memory region from where the allocation
1348 * is preferred (phys address)
1349 * @max_addr: the upper bound of the memory region from where the allocation
1350 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1351 * allocate only from memory limited by memblock.current_limit value
1352 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1353 *
1354 * Public function, provides additional debug information (including caller
1355 * info), if enabled. Does not zero allocated memory, does not panic if request
1356 * cannot be satisfied.
1357 *
1358 * RETURNS:
1359 * Virtual address of allocated memory block on success, NULL on failure.
1360 */
1361void * __init memblock_virt_alloc_try_nid_raw(
1362 phys_addr_t size, phys_addr_t align,
1363 phys_addr_t min_addr, phys_addr_t max_addr,
1364 int nid)
1365{
1366 void *ptr;
1367
1368 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1369 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1370 (u64)max_addr, (void *)_RET_IP_);
1371
1372 ptr = memblock_virt_alloc_internal(size, align,
1373 min_addr, max_addr, nid);
1374#ifdef CONFIG_DEBUG_VM
1375 if (ptr && size > 0)
1376 memset(ptr, 0xff, size);
1377#endif
1378 return ptr;
1379}
1380
1381/**
1344 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block 1382 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
1345 * @size: size of memory block to be allocated in bytes 1383 * @size: size of memory block to be allocated in bytes
1346 * @align: alignment of the region and block's size 1384 * @align: alignment of the region and block's size
@@ -1351,8 +1389,8 @@ done:
1351 * allocate only from memory limited by memblock.current_limit value 1389 * allocate only from memory limited by memblock.current_limit value
1352 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 1390 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1353 * 1391 *
1354 * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides 1392 * Public function, provides additional debug information (including caller
1355 * additional debug information (including caller info), if enabled. 1393 * info), if enabled. This function zeroes the allocated memory.
1356 * 1394 *
1357 * RETURNS: 1395 * RETURNS:
1358 * Virtual address of allocated memory block on success, NULL on failure. 1396 * Virtual address of allocated memory block on success, NULL on failure.
@@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
1362 phys_addr_t min_addr, phys_addr_t max_addr, 1400 phys_addr_t min_addr, phys_addr_t max_addr,
1363 int nid) 1401 int nid)
1364{ 1402{
1403 void *ptr;
1404
1365 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", 1405 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1366 __func__, (u64)size, (u64)align, nid, (u64)min_addr, 1406 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1367 (u64)max_addr, (void *)_RET_IP_); 1407 (u64)max_addr, (void *)_RET_IP_);
1368 return memblock_virt_alloc_internal(size, align, min_addr, 1408
1369 max_addr, nid); 1409 ptr = memblock_virt_alloc_internal(size, align,
1410 min_addr, max_addr, nid);
1411 if (ptr)
1412 memset(ptr, 0, size);
1413 return ptr;
1370} 1414}
1371 1415
1372/** 1416/**
@@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
1380 * allocate only from memory limited by memblock.current_limit value 1424 * allocate only from memory limited by memblock.current_limit value
1381 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 1425 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1382 * 1426 *
1383 * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() 1427 * Public panicking version of memblock_virt_alloc_try_nid_nopanic()
1384 * which provides debug information (including caller info), if enabled, 1428 * which provides debug information (including caller info), if enabled,
1385 * and panics if the request can not be satisfied. 1429 * and panics if the request can not be satisfied.
1386 * 1430 *
@@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid(
1399 (u64)max_addr, (void *)_RET_IP_); 1443 (u64)max_addr, (void *)_RET_IP_);
1400 ptr = memblock_virt_alloc_internal(size, align, 1444 ptr = memblock_virt_alloc_internal(size, align,
1401 min_addr, max_addr, nid); 1445 min_addr, max_addr, nid);
1402 if (ptr) 1446 if (ptr) {
1447 memset(ptr, 0, size);
1403 return ptr; 1448 return ptr;
1449 }
1404 1450
1405 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", 1451 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
1406 __func__, (u64)size, (u64)align, nid, (u64)min_addr, 1452 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
@@ -1715,7 +1761,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
1715 1761
1716 pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); 1762 pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt);
1717 1763
1718 for_each_memblock_type(type, rgn) { 1764 for_each_memblock_type(idx, type, rgn) {
1719 char nid_buf[32] = ""; 1765 char nid_buf[32] = "";
1720 1766
1721 base = rgn->base; 1767 base = rgn->base;
@@ -1739,7 +1785,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
1739 unsigned long size = 0; 1785 unsigned long size = 0;
1740 int idx; 1786 int idx;
1741 1787
1742 for_each_memblock_type((&memblock.reserved), rgn) { 1788 for_each_memblock_type(idx, (&memblock.reserved), rgn) {
1743 phys_addr_t start, end; 1789 phys_addr_t start, end;
1744 1790
1745 if (rgn->base + rgn->size < start_addr) 1791 if (rgn->base + rgn->size < start_addr)
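
memblock now grows a _raw allocator that skips zeroing (and, under CONFIG_DEBUG_VM, poisons the block with 0xff so uninitialized use stands out), while the _nopanic and panicking variants keep zeroing in their wrappers. The sketch below mirrors that split in plain user-space C with malloc() standing in for memblock; demo_alloc_raw()/demo_alloc_zeroed() are assumed names for illustration only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef NDEBUG
#define DEMO_DEBUG 1
#else
#define DEMO_DEBUG 0
#endif

static void *demo_alloc_raw(size_t size)
{
	void *ptr = malloc(size);

	/* like the _raw variant: no zeroing; poison in debug builds so
	 * stale reads show up as an obvious 0xff pattern */
	if (DEMO_DEBUG && ptr && size)
		memset(ptr, 0xff, size);
	return ptr;
}

static void *demo_alloc_zeroed(size_t size)
{
	void *ptr = demo_alloc_raw(size);

	if (ptr)
		memset(ptr, 0, size);	/* zeroing now lives in the wrapper */
	return ptr;
}

int main(void)
{
	unsigned char *raw = demo_alloc_raw(8);
	unsigned char *zeroed = demo_alloc_zeroed(8);

	if (raw && zeroed) {
		if (DEMO_DEBUG)
			printf("raw[0]=0x%02x ", raw[0]);
		printf("zeroed[0]=0x%02x\n", zeroed[0]);
	}
	free(raw);
	free(zeroed);
	return 0;
}
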
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661f046ad318..50e6906314f8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4049,7 +4049,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
4049 .write = mem_cgroup_reset, 4049 .write = mem_cgroup_reset,
4050 .read_u64 = mem_cgroup_read_u64, 4050 .read_u64 = mem_cgroup_read_u64,
4051 }, 4051 },
4052#ifdef CONFIG_SLABINFO 4052#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4053 { 4053 {
4054 .name = "kmem.slabinfo", 4054 .name = "kmem.slabinfo",
4055 .seq_start = memcg_slab_start, 4055 .seq_start = memcg_slab_start,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 88366626c0b7..4acdf393a801 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1587,7 +1587,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1587 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1587 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1588 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1588 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1589 if (ret) { 1589 if (ret) {
1590 pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", 1590 pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
1591 pfn, ret, page->flags, &page->flags); 1591 pfn, ret, page->flags, &page->flags);
1592 if (!list_empty(&pagelist)) 1592 if (!list_empty(&pagelist))
1593 putback_movable_pages(&pagelist); 1593 putback_movable_pages(&pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index cae514e7dcfc..85e7a87da79f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
438 pgtable_t token = pmd_pgtable(*pmd); 438 pgtable_t token = pmd_pgtable(*pmd);
439 pmd_clear(pmd); 439 pmd_clear(pmd);
440 pte_free_tlb(tlb, token, addr); 440 pte_free_tlb(tlb, token, addr);
441 atomic_long_dec(&tlb->mm->nr_ptes); 441 mm_dec_nr_ptes(tlb->mm);
442} 442}
443 443
444static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 444static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
506 pud = pud_offset(p4d, start); 506 pud = pud_offset(p4d, start);
507 p4d_clear(p4d); 507 p4d_clear(p4d);
508 pud_free_tlb(tlb, pud, start); 508 pud_free_tlb(tlb, pud, start);
509 mm_dec_nr_puds(tlb->mm);
509} 510}
510 511
511static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, 512static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -665,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
665 666
666 ptl = pmd_lock(mm, pmd); 667 ptl = pmd_lock(mm, pmd);
667 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 668 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
668 atomic_long_inc(&mm->nr_ptes); 669 mm_inc_nr_ptes(mm);
669 pmd_populate(mm, pmd, new); 670 pmd_populate(mm, pmd, new);
670 new = NULL; 671 new = NULL;
671 } 672 }
@@ -2554,7 +2555,11 @@ static int wp_page_copy(struct vm_fault *vmf)
2554 put_page(new_page); 2555 put_page(new_page);
2555 2556
2556 pte_unmap_unlock(vmf->pte, vmf->ptl); 2557 pte_unmap_unlock(vmf->pte, vmf->ptl);
2557 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2558 /*
2559 * No need to double call mmu_notifier->invalidate_range() callback as
2560 * the above ptep_clear_flush_notify() did already call it.
2561 */
2562 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
2558 if (old_page) { 2563 if (old_page) {
2559 /* 2564 /*
2560 * Don't let another task, with possibly unlocked vma, 2565 * Don't let another task, with possibly unlocked vma,
@@ -2842,7 +2847,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
2842int do_swap_page(struct vm_fault *vmf) 2847int do_swap_page(struct vm_fault *vmf)
2843{ 2848{
2844 struct vm_area_struct *vma = vmf->vma; 2849 struct vm_area_struct *vma = vmf->vma;
2845 struct page *page = NULL, *swapcache; 2850 struct page *page = NULL, *swapcache = NULL;
2846 struct mem_cgroup *memcg; 2851 struct mem_cgroup *memcg;
2847 struct vma_swap_readahead swap_ra; 2852 struct vma_swap_readahead swap_ra;
2848 swp_entry_t entry; 2853 swp_entry_t entry;
@@ -2881,17 +2886,36 @@ int do_swap_page(struct vm_fault *vmf)
2881 } 2886 }
2882 goto out; 2887 goto out;
2883 } 2888 }
2889
2890
2884 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2891 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2885 if (!page) 2892 if (!page)
2886 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, 2893 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
2887 vmf->address); 2894 vmf->address);
2888 if (!page) { 2895 if (!page) {
2889 if (vma_readahead) 2896 struct swap_info_struct *si = swp_swap_info(entry);
2890 page = do_swap_page_readahead(entry, 2897
2891 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); 2898 if (si->flags & SWP_SYNCHRONOUS_IO &&
2892 else 2899 __swap_count(si, entry) == 1) {
2893 page = swapin_readahead(entry, 2900 /* skip swapcache */
2894 GFP_HIGHUSER_MOVABLE, vma, vmf->address); 2901 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2902 if (page) {
2903 __SetPageLocked(page);
2904 __SetPageSwapBacked(page);
2905 set_page_private(page, entry.val);
2906 lru_cache_add_anon(page);
2907 swap_readpage(page, true);
2908 }
2909 } else {
2910 if (vma_readahead)
2911 page = do_swap_page_readahead(entry,
2912 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
2913 else
2914 page = swapin_readahead(entry,
2915 GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2916 swapcache = page;
2917 }
2918
2895 if (!page) { 2919 if (!page) {
2896 /* 2920 /*
2897 * Back out if somebody else faulted in this pte 2921 * Back out if somebody else faulted in this pte
@@ -2920,7 +2944,6 @@ int do_swap_page(struct vm_fault *vmf)
2920 goto out_release; 2944 goto out_release;
2921 } 2945 }
2922 2946
2923 swapcache = page;
2924 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); 2947 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2925 2948
2926 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2949 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2935,7 +2958,8 @@ int do_swap_page(struct vm_fault *vmf)
2935 * test below, are not enough to exclude that. Even if it is still 2958 * test below, are not enough to exclude that. Even if it is still
2936 * swapcache, we need to check that the page's swap has not changed. 2959 * swapcache, we need to check that the page's swap has not changed.
2937 */ 2960 */
2938 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2961 if (unlikely((!PageSwapCache(page) ||
2962 page_private(page) != entry.val)) && swapcache)
2939 goto out_page; 2963 goto out_page;
2940 2964
2941 page = ksm_might_need_to_copy(page, vma, vmf->address); 2965 page = ksm_might_need_to_copy(page, vma, vmf->address);
@@ -2988,14 +3012,16 @@ int do_swap_page(struct vm_fault *vmf)
2988 pte = pte_mksoft_dirty(pte); 3012 pte = pte_mksoft_dirty(pte);
2989 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); 3013 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2990 vmf->orig_pte = pte; 3014 vmf->orig_pte = pte;
2991 if (page == swapcache) { 3015
2992 do_page_add_anon_rmap(page, vma, vmf->address, exclusive); 3016 /* ksm created a completely new copy */
2993 mem_cgroup_commit_charge(page, memcg, true, false); 3017 if (unlikely(page != swapcache && swapcache)) {
2994 activate_page(page);
2995 } else { /* ksm created a completely new copy */
2996 page_add_new_anon_rmap(page, vma, vmf->address, false); 3018 page_add_new_anon_rmap(page, vma, vmf->address, false);
2997 mem_cgroup_commit_charge(page, memcg, false, false); 3019 mem_cgroup_commit_charge(page, memcg, false, false);
2998 lru_cache_add_active_or_unevictable(page, vma); 3020 lru_cache_add_active_or_unevictable(page, vma);
3021 } else {
3022 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3023 mem_cgroup_commit_charge(page, memcg, true, false);
3024 activate_page(page);
2999 } 3025 }
3000 3026
3001 swap_free(entry); 3027 swap_free(entry);
@@ -3003,7 +3029,7 @@ int do_swap_page(struct vm_fault *vmf)
3003 (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3029 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3004 try_to_free_swap(page); 3030 try_to_free_swap(page);
3005 unlock_page(page); 3031 unlock_page(page);
3006 if (page != swapcache) { 3032 if (page != swapcache && swapcache) {
3007 /* 3033 /*
3008 * Hold the lock to avoid the swap entry to be reused 3034 * Hold the lock to avoid the swap entry to be reused
3009 * until we take the PT lock for the pte_same() check 3035 * until we take the PT lock for the pte_same() check
@@ -3036,7 +3062,7 @@ out_page:
3036 unlock_page(page); 3062 unlock_page(page);
3037out_release: 3063out_release:
3038 put_page(page); 3064 put_page(page);
3039 if (page != swapcache) { 3065 if (page != swapcache && swapcache) {
3040 unlock_page(swapcache); 3066 unlock_page(swapcache);
3041 put_page(swapcache); 3067 put_page(swapcache);
3042 } 3068 }
@@ -3212,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
3212 goto map_pte; 3238 goto map_pte;
3213 } 3239 }
3214 3240
3215 atomic_long_inc(&vma->vm_mm->nr_ptes); 3241 mm_inc_nr_ptes(vma->vm_mm);
3216 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); 3242 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3217 spin_unlock(vmf->ptl); 3243 spin_unlock(vmf->ptl);
3218 vmf->prealloc_pte = NULL; 3244 vmf->prealloc_pte = NULL;
@@ -3271,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
3271 * We are going to consume the prealloc table, 3297 * We are going to consume the prealloc table,
3272 * count that as nr_ptes. 3298 * count that as nr_ptes.
3273 */ 3299 */
3274 atomic_long_inc(&vma->vm_mm->nr_ptes); 3300 mm_inc_nr_ptes(vma->vm_mm);
3275 vmf->prealloc_pte = NULL; 3301 vmf->prealloc_pte = NULL;
3276} 3302}
3277 3303
@@ -4124,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4124 4150
4125 spin_lock(&mm->page_table_lock); 4151 spin_lock(&mm->page_table_lock);
4126#ifndef __ARCH_HAS_5LEVEL_HACK 4152#ifndef __ARCH_HAS_5LEVEL_HACK
4127 if (p4d_present(*p4d)) /* Another has populated it */ 4153 if (!p4d_present(*p4d)) {
4128 pud_free(mm, new); 4154 mm_inc_nr_puds(mm);
4129 else
4130 p4d_populate(mm, p4d, new); 4155 p4d_populate(mm, p4d, new);
4131#else 4156 } else /* Another has populated it */
4132 if (pgd_present(*p4d)) /* Another has populated it */
4133 pud_free(mm, new); 4157 pud_free(mm, new);
4134 else 4158#else
4159 if (!pgd_present(*p4d)) {
4160 mm_inc_nr_puds(mm);
4135 pgd_populate(mm, p4d, new); 4161 pgd_populate(mm, p4d, new);
4162 } else /* Another has populated it */
4163 pud_free(mm, new);
4136#endif /* __ARCH_HAS_5LEVEL_HACK */ 4164#endif /* __ARCH_HAS_5LEVEL_HACK */
4137 spin_unlock(&mm->page_table_lock); 4165 spin_unlock(&mm->page_table_lock);
4138 return 0; 4166 return 0;
@@ -4457,17 +4485,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
4457 struct vm_area_struct *vma; 4485 struct vm_area_struct *vma;
4458 4486
4459 /* 4487 /*
4460 * Do not print if we are in atomic 4488 * We might be running from an atomic context, so we cannot sleep
4461 * contexts (in exception stacks, etc.):
4462 */ 4489 */
4463 if (preempt_count()) 4490 if (!down_read_trylock(&mm->mmap_sem))
4464 return; 4491 return;
4465 4492
4466 down_read(&mm->mmap_sem);
4467 vma = find_vma(mm, ip); 4493 vma = find_vma(mm, ip);
4468 if (vma && vma->vm_file) { 4494 if (vma && vma->vm_file) {
4469 struct file *f = vma->vm_file; 4495 struct file *f = vma->vm_file;
4470 char *buf = (char *)__get_free_page(GFP_KERNEL); 4496 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4471 if (buf) { 4497 if (buf) {
4472 char *p; 4498 char *p;
4473 4499
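
Among the mm/memory.c changes above, print_vma_addr() stops guessing about atomic context via preempt_count(): it now tries the lock and gives up if it would have to sleep, and allocates with GFP_NOWAIT. A user-space sketch of that "try, and bail out rather than block" pattern follows; the rwlock and demo_print_addr() are stand-ins for this sketch, not the kernel's mmap_sem handling.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t demo_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static void demo_print_addr(unsigned long ip)
{
	char *buf;

	/* we may be called from a context that must not sleep: only
	 * proceed if the lock can be taken immediately */
	if (pthread_rwlock_tryrdlock(&demo_mmap_sem) != 0)
		return;

	buf = malloc(128);	/* stands in for a non-sleeping allocation */
	if (buf) {
		snprintf(buf, 128, "ip %#lx", ip);
		printf("%s\n", buf);
		free(buf);
	}
	pthread_rwlock_unlock(&demo_mmap_sem);
}

int main(void)
{
	demo_print_addr(0xdeadbeefUL);

	/* with the lock write-held, the helper silently skips printing */
	pthread_rwlock_wrlock(&demo_mmap_sem);
	demo_print_addr(0xcafef00dUL);
	pthread_rwlock_unlock(&demo_mmap_sem);
	return 0;
}
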
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4b5f29906b9..c52aa05b106c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -265,7 +265,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
265 /* 265 /*
266 * Make all the pages reserved so that nobody will stumble over half 266 * Make all the pages reserved so that nobody will stumble over half
267 * initialized state. 267 * initialized state.
268 * FIXME: We also have to associate it with a node because pfn_to_node 268 * FIXME: We also have to associate it with a node because page_to_nid
269 * relies on having page with the proper node. 269 * relies on having page with the proper node.
270 */ 270 */
271 for (i = 0; i < PAGES_PER_SECTION; i++) { 271 for (i = 0; i < PAGES_PER_SECTION; i++) {
@@ -1590,11 +1590,11 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
1590} 1590}
1591 1591
1592static int __ref __offline_pages(unsigned long start_pfn, 1592static int __ref __offline_pages(unsigned long start_pfn,
1593 unsigned long end_pfn, unsigned long timeout) 1593 unsigned long end_pfn)
1594{ 1594{
1595 unsigned long pfn, nr_pages, expire; 1595 unsigned long pfn, nr_pages;
1596 long offlined_pages; 1596 long offlined_pages;
1597 int ret, drain, retry_max, node; 1597 int ret, node;
1598 unsigned long flags; 1598 unsigned long flags;
1599 unsigned long valid_start, valid_end; 1599 unsigned long valid_start, valid_end;
1600 struct zone *zone; 1600 struct zone *zone;
@@ -1630,44 +1630,22 @@ static int __ref __offline_pages(unsigned long start_pfn,
1630 goto failed_removal; 1630 goto failed_removal;
1631 1631
1632 pfn = start_pfn; 1632 pfn = start_pfn;
1633 expire = jiffies + timeout;
1634 drain = 0;
1635 retry_max = 5;
1636repeat: 1633repeat:
1637 /* start memory hot removal */ 1634 /* start memory hot removal */
1638 ret = -EAGAIN;
1639 if (time_after(jiffies, expire))
1640 goto failed_removal;
1641 ret = -EINTR; 1635 ret = -EINTR;
1642 if (signal_pending(current)) 1636 if (signal_pending(current))
1643 goto failed_removal; 1637 goto failed_removal;
1644 ret = 0; 1638
1645 if (drain) { 1639 cond_resched();
1646 lru_add_drain_all_cpuslocked(); 1640 lru_add_drain_all_cpuslocked();
1647 cond_resched(); 1641 drain_all_pages(zone);
1648 drain_all_pages(zone);
1649 }
1650 1642
1651 pfn = scan_movable_pages(start_pfn, end_pfn); 1643 pfn = scan_movable_pages(start_pfn, end_pfn);
1652 if (pfn) { /* We have movable pages */ 1644 if (pfn) { /* We have movable pages */
1653 ret = do_migrate_range(pfn, end_pfn); 1645 ret = do_migrate_range(pfn, end_pfn);
1654 if (!ret) { 1646 goto repeat;
1655 drain = 1;
1656 goto repeat;
1657 } else {
1658 if (ret < 0)
1659 if (--retry_max == 0)
1660 goto failed_removal;
1661 yield();
1662 drain = 1;
1663 goto repeat;
1664 }
1665 } 1647 }
1666 /* drain all zone's lru pagevec, this is asynchronous... */ 1648
1667 lru_add_drain_all_cpuslocked();
1668 yield();
1669 /* drain pcp pages, this is synchronous. */
1670 drain_all_pages(zone);
1671 /* 1649 /*
1672 * dissolve free hugepages in the memory block before doing offlining 1650 * dissolve free hugepages in the memory block before doing offlining
1673 * actually in order to make hugetlbfs's object counting consistent. 1651 * actually in order to make hugetlbfs's object counting consistent.
@@ -1677,10 +1655,8 @@ repeat:
1677 goto failed_removal; 1655 goto failed_removal;
1678 /* check again */ 1656 /* check again */
1679 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1657 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1680 if (offlined_pages < 0) { 1658 if (offlined_pages < 0)
1681 ret = -EBUSY; 1659 goto repeat;
1682 goto failed_removal;
1683 }
1684 pr_info("Offlined Pages %ld\n", offlined_pages); 1660 pr_info("Offlined Pages %ld\n", offlined_pages);
1685 /* Ok, all of our target is isolated. 1661 /* Ok, all of our target is isolated.
1686 We cannot do rollback at this point. */ 1662 We cannot do rollback at this point. */
@@ -1728,7 +1704,7 @@ failed_removal:
1728/* Must be protected by mem_hotplug_begin() or a device_lock */ 1704/* Must be protected by mem_hotplug_begin() or a device_lock */
1729int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1705int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1730{ 1706{
1731 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1707 return __offline_pages(start_pfn, start_pfn + nr_pages);
1732} 1708}
1733#endif /* CONFIG_MEMORY_HOTREMOVE */ 1709#endif /* CONFIG_MEMORY_HOTREMOVE */
1734 1710
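With the timeout argument gone, the only ways out of the retry loop above are success, a hard failure from the isolation path, or a pending signal. A condensed sketch of that loop shape (hugepage dissolution and the failure paths are omitted; this is illustrative only, not the exact kernel code):

	do {
		if (signal_pending(current))
			return -EINTR;			/* user interrupted the offline */
		cond_resched();
		lru_add_drain_all_cpuslocked();		/* flush per-cpu LRU pagevecs */
		drain_all_pages(zone);			/* flush per-cpu free lists */
		pfn = scan_movable_pages(start_pfn, end_pfn);
		if (pfn)				/* movable pages remain: migrate them */
			do_migrate_range(pfn, end_pfn);
	} while (check_pages_isolated(start_pfn, end_pfn) < 0);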
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a2af6d58a68f..4ce44d3ff03d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,6 +85,7 @@
85#include <linux/interrupt.h> 85#include <linux/interrupt.h>
86#include <linux/init.h> 86#include <linux/init.h>
87#include <linux/compat.h> 87#include <linux/compat.h>
88#include <linux/ptrace.h>
88#include <linux/swap.h> 89#include <linux/swap.h>
89#include <linux/seq_file.h> 90#include <linux/seq_file.h>
90#include <linux/proc_fs.h> 91#include <linux/proc_fs.h>
@@ -1365,7 +1366,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1365 const unsigned long __user *, old_nodes, 1366 const unsigned long __user *, old_nodes,
1366 const unsigned long __user *, new_nodes) 1367 const unsigned long __user *, new_nodes)
1367{ 1368{
1368 const struct cred *cred = current_cred(), *tcred;
1369 struct mm_struct *mm = NULL; 1369 struct mm_struct *mm = NULL;
1370 struct task_struct *task; 1370 struct task_struct *task;
1371 nodemask_t task_nodes; 1371 nodemask_t task_nodes;
@@ -1401,15 +1401,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1401 err = -EINVAL; 1401 err = -EINVAL;
1402 1402
1403 /* 1403 /*
1404 * Check if this process has the right to modify the specified 1404 * Check if this process has the right to modify the specified process.
1405 * process. The right exists if the process has administrative 1405 * Use the regular "ptrace_may_access()" checks.
1406 * capabilities, superuser privileges or the same
1407 * userid as the target process.
1408 */ 1406 */
1409 tcred = __task_cred(task); 1407 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1410 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1411 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1412 !capable(CAP_SYS_NICE)) {
1413 rcu_read_unlock(); 1408 rcu_read_unlock();
1414 err = -EPERM; 1409 err = -EPERM;
1415 goto out_put; 1410 goto out_put;
@@ -1920,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1920 struct page *page; 1915 struct page *page;
1921 1916
1922 page = __alloc_pages(gfp, order, nid); 1917 page = __alloc_pages(gfp, order, nid);
1918 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
1919 if (!static_branch_likely(&vm_numa_stat_key))
1920 return page;
1923 if (page && page_to_nid(page) == nid) { 1921 if (page && page_to_nid(page) == nid) {
1924 preempt_disable(); 1922 preempt_disable();
1925 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); 1923 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
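The open-coded euid/uid/CAP_SYS_NICE comparison above is replaced by the common ptrace check. A minimal caller-side sketch of the new pattern, with the task lookup and error handling condensed (the exact error flow in the syscall differs slightly; this is an assumption-laden outline, not the full function):

	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		return -ESRCH;
	}
	get_task_struct(task);

	/* one helper now covers "matching credentials or ptrace capability" */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();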
diff --git a/mm/mempool.c b/mm/mempool.c
index c4a23cdae3f0..7d8c5a0010a2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -189,7 +189,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
189 pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); 189 pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
190 if (!pool) 190 if (!pool)
191 return NULL; 191 return NULL;
192 pool->elements = kmalloc_node(min_nr * sizeof(void *), 192 pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
193 gfp_mask, node_id); 193 gfp_mask, node_id);
194 if (!pool->elements) { 194 if (!pool->elements) {
195 kfree(pool); 195 kfree(pool);
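kmalloc_array_node(), added in this series, folds the element-count multiplication into the allocator so it can be checked for overflow. Caller-side sketch (names other than the allocator call are hypothetical):

	void **elements;

	/* overflow-checked equivalent of kmalloc_node(nr * sizeof(void *), ...) */
	elements = kmalloc_array_node(nr, sizeof(void *), GFP_KERNEL, node_id);
	if (!elements)
		return -ENOMEM;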
diff --git a/mm/migrate.c b/mm/migrate.c
index 1236449b4777..4d0be47a322a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
2089 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 2089 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
2090 2090
2091 spin_unlock(ptl); 2091 spin_unlock(ptl);
2092 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2092 /*
 2093	 * No need to call the mmu_notifier->invalidate_range() callback twice, as
 2094	 * the above pmdp_huge_clear_flush_notify() has already called it.
2095 */
2096 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
2093 2097
2094 /* Take an "isolate" reference and put new page on the LRU. */ 2098 /* Take an "isolate" reference and put new page on the LRU. */
2095 get_page(new_page); 2099 get_page(new_page);
@@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2805 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2809 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2806 } 2810 }
2807 2811
2812 /*
 2813	 * No need to call the mmu_notifier->invalidate_range() callback twice, as
 2814	 * the ptep_clear_flush_notify() inside migrate_vma_insert_page() above
 2815	 * has already called it.
2816 */
2808 if (notified) 2817 if (notified)
2809 mmu_notifier_invalidate_range_end(mm, mmu_start, 2818 mmu_notifier_invalidate_range_only_end(mm, mmu_start,
2810 migrate->end); 2819 migrate->end);
2811} 2820}
2812 2821
2813/* 2822/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 46af369c13e5..30472d438794 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
289 struct pagevec pvec_putback; 289 struct pagevec pvec_putback;
290 int pgrescued = 0; 290 int pgrescued = 0;
291 291
292 pagevec_init(&pvec_putback, 0); 292 pagevec_init(&pvec_putback);
293 293
294 /* Phase 1: page isolation */ 294 /* Phase 1: page isolation */
295 spin_lock_irq(zone_lru_lock(zone)); 295 spin_lock_irq(zone_lru_lock(zone));
@@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
448 struct pagevec pvec; 448 struct pagevec pvec;
449 struct zone *zone; 449 struct zone *zone;
450 450
451 pagevec_init(&pvec, 0); 451 pagevec_init(&pvec);
452 /* 452 /*
453 * Although FOLL_DUMP is intended for get_dump_page(), 453 * Although FOLL_DUMP is intended for get_dump_page(),
454 * it just so happens that its special treatment of the 454 * it just so happens that its special treatment of the
@@ -670,8 +670,6 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
670 if (!can_do_mlock()) 670 if (!can_do_mlock())
671 return -EPERM; 671 return -EPERM;
672 672
673 lru_add_drain_all(); /* flush pagevec */
674
675 len = PAGE_ALIGN(len + (offset_in_page(start))); 673 len = PAGE_ALIGN(len + (offset_in_page(start)));
676 start &= PAGE_MASK; 674 start &= PAGE_MASK;
677 675
@@ -798,9 +796,6 @@ SYSCALL_DEFINE1(mlockall, int, flags)
798 if (!can_do_mlock()) 796 if (!can_do_mlock())
799 return -EPERM; 797 return -EPERM;
800 798
801 if (flags & MCL_CURRENT)
802 lru_add_drain_all(); /* flush pagevec */
803
804 lock_limit = rlimit(RLIMIT_MEMLOCK); 799 lock_limit = rlimit(RLIMIT_MEMLOCK);
805 lock_limit >>= PAGE_SHIFT; 800 lock_limit >>= PAGE_SHIFT;
806 801
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 314285284e6e..96edb33fd09a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
190EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); 190EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
191 191
192void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 192void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
193 unsigned long start, unsigned long end) 193 unsigned long start,
194 unsigned long end,
195 bool only_end)
194{ 196{
195 struct mmu_notifier *mn; 197 struct mmu_notifier *mn;
196 int id; 198 int id;
@@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
204 * subsystem registers either invalidate_range_start()/end() or 206 * subsystem registers either invalidate_range_start()/end() or
205 * invalidate_range(), so this will be no additional overhead 207 * invalidate_range(), so this will be no additional overhead
206 * (besides the pointer check). 208 * (besides the pointer check).
209 *
 210	 * We skip the call to invalidate_range() if we know it is safe, i.e.
 211	 * the call site used mmu_notifier_invalidate_range_only_end(), which
 212	 * is only correct when a call to invalidate_range() has already
 213	 * happened under the page table lock.
207 */ 214 */
208 if (mn->ops->invalidate_range) 215 if (!only_end && mn->ops->invalidate_range)
209 mn->ops->invalidate_range(mn, mm, start, end); 216 mn->ops->invalidate_range(mn, mm, start, end);
210 if (mn->ops->invalidate_range_end) 217 if (mn->ops->invalidate_range_end)
211 mn->ops->invalidate_range_end(mn, mm, start, end); 218 mn->ops->invalidate_range_end(mn, mm, start, end);
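Caller-side sketch of when the new variant applies (illustrative; the only identifiers assumed here are the helpers introduced or already used by this series): if the secondary-TLB flush already ran under the page table lock via a *_notify() helper, finish the range with the _only_end() form so ->invalidate_range() is not invoked a second time; other sites keep the plain _end().

	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... */
	ptep_clear_flush_notify(vma, addr, ptep);	/* calls ->invalidate_range() */
	/* ... */
	mmu_notifier_invalidate_range_only_end(mm, start, end);	/* no second call */

	/* sites that did not flush under the ptl keep the original form: */
	mmu_notifier_invalidate_range_end(mm, start, end);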
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..c86fbd1b590e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
44 44
45#include <asm/tlb.h> 45#include <asm/tlb.h>
46#include "internal.h" 46#include "internal.h"
47#include "slab.h"
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
49#include <trace/events/oom.h> 50#include <trace/events/oom.h>
@@ -161,6 +162,25 @@ static bool oom_unkillable_task(struct task_struct *p,
161 return false; 162 return false;
162} 163}
163 164
165/*
 166 * Print out unreclaimable slab info when the amount of unreclaimable slab
 167 * memory is greater than all user memory (LRU pages)
168 */
169static bool is_dump_unreclaim_slabs(void)
170{
171 unsigned long nr_lru;
172
173 nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
174 global_node_page_state(NR_INACTIVE_ANON) +
175 global_node_page_state(NR_ACTIVE_FILE) +
176 global_node_page_state(NR_INACTIVE_FILE) +
177 global_node_page_state(NR_ISOLATED_ANON) +
178 global_node_page_state(NR_ISOLATED_FILE) +
179 global_node_page_state(NR_UNEVICTABLE);
180
181 return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
182}
183
164/** 184/**
165 * oom_badness - heuristic function to determine which candidate task to kill 185 * oom_badness - heuristic function to determine which candidate task to kill
166 * @p: task struct of which task we should calculate 186 * @p: task struct of which task we should calculate
@@ -201,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
201 * task's rss, pagetable and swap space use. 221 * task's rss, pagetable and swap space use.
202 */ 222 */
203 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + 223 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
204 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); 224 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
205 task_unlock(p); 225 task_unlock(p);
206 226
207 /* 227 /*
@@ -369,15 +389,15 @@ static void select_bad_process(struct oom_control *oc)
369 * Dumps the current memory state of all eligible tasks. Tasks not in the same 389 * Dumps the current memory state of all eligible tasks. Tasks not in the same
370 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 390 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
371 * are not shown. 391 * are not shown.
372 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, 392 * State information includes task's pid, uid, tgid, vm size, rss,
373 * swapents, oom_score_adj value, and name. 393 * pgtables_bytes, swapents, oom_score_adj value, and name.
374 */ 394 */
375static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) 395static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
376{ 396{
377 struct task_struct *p; 397 struct task_struct *p;
378 struct task_struct *task; 398 struct task_struct *task;
379 399
380 pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); 400 pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
381 rcu_read_lock(); 401 rcu_read_lock();
382 for_each_process(p) { 402 for_each_process(p) {
383 if (oom_unkillable_task(p, memcg, nodemask)) 403 if (oom_unkillable_task(p, memcg, nodemask))
@@ -393,11 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
393 continue; 413 continue;
394 } 414 }
395 415
396 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", 416 pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
397 task->pid, from_kuid(&init_user_ns, task_uid(task)), 417 task->pid, from_kuid(&init_user_ns, task_uid(task)),
398 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 418 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
399 atomic_long_read(&task->mm->nr_ptes), 419 mm_pgtables_bytes(task->mm),
400 mm_nr_pmds(task->mm),
401 get_mm_counter(task->mm, MM_SWAPENTS), 420 get_mm_counter(task->mm, MM_SWAPENTS),
402 task->signal->oom_score_adj, task->comm); 421 task->signal->oom_score_adj, task->comm);
403 task_unlock(task); 422 task_unlock(task);
@@ -407,23 +426,22 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
407 426
408static void dump_header(struct oom_control *oc, struct task_struct *p) 427static void dump_header(struct oom_control *oc, struct task_struct *p)
409{ 428{
410 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", 429 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
411 current->comm, oc->gfp_mask, &oc->gfp_mask); 430 current->comm, oc->gfp_mask, &oc->gfp_mask,
412 if (oc->nodemask) 431 nodemask_pr_args(oc->nodemask), oc->order,
413 pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); 432 current->signal->oom_score_adj);
414 else
415 pr_cont("(null)");
416 pr_cont(", order=%d, oom_score_adj=%hd\n",
417 oc->order, current->signal->oom_score_adj);
418 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) 433 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
419 pr_warn("COMPACTION is disabled!!!\n"); 434 pr_warn("COMPACTION is disabled!!!\n");
420 435
421 cpuset_print_current_mems_allowed(); 436 cpuset_print_current_mems_allowed();
422 dump_stack(); 437 dump_stack();
423 if (oc->memcg) 438 if (is_memcg_oom(oc))
424 mem_cgroup_print_oom_info(oc->memcg, p); 439 mem_cgroup_print_oom_info(oc->memcg, p);
425 else 440 else {
426 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); 441 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
442 if (is_dump_unreclaim_slabs())
443 dump_unreclaimable_slab();
444 }
427 if (sysctl_oom_dump_tasks) 445 if (sysctl_oom_dump_tasks)
428 dump_tasks(oc->memcg, oc->nodemask); 446 dump_tasks(oc->memcg, oc->nodemask);
429} 447}
@@ -618,9 +636,6 @@ static int oom_reaper(void *unused)
618 636
619static void wake_oom_reaper(struct task_struct *tsk) 637static void wake_oom_reaper(struct task_struct *tsk)
620{ 638{
621 if (!oom_reaper_th)
622 return;
623
624 /* tsk is already queued? */ 639 /* tsk is already queued? */
625 if (tsk == oom_reaper_list || tsk->oom_reaper_list) 640 if (tsk == oom_reaper_list || tsk->oom_reaper_list)
626 return; 641 return;
@@ -638,11 +653,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
638static int __init oom_init(void) 653static int __init oom_init(void)
639{ 654{
640 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); 655 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
641 if (IS_ERR(oom_reaper_th)) {
642 pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
643 PTR_ERR(oom_reaper_th));
644 oom_reaper_th = NULL;
645 }
646 return 0; 656 return 0;
647} 657}
648subsys_initcall(oom_init) 658subsys_initcall(oom_init)
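mm_pgtables_bytes() replaces the separate nr_ptes/nr_pmds counters with a single byte count, and report sites convert it as needed. A small sketch of the conversions (the KB form is an assumption for a /proc-style report and is not taken from this hunk):

	/* badness heuristic: page-table footprint in pages */
	points += mm_pgtables_bytes(mm) / PAGE_SIZE;

	/* human-readable report: page-table footprint in KB */
	pr_info("pgtables: %lu kB\n", mm_pgtables_bytes(mm) >> 10);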
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c518c845f202..8a1551154285 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -433,8 +433,11 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
433 else 433 else
434 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; 434 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
435 435
436 if (bg_thresh >= thresh) 436 if (unlikely(bg_thresh >= thresh)) {
437 pr_warn("vm direct limit must be set greater than background limit.\n");
437 bg_thresh = thresh / 2; 438 bg_thresh = thresh / 2;
439 }
440
438 tsk = current; 441 tsk = current;
439 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 442 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
440 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; 443 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
@@ -625,9 +628,9 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
625 * On idle system, we can be called long after we scheduled because we use 628 * On idle system, we can be called long after we scheduled because we use
626 * deferred timers so count with missed periods. 629 * deferred timers so count with missed periods.
627 */ 630 */
628static void writeout_period(unsigned long t) 631static void writeout_period(struct timer_list *t)
629{ 632{
630 struct wb_domain *dom = (void *)t; 633 struct wb_domain *dom = from_timer(dom, t, period_timer);
631 int miss_periods = (jiffies - dom->period_time) / 634 int miss_periods = (jiffies - dom->period_time) /
632 VM_COMPLETIONS_PERIOD_LEN; 635 VM_COMPLETIONS_PERIOD_LEN;
633 636
@@ -650,8 +653,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
650 653
651 spin_lock_init(&dom->lock); 654 spin_lock_init(&dom->lock);
652 655
653 setup_deferrable_timer(&dom->period_timer, writeout_period, 656 timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
654 (unsigned long)dom);
655 657
656 dom->dirty_limit_tstamp = jiffies; 658 dom->dirty_limit_tstamp = jiffies;
657 659
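The timer hunk above is one instance of the tree-wide timer API conversion; the shape of the change, shown as a before/after sketch drawn from this hunk:

	/* before: the callback received an opaque unsigned long cookie */
	static void writeout_period(unsigned long t)
	{
		struct wb_domain *dom = (void *)t;
		/* ... */
	}
	setup_deferrable_timer(&dom->period_timer, writeout_period,
			       (unsigned long)dom);

	/* after: the callback gets the timer and recovers its container */
	static void writeout_period(struct timer_list *t)
	{
		struct wb_domain *dom = from_timer(dom, t, period_timer);
		/* ... */
	}
	timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);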
@@ -1543,7 +1545,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
1543 * actually dirty; with m+n sitting in the percpu 1545 * actually dirty; with m+n sitting in the percpu
1544 * deltas. 1546 * deltas.
1545 */ 1547 */
1546 if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { 1548 if (dtc->wb_thresh < 2 * wb_stat_error()) {
1547 wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); 1549 wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
1548 dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); 1550 dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
1549 } else { 1551 } else {
@@ -1559,8 +1561,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
1559 * If we're over `background_thresh' then the writeback threads are woken to 1561 * If we're over `background_thresh' then the writeback threads are woken to
1560 * perform some writeout. 1562 * perform some writeout.
1561 */ 1563 */
1562static void balance_dirty_pages(struct address_space *mapping, 1564static void balance_dirty_pages(struct bdi_writeback *wb,
1563 struct bdi_writeback *wb,
1564 unsigned long pages_dirtied) 1565 unsigned long pages_dirtied)
1565{ 1566{
1566 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; 1567 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
@@ -1802,7 +1803,7 @@ pause:
1802 * more page. However wb_dirty has accounting errors. So use 1803 * more page. However wb_dirty has accounting errors. So use
1803 * the larger and more IO friendly wb_stat_error. 1804 * the larger and more IO friendly wb_stat_error.
1804 */ 1805 */
1805 if (sdtc->wb_dirty <= wb_stat_error(wb)) 1806 if (sdtc->wb_dirty <= wb_stat_error())
1806 break; 1807 break;
1807 1808
1808 if (fatal_signal_pending(current)) 1809 if (fatal_signal_pending(current))
@@ -1910,7 +1911,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
1910 preempt_enable(); 1911 preempt_enable();
1911 1912
1912 if (unlikely(current->nr_dirtied >= ratelimit)) 1913 if (unlikely(current->nr_dirtied >= ratelimit))
1913 balance_dirty_pages(mapping, wb, current->nr_dirtied); 1914 balance_dirty_pages(wb, current->nr_dirtied);
1914 1915
1915 wb_put(wb); 1916 wb_put(wb);
1916} 1917}
@@ -2167,7 +2168,7 @@ int write_cache_pages(struct address_space *mapping,
2167 int range_whole = 0; 2168 int range_whole = 0;
2168 int tag; 2169 int tag;
2169 2170
2170 pagevec_init(&pvec, 0); 2171 pagevec_init(&pvec);
2171 if (wbc->range_cyclic) { 2172 if (wbc->range_cyclic) {
2172 writeback_index = mapping->writeback_index; /* prev offset */ 2173 writeback_index = mapping->writeback_index; /* prev offset */
2173 index = writeback_index; 2174 index = writeback_index;
@@ -2194,30 +2195,14 @@ retry:
2194 while (!done && (index <= end)) { 2195 while (!done && (index <= end)) {
2195 int i; 2196 int i;
2196 2197
2197 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2198 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
2198 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2199 tag);
2199 if (nr_pages == 0) 2200 if (nr_pages == 0)
2200 break; 2201 break;
2201 2202
2202 for (i = 0; i < nr_pages; i++) { 2203 for (i = 0; i < nr_pages; i++) {
2203 struct page *page = pvec.pages[i]; 2204 struct page *page = pvec.pages[i];
2204 2205
2205 /*
2206 * At this point, the page may be truncated or
2207 * invalidated (changing page->mapping to NULL), or
2208 * even swizzled back from swapper_space to tmpfs file
2209 * mapping. However, page->index will not change
2210 * because we have a reference on the page.
2211 */
2212 if (page->index > end) {
2213 /*
2214 * can't be range_cyclic (1st pass) because
2215 * end == -1 in that case.
2216 */
2217 done = 1;
2218 break;
2219 }
2220
2221 done_index = page->index; 2206 done_index = page->index;
2222 2207
2223 lock_page(page); 2208 lock_page(page);
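pagevec_lookup_range_tag() takes the end offset itself, so both the manual PAGEVEC_SIZE clamping and the page->index > end check disappear. A condensed sketch of the resulting loop (declarations and per-page work are placeholders, not the write_cache_pages() body):

	struct pagevec pvec;
	unsigned int i, nr_pages;
	pgoff_t index = start;

	pagevec_init(&pvec);			/* the "cold" argument is gone */
	while (index <= end) {
		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
						    tag);
		if (!nr_pages)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];
			/* never returns pages past @end, so no index check */
			/* ... process page ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}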
@@ -2623,7 +2608,7 @@ EXPORT_SYMBOL(set_page_dirty_lock);
2623 * page without actually doing it through the VM. Can you say "ext3 is 2608 * page without actually doing it through the VM. Can you say "ext3 is
2624 * horribly ugly"? Thought you could. 2609 * horribly ugly"? Thought you could.
2625 */ 2610 */
2626void cancel_dirty_page(struct page *page) 2611void __cancel_dirty_page(struct page *page)
2627{ 2612{
2628 struct address_space *mapping = page_mapping(page); 2613 struct address_space *mapping = page_mapping(page);
2629 2614
@@ -2644,7 +2629,7 @@ void cancel_dirty_page(struct page *page)
2644 ClearPageDirty(page); 2629 ClearPageDirty(page);
2645 } 2630 }
2646} 2631}
2647EXPORT_SYMBOL(cancel_dirty_page); 2632EXPORT_SYMBOL(__cancel_dirty_page);
2648 2633
2649/* 2634/*
2650 * Clear a page's dirty flag, while caring for dirty memory accounting. 2635 * Clear a page's dirty flag, while caring for dirty memory accounting.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c5c57b..55ded92f9809 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
24#include <linux/memblock.h> 24#include <linux/memblock.h>
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h>
28#include <linux/kasan.h> 27#include <linux/kasan.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/suspend.h> 29#include <linux/suspend.h>
@@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
83EXPORT_PER_CPU_SYMBOL(numa_node); 82EXPORT_PER_CPU_SYMBOL(numa_node);
84#endif 83#endif
85 84
85DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
86
86#ifdef CONFIG_HAVE_MEMORYLESS_NODES 87#ifdef CONFIG_HAVE_MEMORYLESS_NODES
87/* 88/*
88 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 89 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
290int page_group_by_mobility_disabled __read_mostly; 291int page_group_by_mobility_disabled __read_mostly;
291 292
292#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 293#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
294
295/*
 296 * Determine how many pages need to be initialized during early boot
 297 * (non-deferred initialization).
 298 * The value of first_deferred_pfn will be set later, once non-deferred pages
 299 * are initialized, but for now set it to ULONG_MAX.
300 */
293static inline void reset_deferred_meminit(pg_data_t *pgdat) 301static inline void reset_deferred_meminit(pg_data_t *pgdat)
294{ 302{
295 unsigned long max_initialise; 303 phys_addr_t start_addr, end_addr;
296 unsigned long reserved_lowmem; 304 unsigned long max_pgcnt;
305 unsigned long reserved;
297 306
298 /* 307 /*
299 * Initialise at least 2G of a node but also take into account that 308 * Initialise at least 2G of a node but also take into account that
300 * two large system hashes that can take up 1GB for 0.25TB/node. 309 * two large system hashes that can take up 1GB for 0.25TB/node.
301 */ 310 */
302 max_initialise = max(2UL << (30 - PAGE_SHIFT), 311 max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
303 (pgdat->node_spanned_pages >> 8)); 312 (pgdat->node_spanned_pages >> 8));
304 313
305 /* 314 /*
 306	 * Compensate for all the memblock reservations (e.g. crash kernel)	 315	 * Compensate for all the memblock reservations (e.g. crash kernel)
 307	 * from the initial estimation to make sure we will initialize enough	 316	 * from the initial estimation to make sure we will initialize enough
 308	 * memory to boot.	 317	 * memory to boot.
 309	 */	 318	 */
310 reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, 319 start_addr = PFN_PHYS(pgdat->node_start_pfn);
311 pgdat->node_start_pfn + max_initialise); 320 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
312 max_initialise += reserved_lowmem; 321 reserved = memblock_reserved_memory_within(start_addr, end_addr);
322 max_pgcnt += PHYS_PFN(reserved);
313 323
314 pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); 324 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
315 pgdat->first_deferred_pfn = ULONG_MAX; 325 pgdat->first_deferred_pfn = ULONG_MAX;
316} 326}
317 327
@@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
338 if (zone_end < pgdat_end_pfn(pgdat)) 348 if (zone_end < pgdat_end_pfn(pgdat))
339 return true; 349 return true;
340 (*nr_initialised)++; 350 (*nr_initialised)++;
341 if ((*nr_initialised > pgdat->static_init_size) && 351 if ((*nr_initialised > pgdat->static_init_pgcnt) &&
342 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 352 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
343 pgdat->first_deferred_pfn = pfn; 353 pgdat->first_deferred_pfn = pfn;
344 return false; 354 return false;
@@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
1013 VM_BUG_ON_PAGE(PageTail(page), page); 1023 VM_BUG_ON_PAGE(PageTail(page), page);
1014 1024
1015 trace_mm_page_free(page, order); 1025 trace_mm_page_free(page, order);
1016 kmemcheck_free_shadow(page, order);
1017 1026
1018 /* 1027 /*
1019 * Check tail pages before head page information is cleared to 1028 * Check tail pages before head page information is cleared to
@@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone,
1170static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1179static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1171 unsigned long zone, int nid) 1180 unsigned long zone, int nid)
1172{ 1181{
1182 mm_zero_struct_page(page);
1173 set_page_links(page, zone, nid, pfn); 1183 set_page_links(page, zone, nid, pfn);
1174 init_page_count(page); 1184 init_page_count(page);
1175 page_mapcount_reset(page); 1185 page_mapcount_reset(page);
@@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)
1410} 1420}
1411 1421
1412#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1422#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1413static void __init deferred_free_range(struct page *page, 1423static void __init deferred_free_range(unsigned long pfn,
1414 unsigned long pfn, int nr_pages) 1424 unsigned long nr_pages)
1415{ 1425{
1416 int i; 1426 struct page *page;
1427 unsigned long i;
1417 1428
1418 if (!page) 1429 if (!nr_pages)
1419 return; 1430 return;
1420 1431
1432 page = pfn_to_page(pfn);
1433
1421 /* Free a large naturally-aligned chunk if possible */ 1434 /* Free a large naturally-aligned chunk if possible */
1422 if (nr_pages == pageblock_nr_pages && 1435 if (nr_pages == pageblock_nr_pages &&
1423 (pfn & (pageblock_nr_pages - 1)) == 0) { 1436 (pfn & (pageblock_nr_pages - 1)) == 0) {
@@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)
1443 complete(&pgdat_init_all_done_comp); 1456 complete(&pgdat_init_all_done_comp);
1444} 1457}
1445 1458
1459/*
 1460 * Helper for deferred_init_range(): free the given range, reset the counters,
 1461 * and return the number of pages freed.
1462 */
1463static inline unsigned long __init __def_free(unsigned long *nr_free,
1464 unsigned long *free_base_pfn,
1465 struct page **page)
1466{
1467 unsigned long nr = *nr_free;
1468
1469 deferred_free_range(*free_base_pfn, nr);
1470 *free_base_pfn = 0;
1471 *nr_free = 0;
1472 *page = NULL;
1473
1474 return nr;
1475}
1476
1477static unsigned long __init deferred_init_range(int nid, int zid,
1478 unsigned long start_pfn,
1479 unsigned long end_pfn)
1480{
1481 struct mminit_pfnnid_cache nid_init_state = { };
1482 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1483 unsigned long free_base_pfn = 0;
1484 unsigned long nr_pages = 0;
1485 unsigned long nr_free = 0;
1486 struct page *page = NULL;
1487 unsigned long pfn;
1488
1489 /*
1490 * First we check if pfn is valid on architectures where it is possible
1491 * to have holes within pageblock_nr_pages. On systems where it is not
1492 * possible, this function is optimized out.
1493 *
1494 * Then, we check if a current large page is valid by only checking the
1495 * validity of the head pfn.
1496 *
1497 * meminit_pfn_in_nid is checked on systems where pfns can interleave
1498 * within a node: a pfn is between start and end of a node, but does not
1499 * belong to this memory node.
1500 *
1501 * Finally, we minimize pfn page lookups and scheduler checks by
1502 * performing it only once every pageblock_nr_pages.
1503 *
 1504 * We do it in two loops: first we initialize struct page, then free to
 1505 * buddy allocator, because while we are freeing pages we can access
1506 * pages that are ahead (computing buddy page in __free_one_page()).
1507 */
1508 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1509 if (!pfn_valid_within(pfn))
1510 continue;
1511 if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
1512 if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1513 if (page && (pfn & nr_pgmask))
1514 page++;
1515 else
1516 page = pfn_to_page(pfn);
1517 __init_single_page(page, pfn, zid, nid);
1518 cond_resched();
1519 }
1520 }
1521 }
1522
1523 page = NULL;
1524 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1525 if (!pfn_valid_within(pfn)) {
1526 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1527 } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
1528 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1529 } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1530 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1531 } else if (page && (pfn & nr_pgmask)) {
1532 page++;
1533 nr_free++;
1534 } else {
1535 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1536 page = pfn_to_page(pfn);
1537 free_base_pfn = pfn;
1538 nr_free = 1;
1539 cond_resched();
1540 }
1541 }
1542 /* Free the last block of pages to allocator */
1543 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1544
1545 return nr_pages;
1546}
1547
1446/* Initialise remaining memory on a node */ 1548/* Initialise remaining memory on a node */
1447static int __init deferred_init_memmap(void *data) 1549static int __init deferred_init_memmap(void *data)
1448{ 1550{
1449 pg_data_t *pgdat = data; 1551 pg_data_t *pgdat = data;
1450 int nid = pgdat->node_id; 1552 int nid = pgdat->node_id;
1451 struct mminit_pfnnid_cache nid_init_state = { };
1452 unsigned long start = jiffies; 1553 unsigned long start = jiffies;
1453 unsigned long nr_pages = 0; 1554 unsigned long nr_pages = 0;
1454 unsigned long walk_start, walk_end; 1555 unsigned long spfn, epfn;
1455 int i, zid; 1556 phys_addr_t spa, epa;
1557 int zid;
1456 struct zone *zone; 1558 struct zone *zone;
1457 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1559 unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1458 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1560 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1561 u64 i;
1459 1562
1460 if (first_init_pfn == ULONG_MAX) { 1563 if (first_init_pfn == ULONG_MAX) {
1461 pgdat_init_report_one_done(); 1564 pgdat_init_report_one_done();
@@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)
1477 if (first_init_pfn < zone_end_pfn(zone)) 1580 if (first_init_pfn < zone_end_pfn(zone))
1478 break; 1581 break;
1479 } 1582 }
1583 first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
1480 1584
1481 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1585 for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1482 unsigned long pfn, end_pfn; 1586 spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1483 struct page *page = NULL; 1587 epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1484 struct page *free_base_page = NULL; 1588 nr_pages += deferred_init_range(nid, zid, spfn, epfn);
1485 unsigned long free_base_pfn = 0;
1486 int nr_to_free = 0;
1487
1488 end_pfn = min(walk_end, zone_end_pfn(zone));
1489 pfn = first_init_pfn;
1490 if (pfn < walk_start)
1491 pfn = walk_start;
1492 if (pfn < zone->zone_start_pfn)
1493 pfn = zone->zone_start_pfn;
1494
1495 for (; pfn < end_pfn; pfn++) {
1496 if (!pfn_valid_within(pfn))
1497 goto free_range;
1498
1499 /*
1500 * Ensure pfn_valid is checked every
1501 * pageblock_nr_pages for memory holes
1502 */
1503 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1504 if (!pfn_valid(pfn)) {
1505 page = NULL;
1506 goto free_range;
1507 }
1508 }
1509
1510 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1511 page = NULL;
1512 goto free_range;
1513 }
1514
1515 /* Minimise pfn page lookups and scheduler checks */
1516 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1517 page++;
1518 } else {
1519 nr_pages += nr_to_free;
1520 deferred_free_range(free_base_page,
1521 free_base_pfn, nr_to_free);
1522 free_base_page = NULL;
1523 free_base_pfn = nr_to_free = 0;
1524
1525 page = pfn_to_page(pfn);
1526 cond_resched();
1527 }
1528
1529 if (page->flags) {
1530 VM_BUG_ON(page_zone(page) != zone);
1531 goto free_range;
1532 }
1533
1534 __init_single_page(page, pfn, zid, nid);
1535 if (!free_base_page) {
1536 free_base_page = page;
1537 free_base_pfn = pfn;
1538 nr_to_free = 0;
1539 }
1540 nr_to_free++;
1541
1542 /* Where possible, batch up pages for a single free */
1543 continue;
1544free_range:
1545 /* Free the current block of pages to allocator */
1546 nr_pages += nr_to_free;
1547 deferred_free_range(free_base_page, free_base_pfn,
1548 nr_to_free);
1549 free_base_page = NULL;
1550 free_base_pfn = nr_to_free = 0;
1551 }
1552 /* Free the last block of pages to allocator */
1553 nr_pages += nr_to_free;
1554 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1555
1556 first_init_pfn = max(end_pfn, first_init_pfn);
1557 } 1589 }
1558 1590
1559 /* Sanity check that the next zone really is unpopulated */ 1591 /* Sanity check that the next zone really is unpopulated */
@@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
1792 * Go through the free lists for the given migratetype and remove 1824 * Go through the free lists for the given migratetype and remove
1793 * the smallest available page from the freelists 1825 * the smallest available page from the freelists
1794 */ 1826 */
1795static inline 1827static __always_inline
1796struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1828struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1797 int migratetype) 1829 int migratetype)
1798{ 1830{
@@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
1836}; 1868};
1837 1869
1838#ifdef CONFIG_CMA 1870#ifdef CONFIG_CMA
1839static struct page *__rmqueue_cma_fallback(struct zone *zone, 1871static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1840 unsigned int order) 1872 unsigned int order)
1841{ 1873{
1842 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1874 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
@@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2217 * deviation from the rest of this file, to make the for loop 2249 * deviation from the rest of this file, to make the for loop
2218 * condition simpler. 2250 * condition simpler.
2219 */ 2251 */
2220static inline bool 2252static __always_inline bool
2221__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2253__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2222{ 2254{
2223 struct free_area *area; 2255 struct free_area *area;
@@ -2289,8 +2321,8 @@ do_steal:
2289 * Do the hard work of removing an element from the buddy allocator. 2321 * Do the hard work of removing an element from the buddy allocator.
2290 * Call me with the zone->lock already held. 2322 * Call me with the zone->lock already held.
2291 */ 2323 */
2292static struct page *__rmqueue(struct zone *zone, unsigned int order, 2324static __always_inline struct page *
2293 int migratetype) 2325__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2294{ 2326{
2295 struct page *page; 2327 struct page *page;
2296 2328
@@ -2315,7 +2347,7 @@ retry:
2315 */ 2347 */
2316static int rmqueue_bulk(struct zone *zone, unsigned int order, 2348static int rmqueue_bulk(struct zone *zone, unsigned int order,
2317 unsigned long count, struct list_head *list, 2349 unsigned long count, struct list_head *list,
2318 int migratetype, bool cold) 2350 int migratetype)
2319{ 2351{
2320 int i, alloced = 0; 2352 int i, alloced = 0;
2321 2353
@@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2329 continue; 2361 continue;
2330 2362
2331 /* 2363 /*
2332 * Split buddy pages returned by expand() are received here 2364 * Split buddy pages returned by expand() are received here in
2333 * in physical page order. The page is added to the callers and 2365 * physical page order. The page is added to the tail of
2334 * list and the list head then moves forward. From the callers 2366 * caller's list. From the callers perspective, the linked list
2335 * perspective, the linked list is ordered by page number in 2367 * is ordered by page number under some conditions. This is
2336 * some conditions. This is useful for IO devices that can 2368 * useful for IO devices that can forward direction from the
2337 * merge IO requests if the physical pages are ordered 2369 * head, thus also in the physical page order. This is useful
2338 * properly. 2370 * for IO devices that can merge IO requests if the physical
2371 * pages are ordered properly.
2339 */ 2372 */
2340 if (likely(!cold)) 2373 list_add_tail(&page->lru, list);
2341 list_add(&page->lru, list);
2342 else
2343 list_add_tail(&page->lru, list);
2344 list = &page->lru;
2345 alloced++; 2374 alloced++;
2346 if (is_migrate_cma(get_pcppage_migratetype(page))) 2375 if (is_migrate_cma(get_pcppage_migratetype(page)))
2347 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2376 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -2590,24 +2619,25 @@ void mark_free_pages(struct zone *zone)
2590} 2619}
2591#endif /* CONFIG_PM */ 2620#endif /* CONFIG_PM */
2592 2621
2593/* 2622static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
2594 * Free a 0-order page
2595 * cold == true ? free a cold page : free a hot page
2596 */
2597void free_hot_cold_page(struct page *page, bool cold)
2598{ 2623{
2599 struct zone *zone = page_zone(page);
2600 struct per_cpu_pages *pcp;
2601 unsigned long flags;
2602 unsigned long pfn = page_to_pfn(page);
2603 int migratetype; 2624 int migratetype;
2604 2625
2605 if (!free_pcp_prepare(page)) 2626 if (!free_pcp_prepare(page))
2606 return; 2627 return false;
2607 2628
2608 migratetype = get_pfnblock_migratetype(page, pfn); 2629 migratetype = get_pfnblock_migratetype(page, pfn);
2609 set_pcppage_migratetype(page, migratetype); 2630 set_pcppage_migratetype(page, migratetype);
2610 local_irq_save(flags); 2631 return true;
2632}
2633
2634static void free_unref_page_commit(struct page *page, unsigned long pfn)
2635{
2636 struct zone *zone = page_zone(page);
2637 struct per_cpu_pages *pcp;
2638 int migratetype;
2639
2640 migratetype = get_pcppage_migratetype(page);
2611 __count_vm_event(PGFREE); 2641 __count_vm_event(PGFREE);
2612 2642
2613 /* 2643 /*
@@ -2620,38 +2650,62 @@ void free_hot_cold_page(struct page *page, bool cold)
2620 if (migratetype >= MIGRATE_PCPTYPES) { 2650 if (migratetype >= MIGRATE_PCPTYPES) {
2621 if (unlikely(is_migrate_isolate(migratetype))) { 2651 if (unlikely(is_migrate_isolate(migratetype))) {
2622 free_one_page(zone, page, pfn, 0, migratetype); 2652 free_one_page(zone, page, pfn, 0, migratetype);
2623 goto out; 2653 return;
2624 } 2654 }
2625 migratetype = MIGRATE_MOVABLE; 2655 migratetype = MIGRATE_MOVABLE;
2626 } 2656 }
2627 2657
2628 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2658 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2629 if (!cold) 2659 list_add(&page->lru, &pcp->lists[migratetype]);
2630 list_add(&page->lru, &pcp->lists[migratetype]);
2631 else
2632 list_add_tail(&page->lru, &pcp->lists[migratetype]);
2633 pcp->count++; 2660 pcp->count++;
2634 if (pcp->count >= pcp->high) { 2661 if (pcp->count >= pcp->high) {
2635 unsigned long batch = READ_ONCE(pcp->batch); 2662 unsigned long batch = READ_ONCE(pcp->batch);
2636 free_pcppages_bulk(zone, batch, pcp); 2663 free_pcppages_bulk(zone, batch, pcp);
2637 pcp->count -= batch; 2664 pcp->count -= batch;
2638 } 2665 }
2666}
2639 2667
2640out: 2668/*
2669 * Free a 0-order page
2670 */
2671void free_unref_page(struct page *page)
2672{
2673 unsigned long flags;
2674 unsigned long pfn = page_to_pfn(page);
2675
2676 if (!free_unref_page_prepare(page, pfn))
2677 return;
2678
2679 local_irq_save(flags);
2680 free_unref_page_commit(page, pfn);
2641 local_irq_restore(flags); 2681 local_irq_restore(flags);
2642} 2682}
2643 2683
2644/* 2684/*
2645 * Free a list of 0-order pages 2685 * Free a list of 0-order pages
2646 */ 2686 */
2647void free_hot_cold_page_list(struct list_head *list, bool cold) 2687void free_unref_page_list(struct list_head *list)
2648{ 2688{
2649 struct page *page, *next; 2689 struct page *page, *next;
2690 unsigned long flags, pfn;
2691
2692 /* Prepare pages for freeing */
2693 list_for_each_entry_safe(page, next, list, lru) {
2694 pfn = page_to_pfn(page);
2695 if (!free_unref_page_prepare(page, pfn))
2696 list_del(&page->lru);
2697 set_page_private(page, pfn);
2698 }
2650 2699
2700 local_irq_save(flags);
2651 list_for_each_entry_safe(page, next, list, lru) { 2701 list_for_each_entry_safe(page, next, list, lru) {
2652 trace_mm_page_free_batched(page, cold); 2702 unsigned long pfn = page_private(page);
2653 free_hot_cold_page(page, cold); 2703
2704 set_page_private(page, 0);
2705 trace_mm_page_free_batched(page);
2706 free_unref_page_commit(page, pfn);
2654 } 2707 }
2708 local_irq_restore(flags);
2655} 2709}
2656 2710
2657/* 2711/*
@@ -2669,15 +2723,6 @@ void split_page(struct page *page, unsigned int order)
2669 VM_BUG_ON_PAGE(PageCompound(page), page); 2723 VM_BUG_ON_PAGE(PageCompound(page), page);
2670 VM_BUG_ON_PAGE(!page_count(page), page); 2724 VM_BUG_ON_PAGE(!page_count(page), page);
2671 2725
2672#ifdef CONFIG_KMEMCHECK
2673 /*
2674 * Split shadow pages too, because free(page[0]) would
2675 * otherwise free the whole shadow.
2676 */
2677 if (kmemcheck_page_is_tracked(page))
2678 split_page(virt_to_page(page[0].shadow), order);
2679#endif
2680
2681 for (i = 1; i < (1 << order); i++) 2726 for (i = 1; i < (1 << order); i++)
2682 set_page_refcounted(page + i); 2727 set_page_refcounted(page + i);
2683 split_page_owner(page, order); 2728 split_page_owner(page, order);
@@ -2743,6 +2788,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2743#ifdef CONFIG_NUMA 2788#ifdef CONFIG_NUMA
2744 enum numa_stat_item local_stat = NUMA_LOCAL; 2789 enum numa_stat_item local_stat = NUMA_LOCAL;
2745 2790
2791 /* skip numa counters update if numa stats is disabled */
2792 if (!static_branch_likely(&vm_numa_stat_key))
2793 return;
2794
2746 if (z->node != numa_node_id()) 2795 if (z->node != numa_node_id())
2747 local_stat = NUMA_OTHER; 2796 local_stat = NUMA_OTHER;
2748 2797
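The NUMA counters now sit behind a static branch so that "mm, sysctl: make NUMA stats configurable" can switch them off with essentially no per-allocation cost. A minimal sketch of the pattern (the wrapper name and the sysctl wiring below are hypothetical; only the key, the branch check and __inc_numa_state() come from this diff):

	DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

	static inline void count_numa_event(struct zone *z, enum numa_stat_item item)
	{
		/* patched to a plain jump when stats are disabled */
		if (!static_branch_likely(&vm_numa_stat_key))
			return;
		__inc_numa_state(z, item);
	}

	/* flipped from the sysctl handler */
	static_branch_enable(&vm_numa_stat_key);	/* stats on  */
	static_branch_disable(&vm_numa_stat_key);	/* stats off */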
@@ -2758,7 +2807,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2758 2807
2759/* Remove page from the per-cpu list, caller must protect the list */ 2808/* Remove page from the per-cpu list, caller must protect the list */
2760static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2809static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2761 bool cold, struct per_cpu_pages *pcp, 2810 struct per_cpu_pages *pcp,
2762 struct list_head *list) 2811 struct list_head *list)
2763{ 2812{
2764 struct page *page; 2813 struct page *page;
@@ -2767,16 +2816,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2767 if (list_empty(list)) { 2816 if (list_empty(list)) {
2768 pcp->count += rmqueue_bulk(zone, 0, 2817 pcp->count += rmqueue_bulk(zone, 0,
2769 pcp->batch, list, 2818 pcp->batch, list,
2770 migratetype, cold); 2819 migratetype);
2771 if (unlikely(list_empty(list))) 2820 if (unlikely(list_empty(list)))
2772 return NULL; 2821 return NULL;
2773 } 2822 }
2774 2823
2775 if (cold) 2824 page = list_first_entry(list, struct page, lru);
2776 page = list_last_entry(list, struct page, lru);
2777 else
2778 page = list_first_entry(list, struct page, lru);
2779
2780 list_del(&page->lru); 2825 list_del(&page->lru);
2781 pcp->count--; 2826 pcp->count--;
2782 } while (check_new_pcp(page)); 2827 } while (check_new_pcp(page));
@@ -2791,14 +2836,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2791{ 2836{
2792 struct per_cpu_pages *pcp; 2837 struct per_cpu_pages *pcp;
2793 struct list_head *list; 2838 struct list_head *list;
2794 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2795 struct page *page; 2839 struct page *page;
2796 unsigned long flags; 2840 unsigned long flags;
2797 2841
2798 local_irq_save(flags); 2842 local_irq_save(flags);
2799 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2843 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2800 list = &pcp->lists[migratetype]; 2844 list = &pcp->lists[migratetype];
2801 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2845 page = __rmqueue_pcplist(zone, migratetype, pcp, list);
2802 if (page) { 2846 if (page) {
2803 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2847 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2804 zone_statistics(preferred_zone, zone); 2848 zone_statistics(preferred_zone, zone);
@@ -3006,9 +3050,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3006 if (!area->nr_free) 3050 if (!area->nr_free)
3007 continue; 3051 continue;
3008 3052
3009 if (alloc_harder)
3010 return true;
3011
3012 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3053 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3013 if (!list_empty(&area->free_list[mt])) 3054 if (!list_empty(&area->free_list[mt]))
3014 return true; 3055 return true;
@@ -3020,6 +3061,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3020 return true; 3061 return true;
3021 } 3062 }
3022#endif 3063#endif
3064 if (alloc_harder &&
3065 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3066 return true;
3023 } 3067 }
3024 return false; 3068 return false;
3025} 3069}
@@ -3235,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3235 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3279 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3236 return; 3280 return;
3237 3281
3238 pr_warn("%s: ", current->comm);
3239
3240 va_start(args, fmt); 3282 va_start(args, fmt);
3241 vaf.fmt = fmt; 3283 vaf.fmt = fmt;
3242 vaf.va = &args; 3284 vaf.va = &args;
3243 pr_cont("%pV", &vaf); 3285 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
3286 current->comm, &vaf, gfp_mask, &gfp_mask,
3287 nodemask_pr_args(nodemask));
3244 va_end(args); 3288 va_end(args);
3245 3289
3246 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3247 if (nodemask)
3248 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3249 else
3250 pr_cont("(null)\n");
3251
3252 cpuset_print_current_mems_allowed(); 3290 cpuset_print_current_mems_allowed();
3253 3291
3254 dump_stack(); 3292 dump_stack();
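The single pr_warn() above relies on nodemask_pr_args() coping with a NULL mask (part of "mm: simplify nodemask printing"), so callers no longer special-case the empty-nodemask path. Sketch of the caller-side idiom (format string chosen for illustration):

	/* works for both a populated mask and nodemask == NULL */
	pr_warn("%s: order:%u, nodemask=%*pbl\n",
		current->comm, order, nodemask_pr_args(nodemask));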
@@ -3868,8 +3906,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3868 enum compact_result compact_result; 3906 enum compact_result compact_result;
3869 int compaction_retries; 3907 int compaction_retries;
3870 int no_progress_loops; 3908 int no_progress_loops;
3871 unsigned long alloc_start = jiffies;
3872 unsigned int stall_timeout = 10 * HZ;
3873 unsigned int cpuset_mems_cookie; 3909 unsigned int cpuset_mems_cookie;
3874 int reserve_flags; 3910 int reserve_flags;
3875 3911
@@ -4001,14 +4037,6 @@ retry:
4001 if (!can_direct_reclaim) 4037 if (!can_direct_reclaim)
4002 goto nopage; 4038 goto nopage;
4003 4039
4004 /* Make sure we know about allocations which stall for too long */
4005 if (time_after(jiffies, alloc_start + stall_timeout)) {
4006 warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
4007 "page allocation stalls for %ums, order:%u",
4008 jiffies_to_msecs(jiffies-alloc_start), order);
4009 stall_timeout += 10 * HZ;
4010 }
4011
4012 /* Avoid recursion of direct reclaim */ 4040 /* Avoid recursion of direct reclaim */
4013 if (current->flags & PF_MEMALLOC) 4041 if (current->flags & PF_MEMALLOC)
4014 goto nopage; 4042 goto nopage;
@@ -4223,9 +4251,6 @@ out:
4223 page = NULL; 4251 page = NULL;
4224 } 4252 }
4225 4253
4226 if (kmemcheck_enabled && page)
4227 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
4228
4229 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4254 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4230 4255
4231 return page; 4256 return page;
@@ -4262,7 +4287,7 @@ void __free_pages(struct page *page, unsigned int order)
4262{ 4287{
4263 if (put_page_testzero(page)) { 4288 if (put_page_testzero(page)) {
4264 if (order == 0) 4289 if (order == 0)
4265 free_hot_cold_page(page, false); 4290 free_unref_page(page);
4266 else 4291 else
4267 __free_pages_ok(page, order); 4292 __free_pages_ok(page, order);
4268 } 4293 }
@@ -4320,7 +4345,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
4320 unsigned int order = compound_order(page); 4345 unsigned int order = compound_order(page);
4321 4346
4322 if (order == 0) 4347 if (order == 0)
4323 free_hot_cold_page(page, false); 4348 free_unref_page(page);
4324 else 4349 else
4325 __free_pages_ok(page, order); 4350 __free_pages_ok(page, order);
4326 } 4351 }
@@ -6126,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
6126 } 6151 }
6127} 6152}
6128 6153
6154#ifdef CONFIG_FLAT_NODE_MEM_MAP
6129static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 6155static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6130{ 6156{
6131 unsigned long __maybe_unused start = 0; 6157 unsigned long __maybe_unused start = 0;
@@ -6135,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6135 if (!pgdat->node_spanned_pages) 6161 if (!pgdat->node_spanned_pages)
6136 return; 6162 return;
6137 6163
6138#ifdef CONFIG_FLAT_NODE_MEM_MAP
6139 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 6164 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6140 offset = pgdat->node_start_pfn - start; 6165 offset = pgdat->node_start_pfn - start;
6141 /* ia64 gets its own node_mem_map, before this, without bootmem */ 6166 /* ia64 gets its own node_mem_map, before this, without bootmem */
@@ -6157,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6157 pgdat->node_id); 6182 pgdat->node_id);
6158 pgdat->node_mem_map = map + offset; 6183 pgdat->node_mem_map = map + offset;
6159 } 6184 }
6185 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
6186 __func__, pgdat->node_id, (unsigned long)pgdat,
6187 (unsigned long)pgdat->node_mem_map);
6160#ifndef CONFIG_NEED_MULTIPLE_NODES 6188#ifndef CONFIG_NEED_MULTIPLE_NODES
6161 /* 6189 /*
6162 * With no DISCONTIG, the global mem_map is just set as node 0's 6190 * With no DISCONTIG, the global mem_map is just set as node 0's
@@ -6169,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6169#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6197#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6170 } 6198 }
6171#endif 6199#endif
6172#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6173} 6200}
6201#else
6202static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6203#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6174 6204
6175void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 6205void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6176 unsigned long node_start_pfn, unsigned long *zholes_size) 6206 unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -6197,16 +6227,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6197 zones_size, zholes_size); 6227 zones_size, zholes_size);
6198 6228
6199 alloc_node_mem_map(pgdat); 6229 alloc_node_mem_map(pgdat);
6200#ifdef CONFIG_FLAT_NODE_MEM_MAP
6201 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
6202 nid, (unsigned long)pgdat,
6203 (unsigned long)pgdat->node_mem_map);
6204#endif
6205 6230
6206 reset_deferred_meminit(pgdat); 6231 reset_deferred_meminit(pgdat);
6207 free_area_init_core(pgdat); 6232 free_area_init_core(pgdat);
6208} 6233}
6209 6234
6235#ifdef CONFIG_HAVE_MEMBLOCK
6236/*
6237 * Only struct pages that are backed by physical memory are zeroed and
6238 * initialized by going through __init_single_page(). But there are some
6239 * struct pages that are reserved in the memblock allocator and whose fields
6240 * may be accessed (for example, page_to_pfn() on some configurations accesses
6241 * page->flags). We must explicitly zero those struct pages.
6242 */
6243void __paginginit zero_resv_unavail(void)
6244{
6245 phys_addr_t start, end;
6246 unsigned long pfn;
6247 u64 i, pgcnt;
6248
6249 /*
6250 * Loop through ranges that are reserved, but do not have reported
6251 * physical memory backing.
6252 */
6253 pgcnt = 0;
6254 for_each_resv_unavail_range(i, &start, &end) {
6255 for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
6256 mm_zero_struct_page(pfn_to_page(pfn));
6257 pgcnt++;
6258 }
6259 }
6260
6261 /*
6262 * Struct pages that do not have backing memory. This could be because
6263 * firmware is using some of this memory, or for some other reason.
6264 * Once memblock is changed so that such behaviour is not allowed, i.e.
6265 * the list of "reserved" memory is a subset of the list of "memory",
6266 * this code can be removed.
6267 */
6268 if (pgcnt)
6269 pr_info("Reserved but unavailable: %lld pages", pgcnt);
6270}
6271#endif /* CONFIG_HAVE_MEMBLOCK */
6272
6210#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6273#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6211 6274
6212#if MAX_NUMNODES > 1 6275#if MAX_NUMNODES > 1
@@ -6630,6 +6693,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6630 node_set_state(nid, N_MEMORY); 6693 node_set_state(nid, N_MEMORY);
6631 check_for_memory(pgdat, nid); 6694 check_for_memory(pgdat, nid);
6632 } 6695 }
6696 zero_resv_unavail();
6633} 6697}
6634 6698
6635static int __init cmdline_parse_core(char *p, unsigned long *core) 6699static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6793,6 +6857,7 @@ void __init free_area_init(unsigned long *zones_size)
6793{ 6857{
6794 free_area_init_node(0, zones_size, 6858 free_area_init_node(0, zones_size,
6795 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 6859 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6860 zero_resv_unavail();
6796} 6861}
6797 6862
6798static int page_alloc_cpu_dead(unsigned int cpu) 6863static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7305,18 +7370,17 @@ void *__init alloc_large_system_hash(const char *tablename,
7305 7370
7306 log2qty = ilog2(numentries); 7371 log2qty = ilog2(numentries);
7307 7372
7308 /*
7309 * memblock allocator returns zeroed memory already, so HASH_ZERO is
7310 * currently not used when HASH_EARLY is specified.
7311 */
7312 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 7373 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
7313 do { 7374 do {
7314 size = bucketsize << log2qty; 7375 size = bucketsize << log2qty;
7315 if (flags & HASH_EARLY) 7376 if (flags & HASH_EARLY) {
7316 table = memblock_virt_alloc_nopanic(size, 0); 7377 if (flags & HASH_ZERO)
7317 else if (hashdist) 7378 table = memblock_virt_alloc_nopanic(size, 0);
7379 else
7380 table = memblock_virt_alloc_raw(size, 0);
7381 } else if (hashdist) {
7318 table = __vmalloc(size, gfp_flags, PAGE_KERNEL); 7382 table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
7319 else { 7383 } else {
7320 /* 7384 /*
7321 * If bucketsize is not a power-of-two, we may free 7385 * If bucketsize is not a power-of-two, we may free
7322 * some pages at the end of hash table which 7386 * some pages at the end of hash table which
@@ -7353,10 +7417,10 @@ void *__init alloc_large_system_hash(const char *tablename,
7353 * race condition. So you can't expect this function should be exact. 7417 * race condition. So you can't expect this function should be exact.
7354 */ 7418 */
7355bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7419bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7420 int migratetype,
7356 bool skip_hwpoisoned_pages) 7421 bool skip_hwpoisoned_pages)
7357{ 7422{
7358 unsigned long pfn, iter, found; 7423 unsigned long pfn, iter, found;
7359 int mt;
7360 7424
7361 /* 7425 /*
7362 * For avoiding noise data, lru_add_drain_all() should be called 7426 * For avoiding noise data, lru_add_drain_all() should be called
@@ -7364,8 +7428,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7364 */ 7428 */
7365 if (zone_idx(zone) == ZONE_MOVABLE) 7429 if (zone_idx(zone) == ZONE_MOVABLE)
7366 return false; 7430 return false;
7367 mt = get_pageblock_migratetype(page); 7431
7368 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 7432 /*
7433 * CMA allocations (alloc_contig_range) really need to mark isolate
7433 * CMA allocations (alloc_contig_range) really need to mark CMA pageblocks
7434 * as isolated even when they are not in fact movable, so consider
7435 * them movable here.
7437 if (is_migrate_cma(migratetype) &&
7438 is_migrate_cma(get_pageblock_migratetype(page)))
7369 return false; 7439 return false;
7370 7440
7371 pfn = page_to_pfn(page); 7441 pfn = page_to_pfn(page);
@@ -7377,6 +7447,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7377 7447
7378 page = pfn_to_page(check); 7448 page = pfn_to_page(check);
7379 7449
7450 if (PageReserved(page))
7451 return true;
7452
7380 /* 7453 /*
7381 * Hugepages are not in LRU lists, but they're movable. 7454 * Hugepages are not in LRU lists, but they're movable.
7382 * We need not scan over tail pages because we don't 7455
@@ -7450,7 +7523,7 @@ bool is_pageblock_removable_nolock(struct page *page)
7450 if (!zone_spans_pfn(zone, pfn)) 7523 if (!zone_spans_pfn(zone, pfn))
7451 return false; 7524 return false;
7452 7525
7453 return !has_unmovable_pages(zone, page, 0, true); 7526 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
7454} 7527}
7455 7528
7456#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 7529#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
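
With the extra migratetype argument, has_unmovable_pages() lets only a CMA caller (alloc_contig_range) treat CMA pageblocks as movable, and the new PageReserved() check fails the scan immediately. A hedged, self-contained sketch of that decision order; the block representation and field names here are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

enum migratetype { MIGRATE_MOVABLE, MIGRATE_CMA, MIGRATE_UNMOVABLE };

struct blk {
    enum migratetype mt;     /* pageblock migratetype */
    bool in_zone_movable;    /* zone_idx() == ZONE_MOVABLE */
    bool has_reserved_page;  /* any PageReserved() page inside */
};

/* returns true when the block must be assumed to hold unmovable pages */
static bool block_has_unmovable(const struct blk *b, enum migratetype caller_mt)
{
    if (b->in_zone_movable)
        return false;
    /*
     * Only a CMA caller (alloc_contig_range) may treat a CMA pageblock
     * as movable; every other caller gets the full scan below.
     */
    if (caller_mt == MIGRATE_CMA && b->mt == MIGRATE_CMA)
        return false;
    /* a reserved page can never be migrated away */
    if (b->has_reserved_page)
        return true;
    /* ... per-page scan (hugepages, LRU pages, etc.) elided ... */
    return false;
}

int main(void)
{
    struct blk cma_block = { MIGRATE_CMA, false, true };

    /* CMA isolation may isolate it; other callers must not assume so */
    printf("cma caller:   %d\n", block_has_unmovable(&cma_block, MIGRATE_CMA));
    printf("other caller: %d\n", block_has_unmovable(&cma_block, MIGRATE_MOVABLE));
    return 0;
}
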
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 4f0367d472c4..2c16216c29b6 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct page *page)
125 struct page_ext *base; 125 struct page_ext *base;
126 126
127 base = NODE_DATA(page_to_nid(page))->node_page_ext; 127 base = NODE_DATA(page_to_nid(page))->node_page_ext;
128#if defined(CONFIG_DEBUG_VM)
129 /* 128 /*
130 * The sanity checks the page allocator does upon freeing a 129 * The sanity checks the page allocator does upon freeing a
131 * page can reach here before the page_ext arrays are 130 * page can reach here before the page_ext arrays are
@@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct page *page)
134 */ 133 */
135 if (unlikely(!base)) 134 if (unlikely(!base))
136 return NULL; 135 return NULL;
137#endif
138 index = pfn - round_down(node_start_pfn(page_to_nid(page)), 136 index = pfn - round_down(node_start_pfn(page_to_nid(page)),
139 MAX_ORDER_NR_PAGES); 137 MAX_ORDER_NR_PAGES);
140 return get_entry(base, index); 138 return get_entry(base, index);
@@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct page *page)
199{ 197{
200 unsigned long pfn = page_to_pfn(page); 198 unsigned long pfn = page_to_pfn(page);
201 struct mem_section *section = __pfn_to_section(pfn); 199 struct mem_section *section = __pfn_to_section(pfn);
202#if defined(CONFIG_DEBUG_VM)
203 /* 200 /*
204 * The sanity checks the page allocator does upon freeing a 201 * The sanity checks the page allocator does upon freeing a
205 * page can reach here before the page_ext arrays are 202 * page can reach here before the page_ext arrays are
@@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct page *page)
208 */ 205 */
209 if (!section->page_ext) 206 if (!section->page_ext)
210 return NULL; 207 return NULL;
211#endif
212 return get_entry(section->page_ext, pfn); 208 return get_entry(section->page_ext, pfn);
213} 209}
214 210
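
Dropping the CONFIG_DEBUG_VM guards makes the NULL check unconditional: a page can be freed, and sanity-checked, before the page_ext arrays are allocated, so lookup_page_ext() must be able to return NULL on every build. A simplified sketch of that lookup shape, with the table and index arithmetic reduced to a single node:

#include <stdio.h>
#include <stdlib.h>

struct page_ext { unsigned long flags; };

static struct page_ext *node_page_ext;   /* NULL until late in boot */
static unsigned long node_start_pfn;

/* always check for an unprepared table -- not only on debug builds */
static struct page_ext *lookup_page_ext(unsigned long pfn)
{
    if (!node_page_ext)
        return NULL;
    return &node_page_ext[pfn - node_start_pfn];
}

int main(void)
{
    /* early free path: table not allocated yet, lookup must fail gracefully */
    printf("early: %p\n", (void *)lookup_page_ext(5));

    node_page_ext = calloc(16, sizeof(*node_page_ext));
    printf("late:  %p\n", (void *)lookup_page_ext(5));
    free(node_page_ext);
    return 0;
}
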
diff --git a/mm/page_io.c b/mm/page_io.c
index cd52b9cc169b..e93f1a4cacd7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -347,7 +347,7 @@ out:
347 return ret; 347 return ret;
348} 348}
349 349
350int swap_readpage(struct page *page, bool do_poll) 350int swap_readpage(struct page *page, bool synchronous)
351{ 351{
352 struct bio *bio; 352 struct bio *bio;
353 int ret = 0; 353 int ret = 0;
@@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll)
355 blk_qc_t qc; 355 blk_qc_t qc;
356 struct gendisk *disk; 356 struct gendisk *disk;
357 357
358 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 358 VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
359 VM_BUG_ON_PAGE(!PageLocked(page), page); 359 VM_BUG_ON_PAGE(!PageLocked(page), page);
360 VM_BUG_ON_PAGE(PageUptodate(page), page); 360 VM_BUG_ON_PAGE(PageUptodate(page), page);
361 if (frontswap_load(page) == 0) { 361 if (frontswap_load(page) == 0) {
@@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool do_poll)
403 count_vm_event(PSWPIN); 403 count_vm_event(PSWPIN);
404 bio_get(bio); 404 bio_get(bio);
405 qc = submit_bio(bio); 405 qc = submit_bio(bio);
406 while (do_poll) { 406 while (synchronous) {
407 set_current_state(TASK_UNINTERRUPTIBLE); 407 set_current_state(TASK_UNINTERRUPTIBLE);
408 if (!READ_ONCE(bio->bi_private)) 408 if (!READ_ONCE(bio->bi_private))
409 break; 409 break;
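
The do_poll parameter becomes synchronous: a caller that needs the read finished before returning spins until the completion handler clears bio->bi_private. A rough userspace analogue of that wait, using an atomic flag and a thread in place of the bio and its completion; all names here are invented:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool io_in_flight = true;  /* plays the role of bio->bi_private */

/* pretend completion handler clearing the flag when the I/O is done */
static void *completion(void *arg)
{
    (void)arg;
    usleep(10000);
    atomic_store(&io_in_flight, false);
    return NULL;
}

static void read_page(bool synchronous)
{
    pthread_t t;

    pthread_create(&t, NULL, completion, NULL);
    /*
     * Asynchronous callers would return here and learn about completion
     * from the callback; synchronous callers wait for the flag themselves.
     * The kernel version parks in TASK_UNINTERRUPTIBLE and polls the queue.
     */
    while (synchronous && atomic_load(&io_in_flight))
        ;
    pthread_join(t, NULL);
    printf("page read %s\n", synchronous ? "synchronously" : "asynchronously");
}

int main(void)
{
    read_page(true);
    return 0;
}
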
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 44f213935bf6..165ed8117bd1 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,7 +15,7 @@
15#define CREATE_TRACE_POINTS 15#define CREATE_TRACE_POINTS
16#include <trace/events/page_isolation.h> 16#include <trace/events/page_isolation.h>
17 17
18static int set_migratetype_isolate(struct page *page, 18static int set_migratetype_isolate(struct page *page, int migratetype,
19 bool skip_hwpoisoned_pages) 19 bool skip_hwpoisoned_pages)
20{ 20{
21 struct zone *zone; 21 struct zone *zone;
@@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page,
52 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 52 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
53 * We just check MOVABLE pages. 53 * We just check MOVABLE pages.
54 */ 54 */
55 if (!has_unmovable_pages(zone, page, arg.pages_found, 55 if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
56 skip_hwpoisoned_pages)) 56 skip_hwpoisoned_pages))
57 ret = 0; 57 ret = 0;
58 58
@@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page,
64out: 64out:
65 if (!ret) { 65 if (!ret) {
66 unsigned long nr_pages; 66 unsigned long nr_pages;
67 int migratetype = get_pageblock_migratetype(page); 67 int mt = get_pageblock_migratetype(page);
68 68
69 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 69 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
70 zone->nr_isolate_pageblock++; 70 zone->nr_isolate_pageblock++;
71 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, 71 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
72 NULL); 72 NULL);
73 73
74 __mod_zone_freepage_state(zone, -nr_pages, migratetype); 74 __mod_zone_freepage_state(zone, -nr_pages, mt);
75 } 75 }
76 76
77 spin_unlock_irqrestore(&zone->lock, flags); 77 spin_unlock_irqrestore(&zone->lock, flags);
@@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
183 pfn += pageblock_nr_pages) { 183 pfn += pageblock_nr_pages) {
184 page = __first_valid_page(pfn, pageblock_nr_pages); 184 page = __first_valid_page(pfn, pageblock_nr_pages);
185 if (page && 185 if (page &&
186 set_migratetype_isolate(page, skip_hwpoisoned_pages)) { 186 set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) {
187 undo_pfn = pfn; 187 undo_pfn = pfn;
188 goto undo; 188 goto undo;
189 } 189 }
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 4f44b95b9d1e..8592543a0f15 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -20,9 +20,9 @@
20#define PAGE_OWNER_STACK_DEPTH (16) 20#define PAGE_OWNER_STACK_DEPTH (16)
21 21
22struct page_owner { 22struct page_owner {
23 unsigned int order; 23 unsigned short order;
24 short last_migrate_reason;
24 gfp_t gfp_mask; 25 gfp_t gfp_mask;
25 int last_migrate_reason;
26 depot_stack_handle_t handle; 26 depot_stack_handle_t handle;
27}; 27};
28 28
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 15dab691ea70..9158e5a81391 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
81static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 81static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
82 struct page **pages, int page_start, int page_end) 82 struct page **pages, int page_start, int page_end)
83{ 83{
84 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 84 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
85 unsigned int cpu, tcpu; 85 unsigned int cpu, tcpu;
86 int i; 86 int i;
87 87
diff --git a/mm/rmap.c b/mm/rmap.c
index b874c4761e84..47db27f8049e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -899,7 +899,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
899 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 899 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
900 900
901 while (page_vma_mapped_walk(&pvmw)) { 901 while (page_vma_mapped_walk(&pvmw)) {
902 unsigned long cstart, cend; 902 unsigned long cstart;
903 int ret = 0; 903 int ret = 0;
904 904
905 cstart = address = pvmw.address; 905 cstart = address = pvmw.address;
@@ -915,7 +915,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
915 entry = pte_wrprotect(entry); 915 entry = pte_wrprotect(entry);
916 entry = pte_mkclean(entry); 916 entry = pte_mkclean(entry);
917 set_pte_at(vma->vm_mm, address, pte, entry); 917 set_pte_at(vma->vm_mm, address, pte, entry);
918 cend = cstart + PAGE_SIZE;
919 ret = 1; 918 ret = 1;
920 } else { 919 } else {
921#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 920#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -931,7 +930,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
931 entry = pmd_mkclean(entry); 930 entry = pmd_mkclean(entry);
932 set_pmd_at(vma->vm_mm, address, pmd, entry); 931 set_pmd_at(vma->vm_mm, address, pmd, entry);
933 cstart &= PMD_MASK; 932 cstart &= PMD_MASK;
934 cend = cstart + PMD_SIZE;
935 ret = 1; 933 ret = 1;
936#else 934#else
937 /* unexpected pmd-mapped page? */ 935 /* unexpected pmd-mapped page? */
@@ -939,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
939#endif 937#endif
940 } 938 }
941 939
942 if (ret) { 940 /*
943 mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); 941 * No need to call mmu_notifier_invalidate_range() as we are
942 * downgrading page table protection, not changing it to point
943 * to a new page.
944 *
945 * See Documentation/vm/mmu_notifier.txt
946 */
947 if (ret)
944 (*cleaned)++; 948 (*cleaned)++;
945 }
946 } 949 }
947 950
948 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 951 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
@@ -1318,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound)
1318 * It would be tidy to reset the PageAnon mapping here, 1321 * It would be tidy to reset the PageAnon mapping here,
1319 * but that might overwrite a racing page_add_anon_rmap 1322 * but that might overwrite a racing page_add_anon_rmap
1320 * which increments mapcount after us but sets mapping 1323 * which increments mapcount after us but sets mapping
1321 * before us: so leave the reset to free_hot_cold_page, 1324 * before us: so leave the reset to free_unref_page,
1322 * and remember that it's only reliable while mapped. 1325 * and remember that it's only reliable while mapped.
1323 * Leaving it set also helps swapoff to reinstate ptes 1326 * Leaving it set also helps swapoff to reinstate ptes
1324 * faster for those pages still in swapcache. 1327 * faster for those pages still in swapcache.
@@ -1426,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1426 if (pte_soft_dirty(pteval)) 1429 if (pte_soft_dirty(pteval))
1427 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1430 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1428 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); 1431 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
1432 /*
1433 * No need to invalidate here; it will synchronize
1434 * against the special swap migration pte.
1435 */
1429 goto discard; 1436 goto discard;
1430 } 1437 }
1431 1438
@@ -1483,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1483 * will take care of the rest. 1490 * will take care of the rest.
1484 */ 1491 */
1485 dec_mm_counter(mm, mm_counter(page)); 1492 dec_mm_counter(mm, mm_counter(page));
1493 /* We have to invalidate as we cleared the pte */
1494 mmu_notifier_invalidate_range(mm, address,
1495 address + PAGE_SIZE);
1486 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1496 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1487 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { 1497 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
1488 swp_entry_t entry; 1498 swp_entry_t entry;
@@ -1498,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1498 if (pte_soft_dirty(pteval)) 1508 if (pte_soft_dirty(pteval))
1499 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1509 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1500 set_pte_at(mm, address, pvmw.pte, swp_pte); 1510 set_pte_at(mm, address, pvmw.pte, swp_pte);
1511 /*
1512 * No need to invalidate here; it will synchronize
1513 * against the special swap migration pte.
1514 */
1501 } else if (PageAnon(page)) { 1515 } else if (PageAnon(page)) {
1502 swp_entry_t entry = { .val = page_private(subpage) }; 1516 swp_entry_t entry = { .val = page_private(subpage) };
1503 pte_t swp_pte; 1517 pte_t swp_pte;
@@ -1509,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1509 WARN_ON_ONCE(1); 1523 WARN_ON_ONCE(1);
1510 ret = false; 1524 ret = false;
1511 /* We have to invalidate as we cleared the pte */ 1525 /* We have to invalidate as we cleared the pte */
1526 mmu_notifier_invalidate_range(mm, address,
1527 address + PAGE_SIZE);
1512 page_vma_mapped_walk_done(&pvmw); 1528 page_vma_mapped_walk_done(&pvmw);
1513 break; 1529 break;
1514 } 1530 }
@@ -1516,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1516 /* MADV_FREE page check */ 1532 /* MADV_FREE page check */
1517 if (!PageSwapBacked(page)) { 1533 if (!PageSwapBacked(page)) {
1518 if (!PageDirty(page)) { 1534 if (!PageDirty(page)) {
1535 /* Invalidate as we cleared the pte */
1536 mmu_notifier_invalidate_range(mm,
1537 address, address + PAGE_SIZE);
1519 dec_mm_counter(mm, MM_ANONPAGES); 1538 dec_mm_counter(mm, MM_ANONPAGES);
1520 goto discard; 1539 goto discard;
1521 } 1540 }
@@ -1549,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1549 if (pte_soft_dirty(pteval)) 1568 if (pte_soft_dirty(pteval))
1550 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1569 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1551 set_pte_at(mm, address, pvmw.pte, swp_pte); 1570 set_pte_at(mm, address, pvmw.pte, swp_pte);
1552 } else 1571 /* Invalidate as we cleared the pte */
1572 mmu_notifier_invalidate_range(mm, address,
1573 address + PAGE_SIZE);
1574 } else {
1575 /*
1576 * We should not need to notify here as we reach this
1577 * case only from freeze_page(), which itself is called
1578 * only from split_huge_page_to_list(), so everything
1579 * below must be true:
1580 * - page is not anonymous
1581 * - page is locked
1582 *
1583 * Since it is a locked file-backed page, it cannot be
1584 * removed from the page cache and replaced by a new
1585 * page before mmu_notifier_invalidate_range_end(), so no
1586 * concurrent thread can update its page table to
1587 * point at a new page while a device is still using this
1588 * page.
1589 *
1590 * See Documentation/vm/mmu_notifier.txt
1591 */
1553 dec_mm_counter(mm, mm_counter_file(page)); 1592 dec_mm_counter(mm, mm_counter_file(page));
1593 }
1554discard: 1594discard:
1595 /*
1596 * No need to call mmu_notifier_invalidate_range(); it has been
1597 * done above for all cases requiring it to happen under the
1598 * page table lock, before mmu_notifier_invalidate_range_end().
1599 *
1600 * See Documentation/vm/mmu_notifier.txt
1601 */
1555 page_remove_rmap(subpage, PageHuge(page)); 1602 page_remove_rmap(subpage, PageHuge(page));
1556 put_page(page); 1603 put_page(page);
1557 mmu_notifier_invalidate_range(mm, address,
1558 address + PAGE_SIZE);
1559 } 1604 }
1560 1605
1561 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 1606 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
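
The rmap changes call mmu_notifier_invalidate_range() exactly where a PTE is cleared under the page table lock and drop it where protection is merely downgraded, while the _start/_end pair still brackets the whole walk. A very reduced sketch of that bracketing pattern, with printf stubs standing in for the real notifier API:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the mmu_notifier calls; the real ones take an mm and a range */
static void invalidate_range_start(unsigned long s, unsigned long e)
{ printf("start  [%#lx, %#lx)\n", s, e); }
static void invalidate_range(unsigned long s, unsigned long e)
{ printf("  inval [%#lx, %#lx)\n", s, e); }
static void invalidate_range_end(unsigned long s, unsigned long e)
{ printf("end    [%#lx, %#lx)\n", s, e); }

#define PAGE_SIZE 0x1000UL

static void unmap_range(unsigned long start, unsigned long end, bool clears_pte)
{
    invalidate_range_start(start, end);
    for (unsigned long addr = start; addr < end; addr += PAGE_SIZE) {
        if (clears_pte) {
            /*
             * pte cleared or replaced: secondary TLBs must be told now,
             * under the page table lock, not only at _end time
             */
            invalidate_range(addr, addr + PAGE_SIZE);
        }
        /* write-protect-only paths skip the per-page call entirely */
    }
    invalidate_range_end(start, end);
}

int main(void)
{
    unmap_range(0x10000, 0x13000, true);
    return 0;
}
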
diff --git a/mm/shmem.c b/mm/shmem.c
index 07a1d22807be..ab22eaa2412e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -338,7 +338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
338 if (item != expected) 338 if (item != expected)
339 return -ENOENT; 339 return -ENOENT;
340 __radix_tree_replace(&mapping->page_tree, node, pslot, 340 __radix_tree_replace(&mapping->page_tree, node, pslot,
341 replacement, NULL, NULL); 341 replacement, NULL);
342 return 0; 342 return 0;
343} 343}
344 344
@@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping)
747 pgoff_t indices[PAGEVEC_SIZE]; 747 pgoff_t indices[PAGEVEC_SIZE];
748 pgoff_t index = 0; 748 pgoff_t index = 0;
749 749
750 pagevec_init(&pvec, 0); 750 pagevec_init(&pvec);
751 /* 751 /*
752 * Minor point, but we might as well stop if someone else SHM_LOCKs it. 752 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
753 */ 753 */
@@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
790 if (lend == -1) 790 if (lend == -1)
791 end = -1; /* unsigned, so actually very big */ 791 end = -1; /* unsigned, so actually very big */
792 792
793 pagevec_init(&pvec, 0); 793 pagevec_init(&pvec);
794 index = start; 794 index = start;
795 while (index < end) { 795 while (index < end) {
796 pvec.nr = find_get_entries(mapping, index, 796 pvec.nr = find_get_entries(mapping, index,
@@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
2528 bool done = false; 2528 bool done = false;
2529 int i; 2529 int i;
2530 2530
2531 pagevec_init(&pvec, 0); 2531 pagevec_init(&pvec);
2532 pvec.nr = 1; /* start small: we may be there already */ 2532 pvec.nr = 1; /* start small: we may be there already */
2533 while (!done) { 2533 while (!done) {
2534 pvec.nr = find_get_entries(mapping, index, 2534 pvec.nr = find_get_entries(mapping, index,
@@ -3862,12 +3862,11 @@ static void shmem_init_inode(void *foo)
3862 inode_init_once(&info->vfs_inode); 3862 inode_init_once(&info->vfs_inode);
3863} 3863}
3864 3864
3865static int shmem_init_inodecache(void) 3865static void shmem_init_inodecache(void)
3866{ 3866{
3867 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 3867 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3868 sizeof(struct shmem_inode_info), 3868 sizeof(struct shmem_inode_info),
3869 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 3869 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
3870 return 0;
3871} 3870}
3872 3871
3873static void shmem_destroy_inodecache(void) 3872static void shmem_destroy_inodecache(void)
@@ -3991,9 +3990,7 @@ int __init shmem_init(void)
3991 if (shmem_inode_cachep) 3990 if (shmem_inode_cachep)
3992 return 0; 3991 return 0;
3993 3992
3994 error = shmem_init_inodecache(); 3993 shmem_init_inodecache();
3995 if (error)
3996 goto out3;
3997 3994
3998 error = register_filesystem(&shmem_fs_type); 3995 error = register_filesystem(&shmem_fs_type);
3999 if (error) { 3996 if (error) {
@@ -4020,7 +4017,6 @@ out1:
4020 unregister_filesystem(&shmem_fs_type); 4017 unregister_filesystem(&shmem_fs_type);
4021out2: 4018out2:
4022 shmem_destroy_inodecache(); 4019 shmem_destroy_inodecache();
4023out3:
4024 shm_mnt = ERR_PTR(error); 4020 shm_mnt = ERR_PTR(error);
4025 return error; 4021 return error;
4026} 4022}
@@ -4102,6 +4098,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
4102 if (i_size >= HPAGE_PMD_SIZE && 4098 if (i_size >= HPAGE_PMD_SIZE &&
4103 i_size >> PAGE_SHIFT >= off) 4099 i_size >> PAGE_SHIFT >= off)
4104 return true; 4100 return true;
4101 /* fall through */
4105 case SHMEM_HUGE_ADVISE: 4102 case SHMEM_HUGE_ADVISE:
4106 /* TODO: implement fadvise() hints */ 4103 /* TODO: implement fadvise() hints */
4107 return (vma->vm_flags & VM_HUGEPAGE); 4104 return (vma->vm_flags & VM_HUGEPAGE);
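
Because the cache is created with SLAB_PANIC, kmem_cache_create() either succeeds or panics, so shmem_init_inodecache() can return void and shmem_init() loses the out3 error label. A generic sketch of the same simplification, with a made-up panic-on-failure allocator in place of the slab API:

#include <stdio.h>
#include <stdlib.h>

/* pretend allocator: with "panic on failure" semantics it never returns NULL */
static void *create_cache_or_die(const char *name, size_t size)
{
    void *c = malloc(size);

    if (!c) {
        fprintf(stderr, "panic: cannot create %s\n", name);
        exit(1);
    }
    return c;
}

static void *inode_cache;

/* was: int init_inodecache(void) { ...; return 0; } */
static void init_inodecache(void)
{
    inode_cache = create_cache_or_die("shmem_inode_cache", 128);
}

int main(void)
{
    init_inodecache();   /* nothing to check, no error label to unwind to */
    printf("cache at %p\n", inode_cache);
    free(inode_cache);
    return 0;
}
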
diff --git a/mm/slab.c b/mm/slab.c
index b7095884fd93..183e996dde5f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,7 +114,6 @@
114#include <linux/rtmutex.h> 114#include <linux/rtmutex.h>
115#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
116#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h>
118#include <linux/memory.h> 117#include <linux/memory.h>
119#include <linux/prefetch.h> 118#include <linux/prefetch.h>
120#include <linux/sched/task_stack.h> 119#include <linux/sched/task_stack.h>
@@ -252,8 +251,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
252 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 251 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
253 } while (0) 252 } while (0)
254 253
255#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) 254#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U)
256#define CFLGS_OFF_SLAB (0x80000000UL) 255#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U)
257#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) 256#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
258#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 257#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
259 258
@@ -441,7 +440,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
441 * Calculate the number of objects and left-over bytes for a given buffer size. 440 * Calculate the number of objects and left-over bytes for a given buffer size.
442 */ 441 */
443static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, 442static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
444 unsigned long flags, size_t *left_over) 443 slab_flags_t flags, size_t *left_over)
445{ 444{
446 unsigned int num; 445 unsigned int num;
447 size_t slab_size = PAGE_SIZE << gfporder; 446 size_t slab_size = PAGE_SIZE << gfporder;
@@ -1410,10 +1409,8 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1410 int nr_pages; 1409 int nr_pages;
1411 1410
1412 flags |= cachep->allocflags; 1411 flags |= cachep->allocflags;
1413 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1414 flags |= __GFP_RECLAIMABLE;
1415 1412
1416 page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1413 page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
1417 if (!page) { 1414 if (!page) {
1418 slab_out_of_memory(cachep, flags, nodeid); 1415 slab_out_of_memory(cachep, flags, nodeid);
1419 return NULL; 1416 return NULL;
@@ -1435,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1435 if (sk_memalloc_socks() && page_is_pfmemalloc(page)) 1432 if (sk_memalloc_socks() && page_is_pfmemalloc(page))
1436 SetPageSlabPfmemalloc(page); 1433 SetPageSlabPfmemalloc(page);
1437 1434
1438 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1439 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1440
1441 if (cachep->ctor)
1442 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1443 else
1444 kmemcheck_mark_unallocated_pages(page, nr_pages);
1445 }
1446
1447 return page; 1435 return page;
1448} 1436}
1449 1437
@@ -1455,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1455 int order = cachep->gfporder; 1443 int order = cachep->gfporder;
1456 unsigned long nr_freed = (1 << order); 1444 unsigned long nr_freed = (1 << order);
1457 1445
1458 kmemcheck_free_shadow(page, order);
1459
1460 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1446 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1461 mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); 1447 mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
1462 else 1448 else
@@ -1761,7 +1747,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1761 * towards high-order requests, this should be changed. 1747 * towards high-order requests, this should be changed.
1762 */ 1748 */
1763static size_t calculate_slab_order(struct kmem_cache *cachep, 1749static size_t calculate_slab_order(struct kmem_cache *cachep,
1764 size_t size, unsigned long flags) 1750 size_t size, slab_flags_t flags)
1765{ 1751{
1766 size_t left_over = 0; 1752 size_t left_over = 0;
1767 int gfporder; 1753 int gfporder;
@@ -1888,8 +1874,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
1888 return 0; 1874 return 0;
1889} 1875}
1890 1876
1891unsigned long kmem_cache_flags(unsigned long object_size, 1877slab_flags_t kmem_cache_flags(unsigned long object_size,
1892 unsigned long flags, const char *name, 1878 slab_flags_t flags, const char *name,
1893 void (*ctor)(void *)) 1879 void (*ctor)(void *))
1894{ 1880{
1895 return flags; 1881 return flags;
@@ -1897,7 +1883,7 @@ unsigned long kmem_cache_flags(unsigned long object_size,
1897 1883
1898struct kmem_cache * 1884struct kmem_cache *
1899__kmem_cache_alias(const char *name, size_t size, size_t align, 1885__kmem_cache_alias(const char *name, size_t size, size_t align,
1900 unsigned long flags, void (*ctor)(void *)) 1886 slab_flags_t flags, void (*ctor)(void *))
1901{ 1887{
1902 struct kmem_cache *cachep; 1888 struct kmem_cache *cachep;
1903 1889
@@ -1915,7 +1901,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
1915} 1901}
1916 1902
1917static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, 1903static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1918 size_t size, unsigned long flags) 1904 size_t size, slab_flags_t flags)
1919{ 1905{
1920 size_t left; 1906 size_t left;
1921 1907
@@ -1938,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1938} 1924}
1939 1925
1940static bool set_off_slab_cache(struct kmem_cache *cachep, 1926static bool set_off_slab_cache(struct kmem_cache *cachep,
1941 size_t size, unsigned long flags) 1927 size_t size, slab_flags_t flags)
1942{ 1928{
1943 size_t left; 1929 size_t left;
1944 1930
@@ -1972,7 +1958,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep,
1972} 1958}
1973 1959
1974static bool set_on_slab_cache(struct kmem_cache *cachep, 1960static bool set_on_slab_cache(struct kmem_cache *cachep,
1975 size_t size, unsigned long flags) 1961 size_t size, slab_flags_t flags)
1976{ 1962{
1977 size_t left; 1963 size_t left;
1978 1964
@@ -2008,8 +1994,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
2008 * cacheline. This can be beneficial if you're counting cycles as closely 1994 * cacheline. This can be beneficial if you're counting cycles as closely
2009 * as davem. 1995 * as davem.
2010 */ 1996 */
2011int 1997int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
2012__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2013{ 1998{
2014 size_t ralign = BYTES_PER_WORD; 1999 size_t ralign = BYTES_PER_WORD;
2015 gfp_t gfp; 2000 gfp_t gfp;
@@ -2144,6 +2129,8 @@ done:
2144 cachep->allocflags = __GFP_COMP; 2129 cachep->allocflags = __GFP_COMP;
2145 if (flags & SLAB_CACHE_DMA) 2130 if (flags & SLAB_CACHE_DMA)
2146 cachep->allocflags |= GFP_DMA; 2131 cachep->allocflags |= GFP_DMA;
2132 if (flags & SLAB_RECLAIM_ACCOUNT)
2133 cachep->allocflags |= __GFP_RECLAIMABLE;
2147 cachep->size = size; 2134 cachep->size = size;
2148 cachep->reciprocal_buffer_size = reciprocal_value(size); 2135 cachep->reciprocal_buffer_size = reciprocal_value(size);
2149 2136
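
Instead of OR-ing __GFP_RECLAIMABLE into the mask on every kmem_getpages() call, the bit is folded into cachep->allocflags once at cache creation. A small sketch of moving that per-allocation decision to setup time; the flag values and struct layout are invented for the example:

#include <stdio.h>

#define GFP_COMP              0x1u
#define GFP_DMA               0x2u
#define GFP_RECLAIMABLE       0x4u

#define SLAB_CACHE_DMA        0x1u
#define SLAB_RECLAIM_ACCOUNT  0x2u

struct cache {
    unsigned int flags;       /* slab_flags_t-like cache properties */
    unsigned int allocflags;  /* precomputed gfp bits for page allocation */
};

static void cache_create(struct cache *c, unsigned int flags)
{
    c->flags = flags;
    c->allocflags = GFP_COMP;
    if (flags & SLAB_CACHE_DMA)
        c->allocflags |= GFP_DMA;
    if (flags & SLAB_RECLAIM_ACCOUNT)  /* decided once, not per allocation */
        c->allocflags |= GFP_RECLAIMABLE;
}

static unsigned int getpages_gfp(const struct cache *c, unsigned int caller_gfp)
{
    return caller_gfp | c->allocflags;  /* hot path is a plain OR now */
}

int main(void)
{
    struct cache c;

    cache_create(&c, SLAB_RECLAIM_ACCOUNT);
    printf("gfp for allocation: %#x\n", getpages_gfp(&c, 0x10u));
    return 0;
}
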
@@ -3516,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
3516 kmemleak_free_recursive(objp, cachep->flags); 3503 kmemleak_free_recursive(objp, cachep->flags);
3517 objp = cache_free_debugcheck(cachep, objp, caller); 3504 objp = cache_free_debugcheck(cachep, objp, caller);
3518 3505
3519 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3520
3521 /* 3506 /*
3522 * Skip calling cache_free_alien() when the platform is not numa. 3507 * Skip calling cache_free_alien() when the platform is not numa.
3523 * This will avoid cache misses that happen while accessing slabp (which 3508 * This will avoid cache misses that happen while accessing slabp (which
@@ -4097,7 +4082,6 @@ out:
4097 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); 4082 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
4098} 4083}
4099 4084
4100#ifdef CONFIG_SLABINFO
4101void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 4085void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4102{ 4086{
4103 unsigned long active_objs, num_objs, active_slabs; 4087 unsigned long active_objs, num_objs, active_slabs;
@@ -4405,7 +4389,6 @@ static int __init slab_proc_init(void)
4405 return 0; 4389 return 0;
4406} 4390}
4407module_init(slab_proc_init); 4391module_init(slab_proc_init);
4408#endif
4409 4392
4410#ifdef CONFIG_HARDENED_USERCOPY 4393#ifdef CONFIG_HARDENED_USERCOPY
4411/* 4394/*
diff --git a/mm/slab.h b/mm/slab.h
index 86d7c7d860f9..ad657ffa44e5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -21,7 +21,7 @@ struct kmem_cache {
21 unsigned int object_size;/* The original size of the object */ 21 unsigned int object_size;/* The original size of the object */
22 unsigned int size; /* The aligned/padded/added on size */ 22 unsigned int size; /* The aligned/padded/added on size */
23 unsigned int align; /* Alignment as calculated */ 23 unsigned int align; /* Alignment as calculated */
24 unsigned long flags; /* Active flags on the slab */ 24 slab_flags_t flags; /* Active flags on the slab */
25 const char *name; /* Slab name for sysfs */ 25 const char *name; /* Slab name for sysfs */
26 int refcount; /* Use counter */ 26 int refcount; /* Use counter */
27 void (*ctor)(void *); /* Called on object slot creation */ 27 void (*ctor)(void *); /* Called on object slot creation */
@@ -40,7 +40,6 @@ struct kmem_cache {
40 40
41#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
42#include <linux/fault-inject.h> 42#include <linux/fault-inject.h>
43#include <linux/kmemcheck.h>
44#include <linux/kasan.h> 43#include <linux/kasan.h>
45#include <linux/kmemleak.h> 44#include <linux/kmemleak.h>
46#include <linux/random.h> 45#include <linux/random.h>
@@ -79,13 +78,13 @@ extern const struct kmalloc_info_struct {
79 unsigned long size; 78 unsigned long size;
80} kmalloc_info[]; 79} kmalloc_info[];
81 80
82unsigned long calculate_alignment(unsigned long flags, 81unsigned long calculate_alignment(slab_flags_t flags,
83 unsigned long align, unsigned long size); 82 unsigned long align, unsigned long size);
84 83
85#ifndef CONFIG_SLOB 84#ifndef CONFIG_SLOB
86/* Kmalloc array related functions */ 85/* Kmalloc array related functions */
87void setup_kmalloc_cache_index_table(void); 86void setup_kmalloc_cache_index_table(void);
88void create_kmalloc_caches(unsigned long); 87void create_kmalloc_caches(slab_flags_t);
89 88
90/* Find the kmalloc slab corresponding for a certain size */ 89/* Find the kmalloc slab corresponding for a certain size */
91struct kmem_cache *kmalloc_slab(size_t, gfp_t); 90struct kmem_cache *kmalloc_slab(size_t, gfp_t);
@@ -93,32 +92,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t);
93 92
94 93
95/* Functions provided by the slab allocators */ 94/* Functions provided by the slab allocators */
96extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); 95int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
97 96
98extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, 97extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
99 unsigned long flags); 98 slab_flags_t flags);
100extern void create_boot_cache(struct kmem_cache *, const char *name, 99extern void create_boot_cache(struct kmem_cache *, const char *name,
101 size_t size, unsigned long flags); 100 size_t size, slab_flags_t flags);
102 101
103int slab_unmergeable(struct kmem_cache *s); 102int slab_unmergeable(struct kmem_cache *s);
104struct kmem_cache *find_mergeable(size_t size, size_t align, 103struct kmem_cache *find_mergeable(size_t size, size_t align,
105 unsigned long flags, const char *name, void (*ctor)(void *)); 104 slab_flags_t flags, const char *name, void (*ctor)(void *));
106#ifndef CONFIG_SLOB 105#ifndef CONFIG_SLOB
107struct kmem_cache * 106struct kmem_cache *
108__kmem_cache_alias(const char *name, size_t size, size_t align, 107__kmem_cache_alias(const char *name, size_t size, size_t align,
109 unsigned long flags, void (*ctor)(void *)); 108 slab_flags_t flags, void (*ctor)(void *));
110 109
111unsigned long kmem_cache_flags(unsigned long object_size, 110slab_flags_t kmem_cache_flags(unsigned long object_size,
112 unsigned long flags, const char *name, 111 slab_flags_t flags, const char *name,
113 void (*ctor)(void *)); 112 void (*ctor)(void *));
114#else 113#else
115static inline struct kmem_cache * 114static inline struct kmem_cache *
116__kmem_cache_alias(const char *name, size_t size, size_t align, 115__kmem_cache_alias(const char *name, size_t size, size_t align,
117 unsigned long flags, void (*ctor)(void *)) 116 slab_flags_t flags, void (*ctor)(void *))
118{ return NULL; } 117{ return NULL; }
119 118
120static inline unsigned long kmem_cache_flags(unsigned long object_size, 119static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
121 unsigned long flags, const char *name, 120 slab_flags_t flags, const char *name,
122 void (*ctor)(void *)) 121 void (*ctor)(void *))
123{ 122{
124 return flags; 123 return flags;
@@ -142,10 +141,10 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
142#if defined(CONFIG_SLAB) 141#if defined(CONFIG_SLAB)
143#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ 142#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
144 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ 143 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
145 SLAB_NOTRACK | SLAB_ACCOUNT) 144 SLAB_ACCOUNT)
146#elif defined(CONFIG_SLUB) 145#elif defined(CONFIG_SLUB)
147#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ 146#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
148 SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) 147 SLAB_TEMPORARY | SLAB_ACCOUNT)
149#else 148#else
150#define SLAB_CACHE_FLAGS (0) 149#define SLAB_CACHE_FLAGS (0)
151#endif 150#endif
@@ -164,7 +163,6 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
164 SLAB_NOLEAKTRACE | \ 163 SLAB_NOLEAKTRACE | \
165 SLAB_RECLAIM_ACCOUNT | \ 164 SLAB_RECLAIM_ACCOUNT | \
166 SLAB_TEMPORARY | \ 165 SLAB_TEMPORARY | \
167 SLAB_NOTRACK | \
168 SLAB_ACCOUNT) 166 SLAB_ACCOUNT)
169 167
170int __kmem_cache_shutdown(struct kmem_cache *); 168int __kmem_cache_shutdown(struct kmem_cache *);
@@ -439,7 +437,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
439 for (i = 0; i < size; i++) { 437 for (i = 0; i < size; i++) {
440 void *object = p[i]; 438 void *object = p[i];
441 439
442 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
443 kmemleak_alloc_recursive(object, s->object_size, 1, 440 kmemleak_alloc_recursive(object, s->object_size, 1,
444 s->flags, flags); 441 s->flags, flags);
445 kasan_slab_alloc(s, object, flags); 442 kasan_slab_alloc(s, object, flags);
@@ -506,6 +503,14 @@ void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
506void memcg_slab_stop(struct seq_file *m, void *p); 503void memcg_slab_stop(struct seq_file *m, void *p);
507int memcg_slab_show(struct seq_file *m, void *p); 504int memcg_slab_show(struct seq_file *m, void *p);
508 505
506#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
507void dump_unreclaimable_slab(void);
508#else
509static inline void dump_unreclaimable_slab(void)
510{
511}
512#endif
513
509void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); 514void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
510 515
511#ifdef CONFIG_SLAB_FREELIST_RANDOM 516#ifdef CONFIG_SLAB_FREELIST_RANDOM
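
Across slab.h the cache flags move from bare unsigned long to slab_flags_t, a dedicated bitwise type intended to let sparse flag any mixing with gfp or other flag spaces, with internal flags carrying explicit (slab_flags_t __force) casts. A hedged userspace approximation follows; without sparse the typedef is only documentation, and the bit values shown are illustrative:

#include <stdio.h>

/*
 * In the kernel this is roughly: typedef unsigned int __bitwise slab_flags_t;
 * __bitwise/__force only mean something to sparse, so plain C just sees u32.
 */
typedef unsigned int slab_flags_t;

#define SLAB_RECLAIM_ACCOUNT  ((slab_flags_t)0x00020000U)
#define SLAB_ACCOUNT          ((slab_flags_t)0x04000000U)
/* internal-only flags keep the top bits, mirroring CFLGS_OFF_SLAB and friends */
#define CFLGS_OFF_SLAB        ((slab_flags_t)0x80000000U)

static slab_flags_t kmem_cache_flags(slab_flags_t flags)
{
    /* a debug implementation could add consistency-check bits here */
    return flags;
}

int main(void)
{
    slab_flags_t f = kmem_cache_flags(SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT);

    printf("flags = %#x, off-slab? %s\n", f,
           (f & CFLGS_OFF_SLAB) ? "yes" : "no");
    return 0;
}
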
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0d7fe71ff5e4..c8cb36774ba1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
44 SLAB_FAILSLAB | SLAB_KASAN) 44 SLAB_FAILSLAB | SLAB_KASAN)
45 45
46#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ 46#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
47 SLAB_NOTRACK | SLAB_ACCOUNT) 47 SLAB_ACCOUNT)
48 48
49/* 49/*
50 * Merge control. If this is set then no merging of slab caches will occur. 50 * Merge control. If this is set then no merging of slab caches will occur.
@@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s)
291} 291}
292 292
293struct kmem_cache *find_mergeable(size_t size, size_t align, 293struct kmem_cache *find_mergeable(size_t size, size_t align,
294 unsigned long flags, const char *name, void (*ctor)(void *)) 294 slab_flags_t flags, const char *name, void (*ctor)(void *))
295{ 295{
296 struct kmem_cache *s; 296 struct kmem_cache *s;
297 297
@@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
341 * Figure out what the alignment of the objects will be given a set of 341 * Figure out what the alignment of the objects will be given a set of
342 * flags, a user specified alignment and the size of the objects. 342 * flags, a user specified alignment and the size of the objects.
343 */ 343 */
344unsigned long calculate_alignment(unsigned long flags, 344unsigned long calculate_alignment(slab_flags_t flags,
345 unsigned long align, unsigned long size) 345 unsigned long align, unsigned long size)
346{ 346{
347 /* 347 /*
@@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags,
366 366
367static struct kmem_cache *create_cache(const char *name, 367static struct kmem_cache *create_cache(const char *name,
368 size_t object_size, size_t size, size_t align, 368 size_t object_size, size_t size, size_t align,
369 unsigned long flags, void (*ctor)(void *), 369 slab_flags_t flags, void (*ctor)(void *),
370 struct mem_cgroup *memcg, struct kmem_cache *root_cache) 370 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
371{ 371{
372 struct kmem_cache *s; 372 struct kmem_cache *s;
@@ -431,7 +431,7 @@ out_free_cache:
431 */ 431 */
432struct kmem_cache * 432struct kmem_cache *
433kmem_cache_create(const char *name, size_t size, size_t align, 433kmem_cache_create(const char *name, size_t size, size_t align,
434 unsigned long flags, void (*ctor)(void *)) 434 slab_flags_t flags, void (*ctor)(void *))
435{ 435{
436 struct kmem_cache *s = NULL; 436 struct kmem_cache *s = NULL;
437 const char *cache_name; 437 const char *cache_name;
@@ -879,7 +879,7 @@ bool slab_is_available(void)
879#ifndef CONFIG_SLOB 879#ifndef CONFIG_SLOB
880/* Create a cache during boot when no slab services are available yet */ 880/* Create a cache during boot when no slab services are available yet */
881void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, 881void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
882 unsigned long flags) 882 slab_flags_t flags)
883{ 883{
884 int err; 884 int err;
885 885
@@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
899} 899}
900 900
901struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, 901struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
902 unsigned long flags) 902 slab_flags_t flags)
903{ 903{
904 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 904 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
905 905
@@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void)
1057 } 1057 }
1058} 1058}
1059 1059
1060static void __init new_kmalloc_cache(int idx, unsigned long flags) 1060static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
1061{ 1061{
1062 kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, 1062 kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
1063 kmalloc_info[idx].size, flags); 1063 kmalloc_info[idx].size, flags);
@@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags)
1068 * may already have been created because they were needed to 1068 * may already have been created because they were needed to
1069 * enable allocations for slab creation. 1069 * enable allocations for slab creation.
1070 */ 1070 */
1071void __init create_kmalloc_caches(unsigned long flags) 1071void __init create_kmalloc_caches(slab_flags_t flags)
1072{ 1072{
1073 int i; 1073 int i;
1074 1074
@@ -1184,8 +1184,7 @@ void cache_random_seq_destroy(struct kmem_cache *cachep)
1184} 1184}
1185#endif /* CONFIG_SLAB_FREELIST_RANDOM */ 1185#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1186 1186
1187#ifdef CONFIG_SLABINFO 1187#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
1188
1189#ifdef CONFIG_SLAB 1188#ifdef CONFIG_SLAB
1190#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) 1189#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
1191#else 1190#else
@@ -1281,7 +1280,41 @@ static int slab_show(struct seq_file *m, void *p)
1281 return 0; 1280 return 0;
1282} 1281}
1283 1282
1284#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) 1283void dump_unreclaimable_slab(void)
1284{
1285 struct kmem_cache *s, *s2;
1286 struct slabinfo sinfo;
1287
1288 /*
1289 * Acquiring slab_mutex here is risky since we do not want to
1290 * sleep in the OOM path. But without holding the mutex we would
1291 * risk a crash.
1292 * Use mutex_trylock to protect the list traversal, and dump nothing
1293 * if the mutex cannot be acquired.
1294 */
1295 if (!mutex_trylock(&slab_mutex)) {
1296 pr_warn("excessive unreclaimable slab but cannot dump stats\n");
1297 return;
1298 }
1299
1300 pr_info("Unreclaimable slab info:\n");
1301 pr_info("Name Used Total\n");
1302
1303 list_for_each_entry_safe(s, s2, &slab_caches, list) {
1304 if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
1305 continue;
1306
1307 get_slabinfo(s, &sinfo);
1308
1309 if (sinfo.num_objs > 0)
1310 pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
1311 (sinfo.active_objs * s->size) / 1024,
1312 (sinfo.num_objs * s->size) / 1024);
1313 }
1314 mutex_unlock(&slab_mutex);
1315}
1316
1317#if defined(CONFIG_MEMCG)
1285void *memcg_slab_start(struct seq_file *m, loff_t *pos) 1318void *memcg_slab_start(struct seq_file *m, loff_t *pos)
1286{ 1319{
1287 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1320 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -1355,7 +1388,7 @@ static int __init slab_proc_init(void)
1355 return 0; 1388 return 0;
1356} 1389}
1357module_init(slab_proc_init); 1390module_init(slab_proc_init);
1358#endif /* CONFIG_SLABINFO */ 1391#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
1359 1392
1360static __always_inline void *__do_krealloc(const void *p, size_t new_size, 1393static __always_inline void *__do_krealloc(const void *p, size_t new_size,
1361 gfp_t flags) 1394 gfp_t flags)
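
dump_unreclaimable_slab() only trylocks slab_mutex because it runs from the OOM path, where sleeping on the mutex is unwelcome; if the lock is contended it prints a warning and gives up rather than walk the cache list unprotected. A userspace sketch of the same trylock-and-walk pattern, with the cache list and sizes made up:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct cache_info {
    const char *name;
    bool reclaimable;
    unsigned long objs, active, size;  /* object count, in-use count, object size */
};

static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;

static struct cache_info caches[] = {
    { "dentry",       true,  9000, 8500, 192  },
    { "kmalloc-4096", false, 2048, 2000, 4096 },
};

static void dump_unreclaimable(void)
{
    /* never sleep here: bail out if someone else holds the lock */
    if (pthread_mutex_trylock(&slab_mutex) != 0) {
        fprintf(stderr, "excessive unreclaimable slab but cannot dump stats\n");
        return;
    }
    printf("Unreclaimable slab info:\n%-17s %10s %10s\n", "Name", "Used", "Total");
    for (unsigned i = 0; i < sizeof(caches) / sizeof(caches[0]); i++) {
        if (caches[i].reclaimable || caches[i].objs == 0)
            continue;
        printf("%-17s %9luKB %9luKB\n", caches[i].name,
               caches[i].active * caches[i].size / 1024,
               caches[i].objs * caches[i].size / 1024);
    }
    pthread_mutex_unlock(&slab_mutex);
}

int main(void)
{
    dump_unreclaimable();
    return 0;
}
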
diff --git a/mm/slob.c b/mm/slob.c
index 10249160b693..623e8a5c46ce 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
330 BUG_ON(!b); 330 BUG_ON(!b);
331 spin_unlock_irqrestore(&slob_lock, flags); 331 spin_unlock_irqrestore(&slob_lock, flags);
332 } 332 }
333 if (unlikely((gfp & __GFP_ZERO) && b)) 333 if (unlikely(gfp & __GFP_ZERO))
334 memset(b, 0, size); 334 memset(b, 0, size);
335 return b; 335 return b;
336} 336}
@@ -524,7 +524,7 @@ size_t ksize(const void *block)
524} 524}
525EXPORT_SYMBOL(ksize); 525EXPORT_SYMBOL(ksize);
526 526
527int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 527int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
528{ 528{
529 if (flags & SLAB_TYPESAFE_BY_RCU) { 529 if (flags & SLAB_TYPESAFE_BY_RCU) {
530 /* leave room for rcu footer at the end of object */ 530 /* leave room for rcu footer at the end of object */
diff --git a/mm/slub.c b/mm/slub.c
index 1efbb8123037..cfd56e5a35fb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,7 +22,6 @@
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24#include <linux/kasan.h> 24#include <linux/kasan.h>
25#include <linux/kmemcheck.h>
26#include <linux/cpu.h> 25#include <linux/cpu.h>
27#include <linux/cpuset.h> 26#include <linux/cpuset.h>
28#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
@@ -193,8 +192,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
193#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 192#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
194 193
195/* Internal SLUB flags */ 194/* Internal SLUB flags */
196#define __OBJECT_POISON 0x80000000UL /* Poison object */ 195/* Poison object */
197#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 196#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
197/* Use cmpxchg_double */
198#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
198 199
199/* 200/*
200 * Tracking user of a slab. 201 * Tracking user of a slab.
@@ -485,9 +486,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
485 * Debug settings: 486 * Debug settings:
486 */ 487 */
487#if defined(CONFIG_SLUB_DEBUG_ON) 488#if defined(CONFIG_SLUB_DEBUG_ON)
488static int slub_debug = DEBUG_DEFAULT_FLAGS; 489static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
489#else 490#else
490static int slub_debug; 491static slab_flags_t slub_debug;
491#endif 492#endif
492 493
493static char *slub_debug_slabs; 494static char *slub_debug_slabs;
@@ -1289,8 +1290,8 @@ out:
1289 1290
1290__setup("slub_debug", setup_slub_debug); 1291__setup("slub_debug", setup_slub_debug);
1291 1292
1292unsigned long kmem_cache_flags(unsigned long object_size, 1293slab_flags_t kmem_cache_flags(unsigned long object_size,
1293 unsigned long flags, const char *name, 1294 slab_flags_t flags, const char *name,
1294 void (*ctor)(void *)) 1295 void (*ctor)(void *))
1295{ 1296{
1296 /* 1297 /*
@@ -1322,8 +1323,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1322 struct page *page) {} 1323 struct page *page) {}
1323static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, 1324static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1324 struct page *page) {} 1325 struct page *page) {}
1325unsigned long kmem_cache_flags(unsigned long object_size, 1326slab_flags_t kmem_cache_flags(unsigned long object_size,
1326 unsigned long flags, const char *name, 1327 slab_flags_t flags, const char *name,
1327 void (*ctor)(void *)) 1328 void (*ctor)(void *))
1328{ 1329{
1329 return flags; 1330 return flags;
@@ -1370,12 +1371,11 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x)
1370 * So in order to make the debug calls that expect irqs to be 1371 * So in order to make the debug calls that expect irqs to be
1371 * disabled we need to disable interrupts temporarily. 1372 * disabled we need to disable interrupts temporarily.
1372 */ 1373 */
1373#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) 1374#ifdef CONFIG_LOCKDEP
1374 { 1375 {
1375 unsigned long flags; 1376 unsigned long flags;
1376 1377
1377 local_irq_save(flags); 1378 local_irq_save(flags);
1378 kmemcheck_slab_free(s, x, s->object_size);
1379 debug_check_no_locks_freed(x, s->object_size); 1379 debug_check_no_locks_freed(x, s->object_size);
1380 local_irq_restore(flags); 1380 local_irq_restore(flags);
1381 } 1381 }
@@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
1399 * Compiler cannot detect this function can be removed if slab_free_hook() 1399 * Compiler cannot detect this function can be removed if slab_free_hook()
1400 * evaluates to nothing. Thus, catch all relevant config debug options here. 1400 * evaluates to nothing. Thus, catch all relevant config debug options here.
1401 */ 1401 */
1402#if defined(CONFIG_KMEMCHECK) || \ 1402#if defined(CONFIG_LOCKDEP) || \
1403 defined(CONFIG_LOCKDEP) || \
1404 defined(CONFIG_DEBUG_KMEMLEAK) || \ 1403 defined(CONFIG_DEBUG_KMEMLEAK) || \
1405 defined(CONFIG_DEBUG_OBJECTS_FREE) || \ 1404 defined(CONFIG_DEBUG_OBJECTS_FREE) || \
1406 defined(CONFIG_KASAN) 1405 defined(CONFIG_KASAN)
@@ -1436,8 +1435,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1436 struct page *page; 1435 struct page *page;
1437 int order = oo_order(oo); 1436 int order = oo_order(oo);
1438 1437
1439 flags |= __GFP_NOTRACK;
1440
1441 if (node == NUMA_NO_NODE) 1438 if (node == NUMA_NO_NODE)
1442 page = alloc_pages(flags, order); 1439 page = alloc_pages(flags, order);
1443 else 1440 else
@@ -1596,22 +1593,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1596 stat(s, ORDER_FALLBACK); 1593 stat(s, ORDER_FALLBACK);
1597 } 1594 }
1598 1595
1599 if (kmemcheck_enabled &&
1600 !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1601 int pages = 1 << oo_order(oo);
1602
1603 kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
1604
1605 /*
1606 * Objects from caches that have a constructor don't get
1607 * cleared when they're allocated, so we need to do it here.
1608 */
1609 if (s->ctor)
1610 kmemcheck_mark_uninitialized_pages(page, pages);
1611 else
1612 kmemcheck_mark_unallocated_pages(page, pages);
1613 }
1614
1615 page->objects = oo_objects(oo); 1596 page->objects = oo_objects(oo);
1616 1597
1617 order = compound_order(page); 1598 order = compound_order(page);
@@ -1687,8 +1668,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1687 check_object(s, page, p, SLUB_RED_INACTIVE); 1668 check_object(s, page, p, SLUB_RED_INACTIVE);
1688 } 1669 }
1689 1670
1690 kmemcheck_free_shadow(page, compound_order(page));
1691
1692 mod_lruvec_page_state(page, 1671 mod_lruvec_page_state(page,
1693 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1672 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1694 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1673 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -3477,7 +3456,7 @@ static void set_cpu_partial(struct kmem_cache *s)
3477 */ 3456 */
3478static int calculate_sizes(struct kmem_cache *s, int forced_order) 3457static int calculate_sizes(struct kmem_cache *s, int forced_order)
3479{ 3458{
3480 unsigned long flags = s->flags; 3459 slab_flags_t flags = s->flags;
3481 size_t size = s->object_size; 3460 size_t size = s->object_size;
3482 int order; 3461 int order;
3483 3462
@@ -3593,7 +3572,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3593 return !!oo_objects(s->oo); 3572 return !!oo_objects(s->oo);
3594} 3573}
3595 3574
3596static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3575static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
3597{ 3576{
3598 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3577 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3599 s->reserved = 0; 3578 s->reserved = 0;
@@ -3655,7 +3634,7 @@ error:
3655 if (flags & SLAB_PANIC) 3634 if (flags & SLAB_PANIC)
3656 panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", 3635 panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
3657 s->name, (unsigned long)s->size, s->size, 3636 s->name, (unsigned long)s->size, s->size,
3658 oo_order(s->oo), s->offset, flags); 3637 oo_order(s->oo), s->offset, (unsigned long)flags);
3659 return -EINVAL; 3638 return -EINVAL;
3660} 3639}
3661 3640
@@ -3792,7 +3771,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3792 struct page *page; 3771 struct page *page;
3793 void *ptr = NULL; 3772 void *ptr = NULL;
3794 3773
3795 flags |= __GFP_COMP | __GFP_NOTRACK; 3774 flags |= __GFP_COMP;
3796 page = alloc_pages_node(node, flags, get_order(size)); 3775 page = alloc_pages_node(node, flags, get_order(size));
3797 if (page) 3776 if (page)
3798 ptr = page_address(page); 3777 ptr = page_address(page);
@@ -4245,7 +4224,7 @@ void __init kmem_cache_init_late(void)
4245 4224
4246struct kmem_cache * 4225struct kmem_cache *
4247__kmem_cache_alias(const char *name, size_t size, size_t align, 4226__kmem_cache_alias(const char *name, size_t size, size_t align,
4248 unsigned long flags, void (*ctor)(void *)) 4227 slab_flags_t flags, void (*ctor)(void *))
4249{ 4228{
4250 struct kmem_cache *s, *c; 4229 struct kmem_cache *s, *c;
4251 4230
@@ -4275,7 +4254,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
4275 return s; 4254 return s;
4276} 4255}
4277 4256
4278int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) 4257int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
4279{ 4258{
4280 int err; 4259 int err;
4281 4260
@@ -5655,8 +5634,6 @@ static char *create_unique_id(struct kmem_cache *s)
5655 *p++ = 'a'; 5634 *p++ = 'a';
5656 if (s->flags & SLAB_CONSISTENCY_CHECKS) 5635 if (s->flags & SLAB_CONSISTENCY_CHECKS)
5657 *p++ = 'F'; 5636 *p++ = 'F';
5658 if (!(s->flags & SLAB_NOTRACK))
5659 *p++ = 't';
5660 if (s->flags & SLAB_ACCOUNT) 5637 if (s->flags & SLAB_ACCOUNT)
5661 *p++ = 'A'; 5638 *p++ = 'A';
5662 if (p != name + 1) 5639 if (p != name + 1)
@@ -5704,6 +5681,10 @@ static int sysfs_slab_add(struct kmem_cache *s)
5704 return 0; 5681 return 0;
5705 } 5682 }
5706 5683
5684 if (!unmergeable && disable_higher_order_debug &&
5685 (slub_debug & DEBUG_METADATA_FLAGS))
5686 unmergeable = 1;
5687
5707 if (unmergeable) { 5688 if (unmergeable) {
5708 /* 5689 /*
5709 * Slabcache can never be merged so we can use the name proper. 5690 * Slabcache can never be merged so we can use the name proper.
@@ -5852,7 +5833,7 @@ __initcall(slab_sysfs_init);
5852/* 5833/*
5853 * The /proc/slabinfo ABI 5834 * The /proc/slabinfo ABI
5854 */ 5835 */
5855#ifdef CONFIG_SLABINFO 5836#ifdef CONFIG_SLUB_DEBUG
5856void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5837void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5857{ 5838{
5858 unsigned long nr_slabs = 0; 5839 unsigned long nr_slabs = 0;
@@ -5884,4 +5865,4 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5884{ 5865{
5885 return -EIO; 5866 return -EIO;
5886} 5867}
5887#endif /* CONFIG_SLABINFO */ 5868#endif /* CONFIG_SLUB_DEBUG */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 478ce6d4a2c4..17acf01791fa 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
42 unsigned long align, 42 unsigned long align,
43 unsigned long goal) 43 unsigned long goal)
44{ 44{
45 return memblock_virt_alloc_try_nid(size, align, goal, 45 return memblock_virt_alloc_try_nid_raw(size, align, goal,
46 BOOTMEM_ALLOC_ACCESSIBLE, node); 46 BOOTMEM_ALLOC_ACCESSIBLE, node);
47} 47}
48 48
@@ -53,13 +53,20 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
53{ 53{
54 /* If the main allocator is up use that, fallback to bootmem. */ 54 /* If the main allocator is up use that, fallback to bootmem. */
55 if (slab_is_available()) { 55 if (slab_is_available()) {
56 gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
57 int order = get_order(size);
58 static bool warned;
56 struct page *page; 59 struct page *page;
57 60
58 page = alloc_pages_node(node, 61 page = alloc_pages_node(node, gfp_mask, order);
59 GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
60 get_order(size));
61 if (page) 62 if (page)
62 return page_address(page); 63 return page_address(page);
64
65 if (!warned) {
66 warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
67 "vmemmap alloc failure: order:%u", order);
68 warned = true;
69 }
63 return NULL; 70 return NULL;
64 } else 71 } else
65 return __earlyonly_bootmem_alloc(node, size, size, 72 return __earlyonly_bootmem_alloc(node, size, size,
@@ -180,11 +187,22 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
180 return pte; 187 return pte;
181} 188}
182 189
190static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
191{
192 void *p = vmemmap_alloc_block(size, node);
193
194 if (!p)
195 return NULL;
196 memset(p, 0, size);
197
198 return p;
199}
200
183pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) 201pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
184{ 202{
185 pmd_t *pmd = pmd_offset(pud, addr); 203 pmd_t *pmd = pmd_offset(pud, addr);
186 if (pmd_none(*pmd)) { 204 if (pmd_none(*pmd)) {
187 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 205 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
188 if (!p) 206 if (!p)
189 return NULL; 207 return NULL;
190 pmd_populate_kernel(&init_mm, pmd, p); 208 pmd_populate_kernel(&init_mm, pmd, p);
@@ -196,7 +214,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
196{ 214{
197 pud_t *pud = pud_offset(p4d, addr); 215 pud_t *pud = pud_offset(p4d, addr);
198 if (pud_none(*pud)) { 216 if (pud_none(*pud)) {
199 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 217 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
200 if (!p) 218 if (!p)
201 return NULL; 219 return NULL;
202 pud_populate(&init_mm, pud, p); 220 pud_populate(&init_mm, pud, p);
@@ -208,7 +226,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
208{ 226{
209 p4d_t *p4d = p4d_offset(pgd, addr); 227 p4d_t *p4d = p4d_offset(pgd, addr);
210 if (p4d_none(*p4d)) { 228 if (p4d_none(*p4d)) {
211 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 229 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
212 if (!p) 230 if (!p)
213 return NULL; 231 return NULL;
214 p4d_populate(&init_mm, p4d, p); 232 p4d_populate(&init_mm, p4d, p);
@@ -220,7 +238,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
220{ 238{
221 pgd_t *pgd = pgd_offset_k(addr); 239 pgd_t *pgd = pgd_offset_k(addr);
222 if (pgd_none(*pgd)) { 240 if (pgd_none(*pgd)) {
223 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 241 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
224 if (!p) 242 if (!p)
225 return NULL; 243 return NULL;
226 pgd_populate(&init_mm, pgd, p); 244 pgd_populate(&init_mm, pgd, p);
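
The hunks above stop zeroing in the common allocation paths (memblock_virt_alloc_try_nid_raw and a gfp mask without __GFP_ZERO) and route the page-table populate helpers through the new vmemmap_alloc_block_zero() wrapper, so only callers that actually need cleared memory pay for the memset. A minimal userspace analogue of that split, purely for illustration and not kernel code:

#include <stdlib.h>
#include <string.h>

/* Illustrative stand-ins: the raw allocator does not clear memory;
 * callers that need zeroed blocks go through the wrapper instead. */
static void *alloc_block(size_t size)
{
	return malloc(size);		/* plays the role of vmemmap_alloc_block() */
}

static void *alloc_block_zero(size_t size)
{
	void *p = alloc_block(size);

	if (!p)
		return NULL;
	memset(p, 0, size);
	return p;
}
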
diff --git a/mm/sparse.c b/mm/sparse.c
index 60805abf98af..7a5dacaa06e3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -453,9 +453,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
453 } 453 }
454 454
455 size = PAGE_ALIGN(size); 455 size = PAGE_ALIGN(size);
456 map = memblock_virt_alloc_try_nid(size * map_count, 456 map = memblock_virt_alloc_try_nid_raw(size * map_count,
457 PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 457 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
458 BOOTMEM_ALLOC_ACCESSIBLE, nodeid); 458 BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
459 if (map) { 459 if (map) {
460 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 460 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
461 if (!present_section_nr(pnum)) 461 if (!present_section_nr(pnum))
diff --git a/mm/swap.c b/mm/swap.c
index a77d68f2c1b6..38e1b6374a97 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page)
76static void __put_single_page(struct page *page) 76static void __put_single_page(struct page *page)
77{ 77{
78 __page_cache_release(page); 78 __page_cache_release(page);
79 free_hot_cold_page(page, false); 79 free_unref_page(page);
80} 80}
81 81
82static void __put_compound_page(struct page *page) 82static void __put_compound_page(struct page *page)
@@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
210 } 210 }
211 if (pgdat) 211 if (pgdat)
212 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 212 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
213 release_pages(pvec->pages, pvec->nr, pvec->cold); 213 release_pages(pvec->pages, pvec->nr);
214 pagevec_reinit(pvec); 214 pagevec_reinit(pvec);
215} 215}
216 216
@@ -740,7 +740,7 @@ void lru_add_drain_all(void)
740 * Decrement the reference count on all the pages in @pages. If it 740 * Decrement the reference count on all the pages in @pages. If it
741 * fell to zero, remove the page from the LRU and free it. 741 * fell to zero, remove the page from the LRU and free it.
742 */ 742 */
743void release_pages(struct page **pages, int nr, bool cold) 743void release_pages(struct page **pages, int nr)
744{ 744{
745 int i; 745 int i;
746 LIST_HEAD(pages_to_free); 746 LIST_HEAD(pages_to_free);
@@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold)
817 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); 817 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
818 818
819 mem_cgroup_uncharge_list(&pages_to_free); 819 mem_cgroup_uncharge_list(&pages_to_free);
820 free_hot_cold_page_list(&pages_to_free, cold); 820 free_unref_page_list(&pages_to_free);
821} 821}
822EXPORT_SYMBOL(release_pages); 822EXPORT_SYMBOL(release_pages);
823 823
@@ -833,8 +833,11 @@ EXPORT_SYMBOL(release_pages);
833 */ 833 */
834void __pagevec_release(struct pagevec *pvec) 834void __pagevec_release(struct pagevec *pvec)
835{ 835{
836 lru_add_drain(); 836 if (!pvec->percpu_pvec_drained) {
837 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); 837 lru_add_drain();
838 pvec->percpu_pvec_drained = true;
839 }
840 release_pages(pvec->pages, pagevec_count(pvec));
838 pagevec_reinit(pvec); 841 pagevec_reinit(pvec);
839} 842}
840EXPORT_SYMBOL(__pagevec_release); 843EXPORT_SYMBOL(__pagevec_release);
@@ -986,15 +989,25 @@ unsigned pagevec_lookup_range(struct pagevec *pvec,
986} 989}
987EXPORT_SYMBOL(pagevec_lookup_range); 990EXPORT_SYMBOL(pagevec_lookup_range);
988 991
989unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 992unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
990 pgoff_t *index, int tag, unsigned nr_pages) 993 struct address_space *mapping, pgoff_t *index, pgoff_t end,
994 int tag)
991{ 995{
992 pvec->nr = find_get_pages_tag(mapping, index, tag, 996 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
993 nr_pages, pvec->pages); 997 PAGEVEC_SIZE, pvec->pages);
994 return pagevec_count(pvec); 998 return pagevec_count(pvec);
995} 999}
996EXPORT_SYMBOL(pagevec_lookup_tag); 1000EXPORT_SYMBOL(pagevec_lookup_range_tag);
997 1001
1002unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
1003 struct address_space *mapping, pgoff_t *index, pgoff_t end,
1004 int tag, unsigned max_pages)
1005{
1006 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1007 min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
1008 return pagevec_count(pvec);
1009}
1010EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
998/* 1011/*
999 * Perform any setup for the swap system 1012 * Perform any setup for the swap system
1000 */ 1013 */
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index d81cfc5a43d5..bebc19292018 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -149,6 +149,13 @@ static int alloc_swap_slot_cache(unsigned int cpu)
149 cache->nr = 0; 149 cache->nr = 0;
150 cache->cur = 0; 150 cache->cur = 0;
151 cache->n_ret = 0; 151 cache->n_ret = 0;
152 /*
153 * We initialized alloc_lock and free_lock earlier. We use
154 * !cache->slots or !cache->slots_ret to know if it is safe to acquire
155 * the corresponding lock and use the cache. Memory barrier below
156 * ensures the assumption.
157 */
158 mb();
152 cache->slots = slots; 159 cache->slots = slots;
153 slots = NULL; 160 slots = NULL;
154 cache->slots_ret = slots_ret; 161 cache->slots_ret = slots_ret;
@@ -275,7 +282,7 @@ int free_swap_slot(swp_entry_t entry)
275 struct swap_slots_cache *cache; 282 struct swap_slots_cache *cache;
276 283
277 cache = raw_cpu_ptr(&swp_slots); 284 cache = raw_cpu_ptr(&swp_slots);
278 if (use_swap_slot_cache && cache->slots_ret) { 285 if (likely(use_swap_slot_cache && cache->slots_ret)) {
279 spin_lock_irq(&cache->free_lock); 286 spin_lock_irq(&cache->free_lock);
280 /* Swap slots cache may be deactivated before acquiring lock */ 287 /* Swap slots cache may be deactivated before acquiring lock */
281 if (!use_swap_slot_cache || !cache->slots_ret) { 288 if (!use_swap_slot_cache || !cache->slots_ret) {
@@ -326,7 +333,7 @@ swp_entry_t get_swap_page(struct page *page)
326 */ 333 */
327 cache = raw_cpu_ptr(&swp_slots); 334 cache = raw_cpu_ptr(&swp_slots);
328 335
329 if (check_cache_active()) { 336 if (likely(check_cache_active() && cache->slots)) {
330 mutex_lock(&cache->alloc_lock); 337 mutex_lock(&cache->alloc_lock);
331 if (cache->slots) { 338 if (cache->slots) {
332repeat: 339repeat:
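
The mb() added above orders the initialisation of the cache fields before the cache->slots and cache->slots_ret stores that readers use as their "is this cache usable" test. A standalone C11 analogue of that publish/consume pairing, given only to illustrate the ordering requirement (the kernel side relies on mb() plus the locking described in the comment):

#include <stdatomic.h>
#include <stddef.h>

/* Illustrative only: publish a pointer after its guarded fields are
 * initialised; readers test the pointer with acquire ordering. */
struct cache {
	int nr, cur, n_ret;
	_Atomic(long *) slots;
};

static void publish(struct cache *c, long *slots)
{
	c->nr = 0;
	c->cur = 0;
	c->n_ret = 0;
	atomic_store_explicit(&c->slots, slots, memory_order_release);
}

static int cache_usable(struct cache *c)
{
	return atomic_load_explicit(&c->slots, memory_order_acquire) != NULL;
}
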
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 326439428daf..39ae7cfad90f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,9 +36,9 @@ static const struct address_space_operations swap_aops = {
36#endif 36#endif
37}; 37};
38 38
39struct address_space *swapper_spaces[MAX_SWAPFILES]; 39struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
40static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; 40static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
41bool swap_vma_readahead = true; 41bool swap_vma_readahead __read_mostly = true;
42 42
43#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) 43#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
44#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) 44#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
@@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
319 lru_add_drain(); 319 lru_add_drain();
320 for (i = 0; i < nr; i++) 320 for (i = 0; i < nr; i++)
321 free_swap_cache(pagep[i]); 321 free_swap_cache(pagep[i]);
322 release_pages(pagep, nr, false); 322 release_pages(pagep, nr);
323} 323}
324 324
325/* 325/*
@@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
559 unsigned long offset = entry_offset; 559 unsigned long offset = entry_offset;
560 unsigned long start_offset, end_offset; 560 unsigned long start_offset, end_offset;
561 unsigned long mask; 561 unsigned long mask;
562 struct swap_info_struct *si = swp_swap_info(entry);
562 struct blk_plug plug; 563 struct blk_plug plug;
563 bool do_poll = true, page_allocated; 564 bool do_poll = true, page_allocated;
564 565
@@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
572 end_offset = offset | mask; 573 end_offset = offset | mask;
573 if (!start_offset) /* First page is swap header. */ 574 if (!start_offset) /* First page is swap header. */
574 start_offset++; 575 start_offset++;
576 if (end_offset >= si->max)
577 end_offset = si->max - 1;
575 578
576 blk_start_plug(&plug); 579 blk_start_plug(&plug);
577 for (offset = start_offset; offset <= end_offset ; offset++) { 580 for (offset = start_offset; offset <= end_offset ; offset++) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e47a21e64764..3074b02eaa09 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page)
1328 return count; 1328 return count;
1329} 1329}
1330 1330
1331int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1332{
1333 pgoff_t offset = swp_offset(entry);
1334
1335 return swap_count(si->swap_map[offset]);
1336}
1337
1331static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) 1338static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1332{ 1339{
1333 int count = 0; 1340 int count = 0;
@@ -3169,6 +3176,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3169 if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) 3176 if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3170 p->flags |= SWP_STABLE_WRITES; 3177 p->flags |= SWP_STABLE_WRITES;
3171 3178
3179 if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3180 p->flags |= SWP_SYNCHRONOUS_IO;
3181
3172 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { 3182 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3173 int cpu; 3183 int cpu;
3174 unsigned long ci, nr_cluster; 3184 unsigned long ci, nr_cluster;
@@ -3452,10 +3462,15 @@ int swapcache_prepare(swp_entry_t entry)
3452 return __swap_duplicate(entry, SWAP_HAS_CACHE); 3462 return __swap_duplicate(entry, SWAP_HAS_CACHE);
3453} 3463}
3454 3464
3465struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3466{
3467 return swap_info[swp_type(entry)];
3468}
3469
3455struct swap_info_struct *page_swap_info(struct page *page) 3470struct swap_info_struct *page_swap_info(struct page *page)
3456{ 3471{
3457 swp_entry_t swap = { .val = page_private(page) }; 3472 swp_entry_t entry = { .val = page_private(page) };
3458 return swap_info[swp_type(swap)]; 3473 return swp_swap_info(entry);
3459} 3474}
3460 3475
3461/* 3476/*
@@ -3463,7 +3478,6 @@ struct swap_info_struct *page_swap_info(struct page *page)
3463 */ 3478 */
3464struct address_space *__page_file_mapping(struct page *page) 3479struct address_space *__page_file_mapping(struct page *page)
3465{ 3480{
3466 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
3467 return page_swap_info(page)->swap_file->f_mapping; 3481 return page_swap_info(page)->swap_file->f_mapping;
3468} 3482}
3469EXPORT_SYMBOL_GPL(__page_file_mapping); 3483EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -3471,7 +3485,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
3471pgoff_t __page_file_index(struct page *page) 3485pgoff_t __page_file_index(struct page *page)
3472{ 3486{
3473 swp_entry_t swap = { .val = page_private(page) }; 3487 swp_entry_t swap = { .val = page_private(page) };
3474 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
3475 return swp_offset(swap); 3488 return swp_offset(swap);
3476} 3489}
3477EXPORT_SYMBOL_GPL(__page_file_index); 3490EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/truncate.c b/mm/truncate.c
index 2330223841fb..e4b4cf0f4070 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -25,44 +25,85 @@
25#include <linux/rmap.h> 25#include <linux/rmap.h>
26#include "internal.h" 26#include "internal.h"
27 27
28static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, 28/*
29 void *entry) 29 * Regular page slots are stabilized by the page lock even without the tree
30 * itself locked. These unlocked entries need verification under the tree
31 * lock.
32 */
33static inline void __clear_shadow_entry(struct address_space *mapping,
34 pgoff_t index, void *entry)
30{ 35{
31 struct radix_tree_node *node; 36 struct radix_tree_node *node;
32 void **slot; 37 void **slot;
33 38
34 spin_lock_irq(&mapping->tree_lock);
35 /*
36 * Regular page slots are stabilized by the page lock even
37 * without the tree itself locked. These unlocked entries
38 * need verification under the tree lock.
39 */
40 if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) 39 if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
41 goto unlock; 40 return;
42 if (*slot != entry) 41 if (*slot != entry)
43 goto unlock; 42 return;
44 __radix_tree_replace(&mapping->page_tree, node, slot, NULL, 43 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
45 workingset_update_node, mapping); 44 workingset_update_node);
46 mapping->nrexceptional--; 45 mapping->nrexceptional--;
47unlock: 46}
47
48static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
49 void *entry)
50{
51 spin_lock_irq(&mapping->tree_lock);
52 __clear_shadow_entry(mapping, index, entry);
48 spin_unlock_irq(&mapping->tree_lock); 53 spin_unlock_irq(&mapping->tree_lock);
49} 54}
50 55
51/* 56/*
52 * Unconditionally remove exceptional entry. Usually called from truncate path. 57 * Unconditionally remove exceptional entries. Usually called from truncate
58 * path. Note that the pagevec may be altered by this function by removing
59 * exceptional entries similar to what pagevec_remove_exceptionals does.
53 */ 60 */
54static void truncate_exceptional_entry(struct address_space *mapping, 61static void truncate_exceptional_pvec_entries(struct address_space *mapping,
55 pgoff_t index, void *entry) 62 struct pagevec *pvec, pgoff_t *indices,
63 pgoff_t end)
56{ 64{
65 int i, j;
66 bool dax, lock;
67
57 /* Handled by shmem itself */ 68 /* Handled by shmem itself */
58 if (shmem_mapping(mapping)) 69 if (shmem_mapping(mapping))
59 return; 70 return;
60 71
61 if (dax_mapping(mapping)) { 72 for (j = 0; j < pagevec_count(pvec); j++)
62 dax_delete_mapping_entry(mapping, index); 73 if (radix_tree_exceptional_entry(pvec->pages[j]))
74 break;
75
76 if (j == pagevec_count(pvec))
63 return; 77 return;
78
79 dax = dax_mapping(mapping);
80 lock = !dax && indices[j] < end;
81 if (lock)
82 spin_lock_irq(&mapping->tree_lock);
83
84 for (i = j; i < pagevec_count(pvec); i++) {
85 struct page *page = pvec->pages[i];
86 pgoff_t index = indices[i];
87
88 if (!radix_tree_exceptional_entry(page)) {
89 pvec->pages[j++] = page;
90 continue;
91 }
92
93 if (index >= end)
94 continue;
95
96 if (unlikely(dax)) {
97 dax_delete_mapping_entry(mapping, index);
98 continue;
99 }
100
101 __clear_shadow_entry(mapping, index, page);
64 } 102 }
65 clear_shadow_entry(mapping, index, entry); 103
104 if (lock)
105 spin_unlock_irq(&mapping->tree_lock);
106 pvec->nr = j;
66} 107}
67 108
68/* 109/*
@@ -134,11 +175,17 @@ void do_invalidatepage(struct page *page, unsigned int offset,
134 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 175 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
135 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 176 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
136 */ 177 */
137static int 178static void
138truncate_complete_page(struct address_space *mapping, struct page *page) 179truncate_cleanup_page(struct address_space *mapping, struct page *page)
139{ 180{
140 if (page->mapping != mapping) 181 if (page_mapped(page)) {
141 return -EIO; 182 loff_t holelen;
183
184 holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
185 unmap_mapping_range(mapping,
186 (loff_t)page->index << PAGE_SHIFT,
187 holelen, 0);
188 }
142 189
143 if (page_has_private(page)) 190 if (page_has_private(page))
144 do_invalidatepage(page, 0, PAGE_SIZE); 191 do_invalidatepage(page, 0, PAGE_SIZE);
@@ -150,8 +197,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
150 */ 197 */
151 cancel_dirty_page(page); 198 cancel_dirty_page(page);
152 ClearPageMappedToDisk(page); 199 ClearPageMappedToDisk(page);
153 delete_from_page_cache(page);
154 return 0;
155} 200}
156 201
157/* 202/*
@@ -180,16 +225,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
180 225
181int truncate_inode_page(struct address_space *mapping, struct page *page) 226int truncate_inode_page(struct address_space *mapping, struct page *page)
182{ 227{
183 loff_t holelen;
184 VM_BUG_ON_PAGE(PageTail(page), page); 228 VM_BUG_ON_PAGE(PageTail(page), page);
185 229
186 holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; 230 if (page->mapping != mapping)
187 if (page_mapped(page)) { 231 return -EIO;
188 unmap_mapping_range(mapping, 232
189 (loff_t)page->index << PAGE_SHIFT, 233 truncate_cleanup_page(mapping, page);
190 holelen, 0); 234 delete_from_page_cache(page);
191 } 235 return 0;
192 return truncate_complete_page(mapping, page);
193} 236}
194 237
195/* 238/*
@@ -287,11 +330,19 @@ void truncate_inode_pages_range(struct address_space *mapping,
287 else 330 else
288 end = (lend + 1) >> PAGE_SHIFT; 331 end = (lend + 1) >> PAGE_SHIFT;
289 332
290 pagevec_init(&pvec, 0); 333 pagevec_init(&pvec);
291 index = start; 334 index = start;
292 while (index < end && pagevec_lookup_entries(&pvec, mapping, index, 335 while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
293 min(end - index, (pgoff_t)PAGEVEC_SIZE), 336 min(end - index, (pgoff_t)PAGEVEC_SIZE),
294 indices)) { 337 indices)) {
338 /*
339 * Pagevec array has exceptional entries and we may also fail
340 * to lock some pages. So we store pages that can be deleted
341 * in a new pagevec.
342 */
343 struct pagevec locked_pvec;
344
345 pagevec_init(&locked_pvec);
295 for (i = 0; i < pagevec_count(&pvec); i++) { 346 for (i = 0; i < pagevec_count(&pvec); i++) {
296 struct page *page = pvec.pages[i]; 347 struct page *page = pvec.pages[i];
297 348
@@ -300,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
300 if (index >= end) 351 if (index >= end)
301 break; 352 break;
302 353
303 if (radix_tree_exceptional_entry(page)) { 354 if (radix_tree_exceptional_entry(page))
304 truncate_exceptional_entry(mapping, index,
305 page);
306 continue; 355 continue;
307 }
308 356
309 if (!trylock_page(page)) 357 if (!trylock_page(page))
310 continue; 358 continue;
@@ -313,15 +361,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
313 unlock_page(page); 361 unlock_page(page);
314 continue; 362 continue;
315 } 363 }
316 truncate_inode_page(mapping, page); 364 if (page->mapping != mapping) {
317 unlock_page(page); 365 unlock_page(page);
366 continue;
367 }
368 pagevec_add(&locked_pvec, page);
318 } 369 }
319 pagevec_remove_exceptionals(&pvec); 370 for (i = 0; i < pagevec_count(&locked_pvec); i++)
371 truncate_cleanup_page(mapping, locked_pvec.pages[i]);
372 delete_from_page_cache_batch(mapping, &locked_pvec);
373 for (i = 0; i < pagevec_count(&locked_pvec); i++)
374 unlock_page(locked_pvec.pages[i]);
375 truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
320 pagevec_release(&pvec); 376 pagevec_release(&pvec);
321 cond_resched(); 377 cond_resched();
322 index++; 378 index++;
323 } 379 }
324
325 if (partial_start) { 380 if (partial_start) {
326 struct page *page = find_lock_page(mapping, start - 1); 381 struct page *page = find_lock_page(mapping, start - 1);
327 if (page) { 382 if (page) {
@@ -379,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
379 pagevec_release(&pvec); 434 pagevec_release(&pvec);
380 break; 435 break;
381 } 436 }
437
382 for (i = 0; i < pagevec_count(&pvec); i++) { 438 for (i = 0; i < pagevec_count(&pvec); i++) {
383 struct page *page = pvec.pages[i]; 439 struct page *page = pvec.pages[i];
384 440
@@ -390,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
390 break; 446 break;
391 } 447 }
392 448
393 if (radix_tree_exceptional_entry(page)) { 449 if (radix_tree_exceptional_entry(page))
394 truncate_exceptional_entry(mapping, index,
395 page);
396 continue; 450 continue;
397 }
398 451
399 lock_page(page); 452 lock_page(page);
400 WARN_ON(page_to_index(page) != index); 453 WARN_ON(page_to_index(page) != index);
@@ -402,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
402 truncate_inode_page(mapping, page); 455 truncate_inode_page(mapping, page);
403 unlock_page(page); 456 unlock_page(page);
404 } 457 }
405 pagevec_remove_exceptionals(&pvec); 458 truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
406 pagevec_release(&pvec); 459 pagevec_release(&pvec);
407 index++; 460 index++;
408 } 461 }
@@ -500,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
500 unsigned long count = 0; 553 unsigned long count = 0;
501 int i; 554 int i;
502 555
503 pagevec_init(&pvec, 0); 556 pagevec_init(&pvec);
504 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, 557 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
505 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 558 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
506 indices)) { 559 indices)) {
@@ -630,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
630 if (mapping->nrpages == 0 && mapping->nrexceptional == 0) 683 if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
631 goto out; 684 goto out;
632 685
633 pagevec_init(&pvec, 0); 686 pagevec_init(&pvec);
634 index = start; 687 index = start;
635 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, 688 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
636 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 689 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 15b483ef6440..c02c850ea349 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1349,7 +1349,7 @@ keep:
1349 1349
1350 mem_cgroup_uncharge_list(&free_pages); 1350 mem_cgroup_uncharge_list(&free_pages);
1351 try_to_unmap_flush(); 1351 try_to_unmap_flush();
1352 free_hot_cold_page_list(&free_pages, true); 1352 free_unref_page_list(&free_pages);
1353 1353
1354 list_splice(&ret_pages, page_list); 1354 list_splice(&ret_pages, page_list);
1355 count_vm_events(PGACTIVATE, pgactivate); 1355 count_vm_events(PGACTIVATE, pgactivate);
@@ -1824,7 +1824,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1824 spin_unlock_irq(&pgdat->lru_lock); 1824 spin_unlock_irq(&pgdat->lru_lock);
1825 1825
1826 mem_cgroup_uncharge_list(&page_list); 1826 mem_cgroup_uncharge_list(&page_list);
1827 free_hot_cold_page_list(&page_list, true); 1827 free_unref_page_list(&page_list);
1828 1828
1829 /* 1829 /*
1830 * If reclaim is isolating dirty pages under writeback, it implies 1830 * If reclaim is isolating dirty pages under writeback, it implies
@@ -2063,7 +2063,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
2063 spin_unlock_irq(&pgdat->lru_lock); 2063 spin_unlock_irq(&pgdat->lru_lock);
2064 2064
2065 mem_cgroup_uncharge_list(&l_hold); 2065 mem_cgroup_uncharge_list(&l_hold);
2066 free_hot_cold_page_list(&l_hold, true); 2066 free_unref_page_list(&l_hold);
2067 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2067 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2068 nr_deactivate, nr_rotated, sc->priority, file); 2068 nr_deactivate, nr_rotated, sc->priority, file);
2069} 2069}
@@ -2082,7 +2082,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
2082 * If that fails and refaulting is observed, the inactive list grows. 2082 * If that fails and refaulting is observed, the inactive list grows.
2083 * 2083 *
2084 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages 2084 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
2085 * on this LRU, maintained by the pageout code. A zone->inactive_ratio 2085 * on this LRU, maintained by the pageout code. An inactive_ratio
2086 * of 3 means 3:1 or 25% of the pages are kept on the inactive list. 2086 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
2087 * 2087 *
2088 * total target max 2088 * total target max
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e72ac97..40b2db6db6b1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -32,6 +32,77 @@
32 32
33#define NUMA_STATS_THRESHOLD (U16_MAX - 2) 33#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
34 34
35#ifdef CONFIG_NUMA
36int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
37
38/* zero numa counters within a zone */
39static void zero_zone_numa_counters(struct zone *zone)
40{
41 int item, cpu;
42
43 for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
44 atomic_long_set(&zone->vm_numa_stat[item], 0);
45 for_each_online_cpu(cpu)
46 per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
47 = 0;
48 }
49}
50
51/* zero numa counters of all the populated zones */
52static void zero_zones_numa_counters(void)
53{
54 struct zone *zone;
55
56 for_each_populated_zone(zone)
57 zero_zone_numa_counters(zone);
58}
59
60/* zero global numa counters */
61static void zero_global_numa_counters(void)
62{
63 int item;
64
65 for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
66 atomic_long_set(&vm_numa_stat[item], 0);
67}
68
69static void invalid_numa_statistics(void)
70{
71 zero_zones_numa_counters();
72 zero_global_numa_counters();
73}
74
75static DEFINE_MUTEX(vm_numa_stat_lock);
76
77int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
78 void __user *buffer, size_t *length, loff_t *ppos)
79{
80 int ret, oldval;
81
82 mutex_lock(&vm_numa_stat_lock);
83 if (write)
84 oldval = sysctl_vm_numa_stat;
85 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
86 if (ret || !write)
87 goto out;
88
89 if (oldval == sysctl_vm_numa_stat)
90 goto out;
91 else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
92 static_branch_enable(&vm_numa_stat_key);
93 pr_info("enable numa statistics\n");
94 } else {
95 static_branch_disable(&vm_numa_stat_key);
96 invalid_numa_statistics();
97 pr_info("disable numa statistics, and clear numa counters\n");
98 }
99
100out:
101 mutex_unlock(&vm_numa_stat_lock);
102 return ret;
103}
104#endif
105
35#ifdef CONFIG_VM_EVENT_COUNTERS 106#ifdef CONFIG_VM_EVENT_COUNTERS
36DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 107DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
37EXPORT_PER_CPU_SYMBOL(vm_event_states); 108EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -1564,11 +1635,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1564 } 1635 }
1565 seq_printf(m, 1636 seq_printf(m,
1566 "\n node_unreclaimable: %u" 1637 "\n node_unreclaimable: %u"
1567 "\n start_pfn: %lu" 1638 "\n start_pfn: %lu",
1568 "\n node_inactive_ratio: %u",
1569 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, 1639 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1570 zone->zone_start_pfn, 1640 zone->zone_start_pfn);
1571 zone->zone_pgdat->inactive_ratio);
1572 seq_putc(m, '\n'); 1641 seq_putc(m, '\n');
1573} 1642}
1574 1643
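
sysctl_vm_numa_stat_handler() above flips a static branch and, when the statistics are switched off, zeroes both the per-zone and the global NUMA counters. Assuming the knob is exposed as /proc/sys/vm/numa_stat (the usual mapping for a vm_table entry; the table hookup is not in this hunk), it can be toggled from userspace with a hypothetical snippet like this:

#include <stdio.h>

/* Hypothetical example: disable NUMA statistics, which also clears the
 * counters per the handler above; write "1" to re-enable them. */
int main(void)
{
	FILE *f = fopen("/proc/sys/vm/numa_stat", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("0\n", f);
	return fclose(f) ? 1 : 0;
}
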
diff --git a/mm/workingset.c b/mm/workingset.c
index b997c9de28f6..b7d616a3bbbe 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -340,14 +340,8 @@ out:
340 340
341static struct list_lru shadow_nodes; 341static struct list_lru shadow_nodes;
342 342
343void workingset_update_node(struct radix_tree_node *node, void *private) 343void workingset_update_node(struct radix_tree_node *node)
344{ 344{
345 struct address_space *mapping = private;
346
347 /* Only regular page cache has shadow entries */
348 if (dax_mapping(mapping) || shmem_mapping(mapping))
349 return;
350
351 /* 345 /*
352 * Track non-empty nodes that contain only shadow entries; 346 * Track non-empty nodes that contain only shadow entries;
353 * unlink those that contain pages or are being freed. 347 * unlink those that contain pages or are being freed.
@@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
475 goto out_invalid; 469 goto out_invalid;
476 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); 470 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
477 __radix_tree_delete_node(&mapping->page_tree, node, 471 __radix_tree_delete_node(&mapping->page_tree, node,
478 workingset_update_node, mapping); 472 workingset_lookup_update(mapping));
479 473
480out_invalid: 474out_invalid:
481 spin_unlock(&mapping->tree_lock); 475 spin_unlock(&mapping->tree_lock);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7c38e850a8fc..685049a9048d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1349,7 +1349,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1349 * pools/users, we can't allow mapping in interrupt context 1349 * pools/users, we can't allow mapping in interrupt context
1350 * because it can corrupt another users mappings. 1350 * because it can corrupt another users mappings.
1351 */ 1351 */
1352 WARN_ON_ONCE(in_interrupt()); 1352 BUG_ON(in_interrupt());
1353 1353
1354 /* From now on, migration cannot move the object */ 1354 /* From now on, migration cannot move the object */
1355 pin_tag(handle); 1355 pin_tag(handle);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8134c00df6c2..6b0ff396fa9d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -41,7 +41,6 @@
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/types.h> 42#include <linux/types.h>
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/kmemcheck.h>
45#include <linux/mm.h> 44#include <linux/mm.h>
46#include <linux/interrupt.h> 45#include <linux/interrupt.h>
47#include <linux/in.h> 46#include <linux/in.h>
@@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
234 shinfo = skb_shinfo(skb); 233 shinfo = skb_shinfo(skb);
235 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 234 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
236 atomic_set(&shinfo->dataref, 1); 235 atomic_set(&shinfo->dataref, 1);
237 kmemcheck_annotate_variable(shinfo->destructor_arg);
238 236
239 if (flags & SKB_ALLOC_FCLONE) { 237 if (flags & SKB_ALLOC_FCLONE) {
240 struct sk_buff_fclones *fclones; 238 struct sk_buff_fclones *fclones;
241 239
242 fclones = container_of(skb, struct sk_buff_fclones, skb1); 240 fclones = container_of(skb, struct sk_buff_fclones, skb1);
243 241
244 kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
245 skb->fclone = SKB_FCLONE_ORIG; 242 skb->fclone = SKB_FCLONE_ORIG;
246 refcount_set(&fclones->fclone_ref, 1); 243 refcount_set(&fclones->fclone_ref, 1);
247 244
@@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
301 shinfo = skb_shinfo(skb); 298 shinfo = skb_shinfo(skb);
302 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 299 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
303 atomic_set(&shinfo->dataref, 1); 300 atomic_set(&shinfo->dataref, 1);
304 kmemcheck_annotate_variable(shinfo->destructor_arg);
305 301
306 return skb; 302 return skb;
307} 303}
@@ -357,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
357 */ 353 */
358void *netdev_alloc_frag(unsigned int fragsz) 354void *netdev_alloc_frag(unsigned int fragsz)
359{ 355{
360 return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); 356 return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
361} 357}
362EXPORT_SYMBOL(netdev_alloc_frag); 358EXPORT_SYMBOL(netdev_alloc_frag);
363 359
@@ -370,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
370 366
371void *napi_alloc_frag(unsigned int fragsz) 367void *napi_alloc_frag(unsigned int fragsz)
372{ 368{
373 return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); 369 return __napi_alloc_frag(fragsz, GFP_ATOMIC);
374} 370}
375EXPORT_SYMBOL(napi_alloc_frag); 371EXPORT_SYMBOL(napi_alloc_frag);
376 372
@@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
1283 if (!n) 1279 if (!n)
1284 return NULL; 1280 return NULL;
1285 1281
1286 kmemcheck_annotate_bitfield(n, flags1);
1287 n->fclone = SKB_FCLONE_UNAVAILABLE; 1282 n->fclone = SKB_FCLONE_UNAVAILABLE;
1288 } 1283 }
1289 1284
diff --git a/net/core/sock.c b/net/core/sock.c
index 13719af7b4e3..c0b5b2f17412 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1469 sk = kmalloc(prot->obj_size, priority); 1469 sk = kmalloc(prot->obj_size, priority);
1470 1470
1471 if (sk != NULL) { 1471 if (sk != NULL) {
1472 kmemcheck_annotate_bitfield(sk, flags);
1473
1474 if (security_sk_alloc(sk, family, priority)) 1472 if (security_sk_alloc(sk, family, priority))
1475 goto out_free; 1473 goto out_free;
1476 1474
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a4bab81f1462..c690cd0d9b3f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,7 +9,6 @@
9 */ 9 */
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/kmemcheck.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <net/inet_hashtables.h> 14#include <net/inet_hashtables.h>
@@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
167 if (tw) { 166 if (tw) {
168 const struct inet_sock *inet = inet_sk(sk); 167 const struct inet_sock *inet = inet_sk(sk);
169 168
170 kmemcheck_annotate_bitfield(tw, flags);
171
172 tw->tw_dr = dr; 169 tw->tw_dr = dr;
173 /* Give us an identity. */ 170 /* Give us an identity. */
174 tw->tw_daddr = inet->inet_daddr; 171 tw->tw_daddr = inet->inet_daddr;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dabbf1d392fb..f844c06c0676 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6130,7 +6130,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
6130 if (req) { 6130 if (req) {
6131 struct inet_request_sock *ireq = inet_rsk(req); 6131 struct inet_request_sock *ireq = inet_rsk(req);
6132 6132
6133 kmemcheck_annotate_bitfield(ireq, flags);
6134 ireq->ireq_opt = NULL; 6133 ireq->ireq_opt = NULL;
6135#if IS_ENABLED(CONFIG_IPV6) 6134#if IS_ENABLED(CONFIG_IPV6)
6136 ireq->pktopts = NULL; 6135 ireq->pktopts = NULL;
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
index 86ef907067bb..e0f70c4051b6 100644
--- a/net/rds/ib_fmr.c
+++ b/net/rds/ib_fmr.c
@@ -139,8 +139,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev,
139 return -EINVAL; 139 return -EINVAL;
140 } 140 }
141 141
142 dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, 142 dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC,
143 rdsibdev_to_node(rds_ibdev)); 143 rdsibdev_to_node(rds_ibdev));
144 if (!dma_pages) { 144 if (!dma_pages) {
145 ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); 145 ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
146 return -ENOMEM; 146 return -ENOMEM;
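
The conversion above replaces an open-coded sizeof(u64) * page_cnt with kmalloc_array_node(), whose point is that the element-count multiplication is overflow-checked before the node-local allocation. A userspace sketch of the same guard, assuming the GCC/Clang __builtin_mul_overflow() builtin:

#include <stdlib.h>

/* Illustration only: refuse the allocation if n * size would overflow. */
static void *alloc_array(size_t n, size_t size)
{
	size_t bytes;

	if (__builtin_mul_overflow(n, size, &bytes))
		return NULL;
	return malloc(bytes);
}
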
diff --git a/net/socket.c b/net/socket.c
index c729625eb5d3..42d8e9c9ccd5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -568,7 +568,6 @@ struct socket *sock_alloc(void)
568 568
569 sock = SOCKET_I(inode); 569 sock = SOCKET_I(inode);
570 570
571 kmemcheck_annotate_bitfield(sock, type);
572 inode->i_ino = get_next_ino(); 571 inode->i_ino = get_next_ino();
573 inode->i_mode = S_IFSOCK | S_IRWXUGO; 572 inode->i_mode = S_IFSOCK | S_IRWXUGO;
574 inode->i_uid = current_fsuid(); 573 inode->i_uid = current_fsuid();
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index a27677146410..6f099f915dcf 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -12,18 +12,22 @@ from signal import signal, SIGPIPE, SIG_DFL
12 12
13signal(SIGPIPE, SIG_DFL) 13signal(SIGPIPE, SIG_DFL)
14 14
15if len(sys.argv) != 3: 15if len(sys.argv) < 3:
16 sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0]) 16 sys.stderr.write("usage: %s [option] file1 file2\n" % sys.argv[0])
17 sys.stderr.write("The options are:\n")
18 sys.stderr.write("-c cateogrize output based on symbole type\n")
19 sys.stderr.write("-d Show delta of Data Section\n")
20 sys.stderr.write("-t Show delta of text Section\n")
17 sys.exit(-1) 21 sys.exit(-1)
18 22
19re_NUMBER = re.compile(r'\.[0-9]+') 23re_NUMBER = re.compile(r'\.[0-9]+')
20 24
21def getsizes(file): 25def getsizes(file, format):
22 sym = {} 26 sym = {}
23 with os.popen("nm --size-sort " + file) as f: 27 with os.popen("nm --size-sort " + file) as f:
24 for line in f: 28 for line in f:
25 size, type, name = line.split() 29 size, type, name = line.split()
26 if type in "tTdDbBrR": 30 if type in format:
27 # strip generated symbols 31 # strip generated symbols
28 if name.startswith("__mod_"): continue 32 if name.startswith("__mod_"): continue
29 if name.startswith("SyS_"): continue 33 if name.startswith("SyS_"): continue
@@ -34,44 +38,61 @@ def getsizes(file):
34 sym[name] = sym.get(name, 0) + int(size, 16) 38 sym[name] = sym.get(name, 0) + int(size, 16)
35 return sym 39 return sym
36 40
37old = getsizes(sys.argv[1]) 41def calc(oldfile, newfile, format):
38new = getsizes(sys.argv[2]) 42 old = getsizes(oldfile, format)
39grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0 43 new = getsizes(newfile, format)
40delta, common = [], {} 44 grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0
41otot, ntot = 0, 0 45 delta, common = [], {}
46 otot, ntot = 0, 0
42 47
43for a in old: 48 for a in old:
44 if a in new: 49 if a in new:
45 common[a] = 1 50 common[a] = 1
46 51
47for name in old: 52 for name in old:
48 otot += old[name] 53 otot += old[name]
49 if name not in common: 54 if name not in common:
50 remove += 1 55 remove += 1
51 down += old[name] 56 down += old[name]
52 delta.append((-old[name], name)) 57 delta.append((-old[name], name))
53 58
54for name in new: 59 for name in new:
55 ntot += new[name] 60 ntot += new[name]
56 if name not in common: 61 if name not in common:
57 add += 1 62 add += 1
58 up += new[name] 63 up += new[name]
59 delta.append((new[name], name)) 64 delta.append((new[name], name))
60 65
61for name in common: 66 for name in common:
62 d = new.get(name, 0) - old.get(name, 0) 67 d = new.get(name, 0) - old.get(name, 0)
63 if d>0: grow, up = grow+1, up+d 68 if d>0: grow, up = grow+1, up+d
64 if d<0: shrink, down = shrink+1, down-d 69 if d<0: shrink, down = shrink+1, down-d
65 delta.append((d, name)) 70 delta.append((d, name))
66 71
67delta.sort() 72 delta.sort()
68delta.reverse() 73 delta.reverse()
74 return grow, shrink, add, remove, up, down, delta, old, new, otot, ntot
69 75
70print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ 76def print_result(symboltype, symbolformat, argc):
71 (add, remove, grow, shrink, up, -down, up-down)) 77 grow, shrink, add, remove, up, down, delta, old, new, otot, ntot = \
72print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta")) 78 calc(sys.argv[argc - 1], sys.argv[argc], symbolformat)
73for d, n in delta:
74 if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
75 79
76print("Total: Before=%d, After=%d, chg %+.2f%%" % \ 80 print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \
77 (otot, ntot, (ntot - otot)*100.0/otot)) 81 (add, remove, grow, shrink, up, -down, up-down))
82 print("%-40s %7s %7s %+7s" % (symboltype, "old", "new", "delta"))
83 for d, n in delta:
84 if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
85
86 print("Total: Before=%d, After=%d, chg %+.2f%%" % \
87 (otot, ntot, (ntot - otot)*100.0/otot))
88
89if sys.argv[1] == "-c":
90 print_result("Function", "tT", 3)
91 print_result("Data", "dDbB", 3)
92 print_result("RO Data", "rR", 3)
93elif sys.argv[1] == "-d":
94 print_result("Data", "dDbBrR", 3)
95elif sys.argv[1] == "-t":
96 print_result("Function", "tT", 3)
97else:
98 print_result("Function", "tTdDbBrR", 2)
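
With this change bloat-o-meter takes an optional mode flag ahead of the two object files: -t compares only text symbols, -d only data (including BSS and read-only data), and -c prints separate Function, Data and RO Data tables; with no flag it behaves as before. For example (file names are placeholders), scripts/bloat-o-meter vmlinux.old vmlinux.new gives the combined report, while scripts/bloat-o-meter -c vmlinux.old vmlinux.new splits it by symbol category.
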
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 67d051edd615..7bd52b8f63d4 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -2182,8 +2182,6 @@ sub dump_struct($$) {
2182 # strip comments: 2182 # strip comments:
2183 $members =~ s/\/\*.*?\*\///gos; 2183 $members =~ s/\/\*.*?\*\///gos;
2184 $nested =~ s/\/\*.*?\*\///gos; 2184 $nested =~ s/\/\*.*?\*\///gos;
2185 # strip kmemcheck_bitfield_{begin,end}.*;
2186 $members =~ s/kmemcheck_bitfield_.*?;//gos;
2187 # strip attributes 2185 # strip attributes
2188 $members =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i; 2186 $members =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i;
2189 $members =~ s/__aligned\s*\([^;]*\)//gos; 2187 $members =~ s/__aligned\s*\([^;]*\)//gos;
diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h
index 2bccd2c7b897..ea32a7d3cf1b 100644
--- a/tools/include/linux/kmemcheck.h
+++ b/tools/include/linux/kmemcheck.h
@@ -1,9 +1 @@
1/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_
3#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_
4
5static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
6{
7}
8
9#endif
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 557d391f564a..ae11e4c3516a 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -641,7 +641,6 @@ static const struct {
641 { "__GFP_ATOMIC", "_A" }, 641 { "__GFP_ATOMIC", "_A" },
642 { "__GFP_IO", "I" }, 642 { "__GFP_IO", "I" },
643 { "__GFP_FS", "F" }, 643 { "__GFP_FS", "F" },
644 { "__GFP_COLD", "CO" },
645 { "__GFP_NOWARN", "NWR" }, 644 { "__GFP_NOWARN", "NWR" },
646 { "__GFP_RETRY_MAYFAIL", "R" }, 645 { "__GFP_RETRY_MAYFAIL", "R" },
647 { "__GFP_NOFAIL", "NF" }, 646 { "__GFP_NOFAIL", "NF" },
@@ -655,7 +654,6 @@ static const struct {
655 { "__GFP_RECLAIMABLE", "RC" }, 654 { "__GFP_RECLAIMABLE", "RC" },
656 { "__GFP_MOVABLE", "M" }, 655 { "__GFP_MOVABLE", "M" },
657 { "__GFP_ACCOUNT", "AC" }, 656 { "__GFP_ACCOUNT", "AC" },
658 { "__GFP_NOTRACK", "NT" },
659 { "__GFP_WRITE", "WR" }, 657 { "__GFP_WRITE", "WR" },
660 { "__GFP_RECLAIM", "R" }, 658 { "__GFP_RECLAIM", "R" },
661 { "__GFP_DIRECT_RECLAIM", "DR" }, 659 { "__GFP_DIRECT_RECLAIM", "DR" },
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 06c71178d07d..59245b3d587c 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -618,7 +618,7 @@ static void multiorder_account(void)
618 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); 618 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
619 __radix_tree_lookup(&tree, 1 << 5, &node, &slot); 619 __radix_tree_lookup(&tree, 1 << 5, &node, &slot);
620 assert(node->count == node->exceptional * 2); 620 assert(node->count == node->exceptional * 2);
621 __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL); 621 __radix_tree_replace(&tree, node, slot, NULL, NULL);
622 assert(node->exceptional == 0); 622 assert(node->exceptional == 0);
623 623
624 item_kill_tree(&tree); 624 item_kill_tree(&tree);
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index b0b7ef6d0de1..f82c2eaa859d 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -84,6 +84,7 @@ int output_lines = -1;
84int sort_loss; 84int sort_loss;
85int extended_totals; 85int extended_totals;
86int show_bytes; 86int show_bytes;
87int unreclaim_only;
87 88
88/* Debug options */ 89/* Debug options */
89int sanity; 90int sanity;
@@ -133,6 +134,7 @@ static void usage(void)
133 "-L|--Loss Sort by loss\n" 134 "-L|--Loss Sort by loss\n"
134 "-X|--Xtotals Show extended summary information\n" 135 "-X|--Xtotals Show extended summary information\n"
135 "-B|--Bytes Show size in bytes\n" 136 "-B|--Bytes Show size in bytes\n"
137 "-U|--Unreclaim Show unreclaimable slabs only\n"
136 "\nValid debug options (FZPUT may be combined)\n" 138 "\nValid debug options (FZPUT may be combined)\n"
137 "a / A Switch on all debug options (=FZUP)\n" 139 "a / A Switch on all debug options (=FZUP)\n"
138 "- Switch off all debug options\n" 140 "- Switch off all debug options\n"
@@ -569,6 +571,9 @@ static void slabcache(struct slabinfo *s)
569 if (strcmp(s->name, "*") == 0) 571 if (strcmp(s->name, "*") == 0)
570 return; 572 return;
571 573
574 if (unreclaim_only && s->reclaim_account)
575 return;
576
572 if (actual_slabs == 1) { 577 if (actual_slabs == 1) {
573 report(s); 578 report(s);
574 return; 579 return;
@@ -1347,6 +1352,7 @@ struct option opts[] = {
1347 { "Loss", no_argument, NULL, 'L'}, 1352 { "Loss", no_argument, NULL, 'L'},
1348 { "Xtotals", no_argument, NULL, 'X'}, 1353 { "Xtotals", no_argument, NULL, 'X'},
1349 { "Bytes", no_argument, NULL, 'B'}, 1354 { "Bytes", no_argument, NULL, 'B'},
1355 { "Unreclaim", no_argument, NULL, 'U'},
1350 { NULL, 0, NULL, 0 } 1356 { NULL, 0, NULL, 0 }
1351}; 1357};
1352 1358
@@ -1358,7 +1364,7 @@ int main(int argc, char *argv[])
1358 1364
1359 page_size = getpagesize(); 1365 page_size = getpagesize();
1360 1366
1361 while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB", 1367 while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXBU",
1362 opts, NULL)) != -1) 1368 opts, NULL)) != -1)
1363 switch (c) { 1369 switch (c) {
1364 case '1': 1370 case '1':
@@ -1439,6 +1445,9 @@ int main(int argc, char *argv[])
1439 case 'B': 1445 case 'B':
1440 show_bytes = 1; 1446 show_bytes = 1;
1441 break; 1447 break;
1448 case 'U':
1449 unreclaim_only = 1;
1450 break;
1442 default: 1451 default:
1443 fatal("%s: Invalid option '%c'\n", argv[0], optopt); 1452 fatal("%s: Invalid option '%c'\n", argv[0], optopt);
1444 1453
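
The new -U/--Unreclaim switch makes slabcache() skip any cache with reclaim accounting, so tools/vm/slabinfo -U lists only the slabs whose memory cannot be reclaimed; combined with the existing -L (sort by loss) it gives a quick view of which unreclaimable caches waste the most memory.
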