author    Linus Torvalds <torvalds@linux-foundation.org>  2017-09-06 23:49:49 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-09-06 23:49:49 -0400
commit    d34fc1adf01ff87026da85fb972dc259dc347540 (patch)
tree      27356073d423187157b7cdb69da32b53102fb9e7
parent    1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33 (diff)
parent    d2cd9ede6e193dd7d88b6d27399e96229a551b19 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - various misc bits
 - DAX updates
 - OCFS2
 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits)
  mm,fork: introduce MADV_WIPEONFORK
  x86,mpx: make mpx depend on x86-64 to free up VMA flag
  mm: add /proc/pid/smaps_rollup
  mm: hugetlb: clear target sub-page last when clearing huge page
  mm: oom: let oom_reap_task and exit_mmap run concurrently
  swap: choose swap device according to numa node
  mm: replace TIF_MEMDIE checks by tsk_is_oom_victim
  mm, oom: do not rely on TIF_MEMDIE for memory reserves access
  z3fold: use per-cpu unbuddied lists
  mm, swap: don't use VMA based swap readahead if HDD is used as swap
  mm, swap: add sysfs interface for VMA based swap readahead
  mm, swap: VMA based swap readahead
  mm, swap: fix swap readahead marking
  mm, swap: add swap readahead hit statistics
  mm/vmalloc.c: don't reinvent the wheel but use existing llist API
  mm/vmstat.c: fix wrong comment
  selftests/memfd: add memfd_create hugetlbfs selftest
  mm/shmem: add hugetlbfs support to memfd_create()
  mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups
  mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas()
  ...
-rw-r--r--  Documentation/ABI/testing/procfs-smaps_rollup | 31
-rw-r--r--  Documentation/ABI/testing/sysfs-block-zram | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-swap | 26
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 2
-rw-r--r--  Documentation/blockdev/zram.txt | 11
-rw-r--r--  Documentation/filesystems/caching/netfs-api.txt | 2
-rw-r--r--  Documentation/filesystems/dax.txt | 5
-rw-r--r--  Documentation/sysctl/vm.txt | 4
-rw-r--r--  Documentation/vm/numa | 7
-rw-r--r--  Documentation/vm/swap_numa.txt | 69
-rw-r--r--  arch/alpha/include/uapi/asm/mman.h | 14
-rw-r--r--  arch/metag/include/asm/topology.h | 1
-rw-r--r--  arch/mips/include/uapi/asm/mman.h | 14
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h | 14
-rw-r--r--  arch/powerpc/include/uapi/asm/mman.h | 16
-rw-r--r--  arch/x86/Kconfig | 4
-rw-r--r--  arch/x86/include/uapi/asm/mman.h | 3
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h | 14
-rw-r--r--  drivers/base/memory.c | 30
-rw-r--r--  drivers/block/brd.c | 6
-rw-r--r--  drivers/block/zram/Kconfig | 12
-rw-r--r--  drivers/block/zram/zram_drv.c | 540
-rw-r--r--  drivers/block/zram/zram_drv.h | 11
-rw-r--r--  drivers/gpu/drm/i915/i915_debugfs.c | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_drv.h | 1
-rw-r--r--  drivers/gpu/drm/i915/i915_gem.c | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_gtt.c | 2
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_shrinker.c | 24
-rw-r--r--  drivers/nvdimm/btt.c | 4
-rw-r--r--  drivers/nvdimm/pmem.c | 41
-rw-r--r--  fs/9p/cache.c | 29
-rw-r--r--  fs/afs/cache.c | 43
-rw-r--r--  fs/buffer.c | 31
-rw-r--r--  fs/ceph/cache.c | 31
-rw-r--r--  fs/cifs/cache.c | 31
-rw-r--r--  fs/dax.c | 363
-rw-r--r--  fs/ext2/file.c | 25
-rw-r--r--  fs/ext4/file.c | 48
-rw-r--r--  fs/ext4/inode.c | 15
-rw-r--r--  fs/fscache/page.c | 5
-rw-r--r--  fs/hugetlbfs/inode.c | 30
-rw-r--r--  fs/nfs/fscache-index.c | 40
-rw-r--r--  fs/nilfs2/page.c | 3
-rw-r--r--  fs/ocfs2/acl.c | 2
-rw-r--r--  fs/ocfs2/acl.h | 7
-rw-r--r--  fs/ocfs2/alloc.c | 22
-rw-r--r--  fs/ocfs2/alloc.h | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 42
-rw-r--r--  fs/ocfs2/dir.c | 2
-rw-r--r--  fs/ocfs2/file.c | 7
-rw-r--r--  fs/ocfs2/journal.c | 1
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 4
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 2
-rw-r--r--  fs/ocfs2/super.c | 1
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/proc/base.c | 2
-rw-r--r--  fs/proc/internal.h | 3
-rw-r--r--  fs/proc/meminfo.c | 10
-rw-r--r--  fs/proc/task_mmu.c | 197
-rw-r--r--  fs/ramfs/file-nommu.c | 2
-rw-r--r--  fs/sync.c | 5
-rw-r--r--  fs/userfaultfd.c | 21
-rw-r--r--  fs/xfs/xfs_file.c | 2
-rw-r--r--  include/linux/bio.h | 8
-rw-r--r--  include/linux/dax.h | 45
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/fscache.h | 9
-rw-r--r--  include/linux/memcontrol.h | 52
-rw-r--r--  include/linux/memory_hotplug.h | 2
-rw-r--r--  include/linux/mm.h | 14
-rw-r--r--  include/linux/mm_types.h | 1
-rw-r--r--  include/linux/mmzone.h | 5
-rw-r--r--  include/linux/page-flags.h | 4
-rw-r--r--  include/linux/pagemap.h | 12
-rw-r--r--  include/linux/pagevec.h | 12
-rw-r--r--  include/linux/sched/mm.h | 6
-rw-r--r--  include/linux/shm.h | 17
-rw-r--r--  include/linux/shmem_fs.h | 6
-rw-r--r--  include/linux/shrinker.h | 7
-rw-r--r--  include/linux/slub_def.h | 4
-rw-r--r--  include/linux/swap.h | 78
-rw-r--r--  include/linux/vm_event_item.h | 6
-rw-r--r--  include/linux/vmstat.h | 4
-rw-r--r--  include/trace/events/fs_dax.h | 2
-rw-r--r--  include/trace/events/mmflags.h | 8
-rw-r--r--  include/uapi/asm-generic/hugetlb_encode.h | 34
-rw-r--r--  include/uapi/asm-generic/mman-common.h | 14
-rw-r--r--  include/uapi/linux/memfd.h | 24
-rw-r--r--  include/uapi/linux/mman.h | 22
-rw-r--r--  include/uapi/linux/shm.h | 31
-rw-r--r--  include/uapi/linux/userfaultfd.h | 16
-rw-r--r--  init/Kconfig | 9
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/cgroup/cgroup.c | 3
-rw-r--r--  kernel/cgroup/cpuset.c | 9
-rw-r--r--  kernel/fork.c | 27
-rw-r--r--  kernel/memremap.c | 52
-rw-r--r--  mm/Kconfig | 1
-rw-r--r--  mm/filemap.c | 67
-rw-r--r--  mm/gup.c | 2
-rw-r--r--  mm/huge_memory.c | 32
-rw-r--r--  mm/hugetlb.c | 65
-rw-r--r--  mm/internal.h | 12
-rw-r--r--  mm/ksm.c | 2
-rw-r--r--  mm/madvise.c | 13
-rw-r--r--  mm/memcontrol.c | 40
-rw-r--r--  mm/memory.c | 135
-rw-r--r--  mm/memory_hotplug.c | 114
-rw-r--r--  mm/mmap.c | 46
-rw-r--r--  mm/mremap.c | 13
-rw-r--r--  mm/nommu.c | 4
-rw-r--r--  mm/oom_kill.c | 24
-rw-r--r--  mm/page-writeback.c | 4
-rw-r--r--  mm/page_alloc.c | 438
-rw-r--r--  mm/page_ext.c | 6
-rw-r--r--  mm/page_idle.c | 2
-rw-r--r--  mm/page_io.c | 21
-rw-r--r--  mm/page_owner.c | 68
-rw-r--r--  mm/shmem.c | 206
-rw-r--r--  mm/slub.c | 52
-rw-r--r--  mm/sparse-vmemmap.c | 11
-rw-r--r--  mm/sparse.c | 10
-rw-r--r--  mm/swap.c | 24
-rw-r--r--  mm/swap_state.c | 314
-rw-r--r--  mm/swapfile.c | 362
-rw-r--r--  mm/userfaultfd.c | 48
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vmalloc.c | 20
-rw-r--r--  mm/vmscan.c | 113
-rw-r--r--  mm/vmstat.c | 15
-rw-r--r--  mm/z3fold.c | 479
-rw-r--r--  mm/zsmalloc.c | 8
-rw-r--r--  scripts/mod/modpost.c | 27
-rw-r--r--  tools/testing/selftests/memfd/Makefile | 2
-rw-r--r--  tools/testing/selftests/memfd/memfd_test.c | 372
-rw-r--r--  tools/testing/selftests/memfd/run_tests.sh | 69
-rw-r--r--  tools/testing/selftests/vm/userfaultfd.c | 279
139 files changed, 3960 insertions, 2068 deletions
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
new file mode 100644
index 000000000000..0a54ed0d63c9
--- /dev/null
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -0,0 +1,31 @@
+What:           /proc/pid/smaps_rollup
+Date:           August 2017
+Contact:        Daniel Colascione <dancol@google.com>
+Description:
+                This file provides pre-summed memory information for a
+                process.  The format is identical to /proc/pid/smaps,
+                except instead of an entry for each VMA in a process,
+                smaps_rollup has a single entry (tagged "[rollup]")
+                for which each field is the sum of the corresponding
+                fields from all the maps in /proc/pid/smaps.
+                For more details, see the procfs man page.
+
+                Typical output looks like this:
+
+                00100000-ff709000 ---p 00000000 00:00 0    [rollup]
+                Rss:                 884 kB
+                Pss:                 385 kB
+                Shared_Clean:        696 kB
+                Shared_Dirty:          0 kB
+                Private_Clean:       120 kB
+                Private_Dirty:        68 kB
+                Referenced:          884 kB
+                Anonymous:            68 kB
+                LazyFree:              0 kB
+                AnonHugePages:         0 kB
+                ShmemPmdMapped:        0 kB
+                Shared_Hugetlb:        0 kB
+                Private_Hugetlb:       0 kB
+                Swap:                  0 kB
+                SwapPss:               0 kB
+                Locked:              385 kB
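The rollup format above is line-oriented ("Field:  value kB"), so it can be consumed with ordinary stdio. A minimal userspace sketch for illustration, assuming a kernel with this series applied so the file exists:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/smaps_rollup", "r");

            if (!f) {
                    perror("smaps_rollup");   /* older kernel: ENOENT */
                    return 1;
            }
            /* print the summed Rss and Pss for this process */
            while (fgets(line, sizeof(line), f)) {
                    if (strncmp(line, "Rss:", 4) == 0 ||
                        strncmp(line, "Pss:", 4) == 0)
                            fputs(line, stdout);
            }
            fclose(f);
            return 0;
    }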
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d882b2c..c1513c756af1 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,11 @@ Description:
 		device's debugging info useful for kernel developers. Its
 		format is not documented intentionally and may change
 		anytime without any notice.
+
+What:		/sys/block/zram<id>/backing_dev
+Date:		June 2017
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The backing_dev file is read-write and set up backing
+		device for zram to write incompressible pages.
+		For using, user should enable CONFIG_ZRAM_WRITEBACK.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-swap b/Documentation/ABI/testing/sysfs-kernel-mm-swap
new file mode 100644
index 000000000000..587db52084c7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-swap
@@ -0,0 +1,26 @@
+What:		/sys/kernel/mm/swap/
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for swapping
+
+What:		/sys/kernel/mm/swap/vma_ra_enabled
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Enable/disable VMA based swap readahead.
+
+		If set to true, the VMA based swap readahead algorithm
+		will be used for swappable anonymous pages mapped in a
+		VMA, and the global swap readahead algorithm will be
+		still used for tmpfs etc. other users.  If set to
+		false, the global swap readahead algorithm will be
+		used for all swappable pages.
+
+What:		/sys/kernel/mm/swap/vma_ra_max_order
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	The max readahead size in order for VMA based swap readahead
+
+		VMA based swap readahead algorithm will readahead at
+		most 1 << max_order pages for each readahead.  The
+		real readahead size for each readahead will be scaled
+		according to the estimation algorithm.
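Both knobs are plain sysfs text files, so tuning them programmatically is just a matter of writing the strings described above. An illustrative sketch (requires root; the values chosen here are arbitrary examples, not recommendations):

    #include <stdio.h>

    static int write_sysfs(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            /* fall back to the global readahead algorithm for all pages */
            if (write_sysfs("/sys/kernel/mm/swap/vma_ra_enabled", "false"))
                    perror("vma_ra_enabled");
            /* cap VMA readahead at 1 << 3 = 8 pages per readahead */
            if (write_sysfs("/sys/kernel/mm/swap/vma_ra_max_order", "3"))
                    perror("vma_ra_max_order");
            return 0;
    }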
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6996b7727b85..86b0e8ec8ad7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2783,7 +2783,7 @@
 			Allowed values are enable and disable
 
 	numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
-			one of ['zone', 'node', 'default'] can be specified
+			'node', 'default' can be specified
 			This can be set from sysctl after boot.
 			See Documentation/sysctl/vm.txt for details.
 
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a21307..257e65714c6a 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
 comp_algorithm    RW    show and change the compression algorithm
 compact           WO    trigger memory compaction
 debug_stat        RO    this file is used for zram debugging purposes
+backing_dev       RW    set up backend storage for zram to write out
 
 
 User space is advised to use the following files to read the device statistics.
@@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace:
 	resets the disksize to zero. You must set the disksize again
 	before reusing the device.
 
+* Optional Feature
+
+= writeback
+
+With incompressible pages, there is no memory saving with zram.
+Instead, with CONFIG_ZRAM_WRITEBACK, zram can write incompressible page
+to backing storage rather than keeping it in memory.
+User should set up backing device via /sys/block/zramX/backing_dev
+before disksize setting.
+
 Nitin Gupta
 ngupta@vflare.org
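The ordering constraint above (backing_dev must be written before disksize) can be driven from a program the same way the rest of the zram sysfs interface is. A hedged sketch, with /dev/sdb1 standing in for whatever partition the admin dedicates to writeback and CONFIG_ZRAM_WRITEBACK assumed enabled:

    #include <stdio.h>

    static void echo(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return;
            }
            fprintf(f, "%s\n", val);
            fclose(f);
    }

    int main(void)
    {
            /* placeholder backing device; must be a block device */
            echo("/sys/block/zram0/backing_dev", "/dev/sdb1");
            /* disksize is parsed with memparse(), so suffixes work */
            echo("/sys/block/zram0/disksize", "1G");
            return 0;
    }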
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index aed6b94160b1..0eb31de3a2c1 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -151,8 +151,6 @@ To define an object, a structure of the following type should be filled out:
 	void (*mark_pages_cached)(void *cookie_netfs_data,
 				  struct address_space *mapping,
 				  struct pagevec *cached_pvec);
-
-	void (*now_uncached)(void *cookie_netfs_data);
 };
 
 This has the following fields:
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index a7e6e14aeb08..3be3b266be41 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -63,9 +63,8 @@ Filesystem support consists of
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
   include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
-  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
-  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
-  iomap operations.
+  handlers should probably call dax_iomap_fault() passing the appropriate
+  fault size and iomap operations.
 - calling iomap_zero_range() passing appropriate iomap operations instead of
   block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 48244c42ff52..9baf66a9ef4e 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information.
 
 numa_zonelist_order
 
-This sysctl is only for NUMA.
+This sysctl is only for NUMA and it is deprecated. Anything but
+Node order will fail!
+
 'where the memory is allocated from' is controlled by zonelists.
 (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
  you may be able to read ZONE_DMA as ZONE_DMA32...)
diff --git a/Documentation/vm/numa b/Documentation/vm/numa
index a08f71647714..a31b85b9bb88 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa
@@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations
 fall back to the same zone type on a different node, or to a different zone
 type on the same node.  This is an important consideration because some zones,
 such as DMA or DMA32, represent relatively scarce resources.  Linux chooses
-a default zonelist order based on the sizes of the various zone types relative
-to the total memory of the node and the total memory of the system.  The
-default zonelist order may be overridden using the numa_zonelist_order kernel
-boot parameter or sysctl.  [see Documentation/admin-guide/kernel-parameters.rst and
-Documentation/sysctl/vm.txt]
+a default Node ordered zonelist. This means it tries to fallback to other zones
+from the same node before using remote nodes which are ordered by NUMA distance.
 
 By default, Linux will attempt to satisfy memory allocation requests from the
 node to which the CPU that executes the request is assigned.  Specifically,
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt
new file mode 100644
index 000000000000..d5960c9124f5
--- /dev/null
+++ b/Documentation/vm/swap_numa.txt
@@ -0,0 +1,69 @@
+Automatically bind swap device to numa node
+-------------------------------------------
+
+If the system has more than one swap device and swap device has the node
+information, we can make use of this information to decide which swap
+device to use in get_swap_pages() to get better performance.
+
+
+How to use this feature
+-----------------------
+
+Swap device has priority and that decides the order of it to be used. To make
+use of automatically binding, there is no need to manipulate priority settings
+for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
+swapB, with swapA attached to node 0 and swapB attached to node 1, are going
+to be swapped on. Simply swapping them on by doing:
+# swapon /dev/swapA
+# swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB and
+node 1 will use the two swap devices in the order of swapB then swapA. Note
+that the order of them being swapped on doesn't matter.
+
+A more complex example on a 4 node machine. Assume 6 swap devices are going to
+be swapped on: swapA and swapB are attached to node 0, swapC is attached to
+node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3.
+The way to swap them on is the same as above:
+# swapon /dev/swapA
+# swapon /dev/swapB
+# swapon /dev/swapC
+# swapon /dev/swapD
+# swapon /dev/swapE
+# swapon /dev/swapF
+
+Then node 0 will use them in the order of:
+swapA/swapB -> swapC -> swapD -> swapE -> swapF
+swapA and swapB will be used in a round robin mode before any other swap device.
+
+node 1 will use them in the order of:
+swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of:
+swapD/swapE -> swapA -> swapB -> swapC -> swapF
+Similarly, swapD and swapE will be used in a round robin mode before any
+other swap devices.
+
+node 3 will use them in the order of:
+swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+----------------------
+
+The current code uses a priority based list, swap_avail_list, to decide
+which swap device to use and if multiple swap devices share the same
+priority, they are used round robin. This change here replaces the single
+global swap_avail_list with a per-numa-node list, i.e. for each numa node,
+it sees its own priority based list of available swap devices. Swap
+device's priority can be promoted on its matching node's swap_avail_list.
+
+The current swap device's priority is set as: user can set a >=0 value,
+or the system will pick one starting from -1 then downwards. The priority
+value in the swap_avail_list is the negated value of the swap device's
+priority due to plist being sorted from low to high. The new policy doesn't
+change the semantics for priority >=0 cases, the previous starting from -1
+then downwards now becomes starting from -2 then downwards and -1 is reserved
+as the promoted value. So if multiple swap devices are attached to the same
+node, they will all be promoted to priority -1 on that node's plist and will
+be used round robin before any other swap devices.
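The swapon(8) commands in the usage section above map directly onto the swapon(2) system call; passing no SWAP_FLAG_PREFER leaves the priority negative, so the per-node promotion described in the implementation notes applies. A minimal sketch using the placeholder device names from the text:

    #include <stdio.h>
    #include <sys/swap.h>

    int main(void)
    {
            /* no SWAP_FLAG_PREFER: let the kernel auto-assign priority */
            if (swapon("/dev/swapA", 0))
                    perror("swapon /dev/swapA");
            if (swapon("/dev/swapB", 0))
                    perror("swapon /dev/swapB");
            return 0;
    }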
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 02760f6e6ca4..3b26cc62dadb 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -64,20 +64,12 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
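To see what the two new advice values added above buy userspace, the sketch below marks an anonymous mapping MADV_WIPEONFORK and forks; on a kernel with this series the child reads back zeroes while the parent keeps its data. The fallback define uses the generic value 18 shown here (parisc uses 71, as in its hunk further down), and the madvise() call simply fails with EINVAL on older kernels:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    #ifndef MADV_WIPEONFORK
    #define MADV_WIPEONFORK 18	/* generic value from this series */
    #endif

    int main(void)
    {
            char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            strcpy(p, "secret");
            if (madvise(p, 4096, MADV_WIPEONFORK))
                    perror("madvise");	/* older kernel: EINVAL */

            if (fork() == 0) {
                    /* child: prints an empty string on a patched kernel */
                    printf("child sees:  \"%s\"\n", p);
                    _exit(0);
            }
            wait(NULL);
            printf("parent sees: \"%s\"\n", p);	/* still "secret" */
            return 0;
    }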
diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h
index e95f874ded1b..707c7f7b6bea 100644
--- a/arch/metag/include/asm/topology.h
+++ b/arch/metag/include/asm/topology.h
@@ -4,7 +4,6 @@
 #ifdef CONFIG_NUMA
 
 #define cpu_to_node(cpu)	((void)(cpu), 0)
-#define parent_node(node)	((void)(node), 0)
 
 #define cpumask_of_node(node)	((void)node, cpu_online_mask)
 
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 655e2fb5395b..da3216007fe0 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -91,20 +91,12 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 9a9c2fe4be50..775b5d5e41a1 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -57,6 +57,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	70		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 71		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 72		/* Undo MADV_WIPEONFORK */
+
 #define MADV_HWPOISON     100		/* poison a page for testing */
 #define MADV_SOFT_OFFLINE 101		/* soft offline page for testing */
 
@@ -64,17 +67,6 @@
 #define MAP_FILE	0
 #define MAP_VARIABLE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index ab45cc2f3101..03c06ba7464f 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,20 +29,4 @@
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
 
-/*
- * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
- * encode the log2 of the huge page size. A value of zero indicates that the
- * default huge page size should be used. To use a non-default huge page size,
- * one of these defines can be used, or the size can be encoded by hand. Note
- * that on most systems only a subset, or possibly none, of these sizes will be
- * available.
- */
-#define MAP_HUGE_512KB	(19 << MAP_HUGE_SHIFT)	/* 512KB HugeTLB Page */
-#define MAP_HUGE_1MB	(20 << MAP_HUGE_SHIFT)	/* 1MB   HugeTLB Page */
-#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* 2MB   HugeTLB Page */
-#define MAP_HUGE_8MB	(23 << MAP_HUGE_SHIFT)	/* 8MB   HugeTLB Page */
-#define MAP_HUGE_16MB	(24 << MAP_HUGE_SHIFT)	/* 16MB  HugeTLB Page */
-#define MAP_HUGE_1GB	(30 << MAP_HUGE_SHIFT)	/* 1GB   HugeTLB Page */
-#define MAP_HUGE_16GB	(34 << MAP_HUGE_SHIFT)	/* 16GB  HugeTLB Page */
-
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
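The comments removed from these per-arch headers (the encoding now lives in the shared include/uapi/asm-generic/hugetlb_encode.h added by this series, per the diffstat) describe how mmap() callers pick a huge page size: bits [26:31] of the flags carry log2 of the requested size. A hedged sketch requesting one 2MB huge page (it fails unless hugepages are reserved on the system):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT 26
    #endif

    int main(void)
    {
            size_t len = 2UL << 20;			/* one 2MB huge page */
            int huge_2mb = 21 << MAP_HUGE_SHIFT;	/* log2(2MB) = 21 */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | huge_2mb,
                           -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap(MAP_HUGETLB)");	/* needs reserved hugepages */
                    return 1;
            }
            munmap(p, len);
            return 0;
    }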
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index acb366bf6bc1..4b278a33ccbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1806,7 +1806,9 @@ config X86_SMAP
 config X86_INTEL_MPX
 	prompt "Intel MPX (Memory Protection Extensions)"
 	def_bool n
-	depends on CPU_SUP_INTEL
+	# Note: only available in 64-bit mode due to VMA flags shortage
+	depends on CPU_SUP_INTEL && X86_64
+	select ARCH_USES_HIGH_VMA_FLAGS
 	---help---
 	  MPX provides hardware features that can be used in
 	  conjunction with compiler-instrumented code to check
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 39bca7fac087..3be08f07695c 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -3,9 +3,6 @@
 
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 
-#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
-
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 /*
  * Take the 4 protection key bits out of the vma->vm_flags
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 24365b30aae9..b15b278aa314 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -103,20 +103,12 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c7c4e0325cdb..4e3b61cda520 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -388,6 +388,19 @@ static ssize_t show_phys_device(struct device *dev,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
+static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
+			       unsigned long nr_pages, int online_type,
+			       struct zone *default_zone)
+{
+	struct zone *zone;
+
+	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
+	if (zone != default_zone) {
+		strcat(buf, " ");
+		strcat(buf, zone->name);
+	}
+}
+
 static ssize_t show_valid_zones(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
@@ -395,7 +408,7 @@ static ssize_t show_valid_zones(struct device *dev,
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long valid_start_pfn, valid_end_pfn;
-	bool append = false;
+	struct zone *default_zone;
 	int nid;
 
 	/*
@@ -418,16 +431,13 @@ static ssize_t show_valid_zones(struct device *dev,
 	}
 
 	nid = pfn_to_nid(start_pfn);
-	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
-		strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
-		append = true;
-	}
+	default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
+	strcat(buf, default_zone->name);
 
-	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
-		if (append)
-			strcat(buf, " ");
-		strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
-	}
+	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
+			   default_zone);
+	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
+			   default_zone);
 out:
 	strcat(buf, "\n");
 
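The strings built by show_valid_zones()/print_allowed_zone() above surface in /sys/devices/system/memory/memory<N>/valid_zones. A small sketch that dumps them for the first few memory blocks (block numbering and presence vary by machine, so missing blocks are simply skipped):

    #include <stdio.h>

    int main(void)
    {
            char path[128], buf[128];
            int i;

            for (i = 0; i < 32; i++) {	/* probe the first 32 blocks */
                    FILE *f;

                    snprintf(path, sizeof(path),
                             "/sys/devices/system/memory/memory%d/valid_zones", i);
                    f = fopen(path, "r");
                    if (!f)
                            continue;
                    if (fgets(buf, sizeof(buf), f))
                            printf("memory%d: %s", i, buf);
                    fclose(f);
            }
            return 0;
    }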
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 104b71c0490d..5d9ed0616413 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -326,7 +326,11 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 			struct page *page, bool is_write)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
-	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
+	int err;
+
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
+	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
 	page_endio(page, is_write, err);
 	return err;
 }
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6dcd3b..7cd4a8ec3c8f 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,15 @@ config ZRAM
 	  disks and maybe many more.
 
 	  See zram.txt for more information.
+
+config ZRAM_WRITEBACK
+	bool "Write back incompressible page to backing device"
+	depends on ZRAM
+	default n
+	help
+	  With incompressible page, there is no memory saving to keep it
+	  in memory. Instead, write it out to backing device.
+	  For this feature, admin should set up backing device via
+	  /sys/block/zramX/backing_dev.
+
+	  See zram.txt for more information.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 3b1b6340ba13..4a0438c4ef2a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -270,6 +270,349 @@ static ssize_t mem_used_max_store(struct device *dev,
270 return len; 270 return len;
271} 271}
272 272
273#ifdef CONFIG_ZRAM_WRITEBACK
274static bool zram_wb_enabled(struct zram *zram)
275{
276 return zram->backing_dev;
277}
278
279static void reset_bdev(struct zram *zram)
280{
281 struct block_device *bdev;
282
283 if (!zram_wb_enabled(zram))
284 return;
285
286 bdev = zram->bdev;
287 if (zram->old_block_size)
288 set_blocksize(bdev, zram->old_block_size);
289 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
290 /* hope filp_close flush all of IO */
291 filp_close(zram->backing_dev, NULL);
292 zram->backing_dev = NULL;
293 zram->old_block_size = 0;
294 zram->bdev = NULL;
295
296 kvfree(zram->bitmap);
297 zram->bitmap = NULL;
298}
299
300static ssize_t backing_dev_show(struct device *dev,
301 struct device_attribute *attr, char *buf)
302{
303 struct zram *zram = dev_to_zram(dev);
304 struct file *file = zram->backing_dev;
305 char *p;
306 ssize_t ret;
307
308 down_read(&zram->init_lock);
309 if (!zram_wb_enabled(zram)) {
310 memcpy(buf, "none\n", 5);
311 up_read(&zram->init_lock);
312 return 5;
313 }
314
315 p = file_path(file, buf, PAGE_SIZE - 1);
316 if (IS_ERR(p)) {
317 ret = PTR_ERR(p);
318 goto out;
319 }
320
321 ret = strlen(p);
322 memmove(buf, p, ret);
323 buf[ret++] = '\n';
324out:
325 up_read(&zram->init_lock);
326 return ret;
327}
328
329static ssize_t backing_dev_store(struct device *dev,
330 struct device_attribute *attr, const char *buf, size_t len)
331{
332 char *file_name;
333 struct file *backing_dev = NULL;
334 struct inode *inode;
335 struct address_space *mapping;
336 unsigned int bitmap_sz, old_block_size = 0;
337 unsigned long nr_pages, *bitmap = NULL;
338 struct block_device *bdev = NULL;
339 int err;
340 struct zram *zram = dev_to_zram(dev);
341
342 file_name = kmalloc(PATH_MAX, GFP_KERNEL);
343 if (!file_name)
344 return -ENOMEM;
345
346 down_write(&zram->init_lock);
347 if (init_done(zram)) {
348 pr_info("Can't setup backing device for initialized device\n");
349 err = -EBUSY;
350 goto out;
351 }
352
353 strlcpy(file_name, buf, len);
354
355 backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
356 if (IS_ERR(backing_dev)) {
357 err = PTR_ERR(backing_dev);
358 backing_dev = NULL;
359 goto out;
360 }
361
362 mapping = backing_dev->f_mapping;
363 inode = mapping->host;
364
365 /* Support only block device in this moment */
366 if (!S_ISBLK(inode->i_mode)) {
367 err = -ENOTBLK;
368 goto out;
369 }
370
371 bdev = bdgrab(I_BDEV(inode));
372 err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
373 if (err < 0)
374 goto out;
375
376 nr_pages = i_size_read(inode) >> PAGE_SHIFT;
377 bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
378 bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
379 if (!bitmap) {
380 err = -ENOMEM;
381 goto out;
382 }
383
384 old_block_size = block_size(bdev);
385 err = set_blocksize(bdev, PAGE_SIZE);
386 if (err)
387 goto out;
388
389 reset_bdev(zram);
390 spin_lock_init(&zram->bitmap_lock);
391
392 zram->old_block_size = old_block_size;
393 zram->bdev = bdev;
394 zram->backing_dev = backing_dev;
395 zram->bitmap = bitmap;
396 zram->nr_pages = nr_pages;
397 up_write(&zram->init_lock);
398
399 pr_info("setup backing device %s\n", file_name);
400 kfree(file_name);
401
402 return len;
403out:
404 if (bitmap)
405 kvfree(bitmap);
406
407 if (bdev)
408 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
409
410 if (backing_dev)
411 filp_close(backing_dev, NULL);
412
413 up_write(&zram->init_lock);
414
415 kfree(file_name);
416
417 return err;
418}
419
420static unsigned long get_entry_bdev(struct zram *zram)
421{
422 unsigned long entry;
423
424 spin_lock(&zram->bitmap_lock);
425 /* skip 0 bit to confuse zram.handle = 0 */
426 entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
427 if (entry == zram->nr_pages) {
428 spin_unlock(&zram->bitmap_lock);
429 return 0;
430 }
431
432 set_bit(entry, zram->bitmap);
433 spin_unlock(&zram->bitmap_lock);
434
435 return entry;
436}
437
438static void put_entry_bdev(struct zram *zram, unsigned long entry)
439{
440 int was_set;
441
442 spin_lock(&zram->bitmap_lock);
443 was_set = test_and_clear_bit(entry, zram->bitmap);
444 spin_unlock(&zram->bitmap_lock);
445 WARN_ON_ONCE(!was_set);
446}
447
448void zram_page_end_io(struct bio *bio)
449{
450 struct page *page = bio->bi_io_vec[0].bv_page;
451
452 page_endio(page, op_is_write(bio_op(bio)),
453 blk_status_to_errno(bio->bi_status));
454 bio_put(bio);
455}
456
457/*
458 * Returns 1 if the submission is successful.
459 */
460static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
461 unsigned long entry, struct bio *parent)
462{
463 struct bio *bio;
464
465 bio = bio_alloc(GFP_ATOMIC, 1);
466 if (!bio)
467 return -ENOMEM;
468
469 bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
470 bio->bi_bdev = zram->bdev;
471 if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
472 bio_put(bio);
473 return -EIO;
474 }
475
476 if (!parent) {
477 bio->bi_opf = REQ_OP_READ;
478 bio->bi_end_io = zram_page_end_io;
479 } else {
480 bio->bi_opf = parent->bi_opf;
481 bio_chain(bio, parent);
482 }
483
484 submit_bio(bio);
485 return 1;
486}
487
488struct zram_work {
489 struct work_struct work;
490 struct zram *zram;
491 unsigned long entry;
492 struct bio *bio;
493};
494
495#if PAGE_SIZE != 4096
496static void zram_sync_read(struct work_struct *work)
497{
498 struct bio_vec bvec;
499 struct zram_work *zw = container_of(work, struct zram_work, work);
500 struct zram *zram = zw->zram;
501 unsigned long entry = zw->entry;
502 struct bio *bio = zw->bio;
503
504 read_from_bdev_async(zram, &bvec, entry, bio);
505}
506
507/*
508 * Block layer want one ->make_request_fn to be active at a time
509 * so if we use chained IO with parent IO in same context,
510 * it's a deadlock. To avoid, it, it uses worker thread context.
511 */
512static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
513 unsigned long entry, struct bio *bio)
514{
515 struct zram_work work;
516
517 work.zram = zram;
518 work.entry = entry;
519 work.bio = bio;
520
521 INIT_WORK_ONSTACK(&work.work, zram_sync_read);
522 queue_work(system_unbound_wq, &work.work);
523 flush_work(&work.work);
524 destroy_work_on_stack(&work.work);
525
526 return 1;
527}
528#else
529static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
530 unsigned long entry, struct bio *bio)
531{
532 WARN_ON(1);
533 return -EIO;
534}
535#endif
536
537static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
538 unsigned long entry, struct bio *parent, bool sync)
539{
540 if (sync)
541 return read_from_bdev_sync(zram, bvec, entry, parent);
542 else
543 return read_from_bdev_async(zram, bvec, entry, parent);
544}
545
546static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
547 u32 index, struct bio *parent,
548 unsigned long *pentry)
549{
550 struct bio *bio;
551 unsigned long entry;
552
553 bio = bio_alloc(GFP_ATOMIC, 1);
554 if (!bio)
555 return -ENOMEM;
556
557 entry = get_entry_bdev(zram);
558 if (!entry) {
559 bio_put(bio);
560 return -ENOSPC;
561 }
562
563 bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
564 bio->bi_bdev = zram->bdev;
565 if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
566 bvec->bv_offset)) {
567 bio_put(bio);
568 put_entry_bdev(zram, entry);
569 return -EIO;
570 }
571
572 if (!parent) {
573 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
574 bio->bi_end_io = zram_page_end_io;
575 } else {
576 bio->bi_opf = parent->bi_opf;
577 bio_chain(bio, parent);
578 }
579
580 submit_bio(bio);
581 *pentry = entry;
582
583 return 0;
584}
585
586static void zram_wb_clear(struct zram *zram, u32 index)
587{
588 unsigned long entry;
589
590 zram_clear_flag(zram, index, ZRAM_WB);
591 entry = zram_get_element(zram, index);
592 zram_set_element(zram, index, 0);
593 put_entry_bdev(zram, entry);
594}
595
596#else
597static bool zram_wb_enabled(struct zram *zram) { return false; }
598static inline void reset_bdev(struct zram *zram) {};
599static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
600 u32 index, struct bio *parent,
601 unsigned long *pentry)
602
603{
604 return -EIO;
605}
606
607static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
608 unsigned long entry, struct bio *parent, bool sync)
609{
610 return -EIO;
611}
612static void zram_wb_clear(struct zram *zram, u32 index) {}
613#endif
614
615
273/* 616/*
274 * We switched to per-cpu streams and this attr is not needed anymore. 617 * We switched to per-cpu streams and this attr is not needed anymore.
275 * However, we will keep it around for some time, because: 618 * However, we will keep it around for some time, because:
@@ -453,30 +796,6 @@ static bool zram_same_page_read(struct zram *zram, u32 index,
453 return false; 796 return false;
454} 797}
455 798
456static bool zram_same_page_write(struct zram *zram, u32 index,
457 struct page *page)
458{
459 unsigned long element;
460 void *mem = kmap_atomic(page);
461
462 if (page_same_filled(mem, &element)) {
463 kunmap_atomic(mem);
464 /* Free memory associated with this sector now. */
465 zram_slot_lock(zram, index);
466 zram_free_page(zram, index);
467 zram_set_flag(zram, index, ZRAM_SAME);
468 zram_set_element(zram, index, element);
469 zram_slot_unlock(zram, index);
470
471 atomic64_inc(&zram->stats.same_pages);
472 atomic64_inc(&zram->stats.pages_stored);
473 return true;
474 }
475 kunmap_atomic(mem);
476
477 return false;
478}
479
480static void zram_meta_free(struct zram *zram, u64 disksize) 799static void zram_meta_free(struct zram *zram, u64 disksize)
481{ 800{
482 size_t num_pages = disksize >> PAGE_SHIFT; 801 size_t num_pages = disksize >> PAGE_SHIFT;
@@ -515,7 +834,13 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
515 */ 834 */
516static void zram_free_page(struct zram *zram, size_t index) 835static void zram_free_page(struct zram *zram, size_t index)
517{ 836{
518 unsigned long handle = zram_get_handle(zram, index); 837 unsigned long handle;
838
839 if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
840 zram_wb_clear(zram, index);
841 atomic64_dec(&zram->stats.pages_stored);
842 return;
843 }
519 844
520 /* 845 /*
521 * No memory is allocated for same element filled pages. 846 * No memory is allocated for same element filled pages.
@@ -529,6 +854,7 @@ static void zram_free_page(struct zram *zram, size_t index)
529 return; 854 return;
530 } 855 }
531 856
857 handle = zram_get_handle(zram, index);
532 if (!handle) 858 if (!handle)
533 return; 859 return;
534 860
@@ -542,13 +868,31 @@ static void zram_free_page(struct zram *zram, size_t index)
542 zram_set_obj_size(zram, index, 0); 868 zram_set_obj_size(zram, index, 0);
543} 869}
544 870
545static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) 871static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
872 struct bio *bio, bool partial_io)
546{ 873{
547 int ret; 874 int ret;
548 unsigned long handle; 875 unsigned long handle;
549 unsigned int size; 876 unsigned int size;
550 void *src, *dst; 877 void *src, *dst;
551 878
879 if (zram_wb_enabled(zram)) {
880 zram_slot_lock(zram, index);
881 if (zram_test_flag(zram, index, ZRAM_WB)) {
882 struct bio_vec bvec;
883
884 zram_slot_unlock(zram, index);
885
886 bvec.bv_page = page;
887 bvec.bv_len = PAGE_SIZE;
888 bvec.bv_offset = 0;
889 return read_from_bdev(zram, &bvec,
890 zram_get_element(zram, index),
891 bio, partial_io);
892 }
893 zram_slot_unlock(zram, index);
894 }
895
552 if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) 896 if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
553 return 0; 897 return 0;
554 898
@@ -581,7 +925,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
581} 925}
582 926
583static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, 927static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
584 u32 index, int offset) 928 u32 index, int offset, struct bio *bio)
585{ 929{
586 int ret; 930 int ret;
587 struct page *page; 931 struct page *page;
@@ -594,7 +938,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
594 return -ENOMEM; 938 return -ENOMEM;
595 } 939 }
596 940
597 ret = zram_decompress_page(zram, page, index); 941 ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
598 if (unlikely(ret)) 942 if (unlikely(ret))
599 goto out; 943 goto out;
600 944
@@ -613,30 +957,57 @@ out:
613 return ret; 957 return ret;
614} 958}
615 959
616static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, 960static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
617 struct page *page, 961 u32 index, struct bio *bio)
618 unsigned long *out_handle, unsigned int *out_comp_len)
619{ 962{
620 int ret; 963 int ret = 0;
621 unsigned int comp_len;
622 void *src;
623 unsigned long alloced_pages; 964 unsigned long alloced_pages;
624 unsigned long handle = 0; 965 unsigned long handle = 0;
966 unsigned int comp_len = 0;
967 void *src, *dst, *mem;
968 struct zcomp_strm *zstrm;
969 struct page *page = bvec->bv_page;
970 unsigned long element = 0;
971 enum zram_pageflags flags = 0;
972 bool allow_wb = true;
973
974 mem = kmap_atomic(page);
975 if (page_same_filled(mem, &element)) {
976 kunmap_atomic(mem);
977 /* Free memory associated with this sector now. */
978 flags = ZRAM_SAME;
979 atomic64_inc(&zram->stats.same_pages);
980 goto out;
981 }
982 kunmap_atomic(mem);
625 983
626compress_again: 984compress_again:
985 zstrm = zcomp_stream_get(zram->comp);
627 src = kmap_atomic(page); 986 src = kmap_atomic(page);
628 ret = zcomp_compress(*zstrm, src, &comp_len); 987 ret = zcomp_compress(zstrm, src, &comp_len);
629 kunmap_atomic(src); 988 kunmap_atomic(src);
630 989
631 if (unlikely(ret)) { 990 if (unlikely(ret)) {
991 zcomp_stream_put(zram->comp);
632 pr_err("Compression failed! err=%d\n", ret); 992 pr_err("Compression failed! err=%d\n", ret);
633 if (handle) 993 zs_free(zram->mem_pool, handle);
634 zs_free(zram->mem_pool, handle);
635 return ret; 994 return ret;
636 } 995 }
637 996
638 if (unlikely(comp_len > max_zpage_size)) 997 if (unlikely(comp_len > max_zpage_size)) {
998 if (zram_wb_enabled(zram) && allow_wb) {
999 zcomp_stream_put(zram->comp);
1000 ret = write_to_bdev(zram, bvec, index, bio, &element);
1001 if (!ret) {
1002 flags = ZRAM_WB;
1003 ret = 1;
1004 goto out;
1005 }
1006 allow_wb = false;
1007 goto compress_again;
1008 }
639 comp_len = PAGE_SIZE; 1009 comp_len = PAGE_SIZE;
1010 }
640 1011
641 /* 1012 /*
642 * handle allocation has 2 paths: 1013 * handle allocation has 2 paths:
@@ -663,7 +1034,6 @@ compress_again:
663 handle = zs_malloc(zram->mem_pool, comp_len, 1034 handle = zs_malloc(zram->mem_pool, comp_len,
664 GFP_NOIO | __GFP_HIGHMEM | 1035 GFP_NOIO | __GFP_HIGHMEM |
665 __GFP_MOVABLE); 1036 __GFP_MOVABLE);
666 *zstrm = zcomp_stream_get(zram->comp);
667 if (handle) 1037 if (handle)
668 goto compress_again; 1038 goto compress_again;
669 return -ENOMEM; 1039 return -ENOMEM;
@@ -673,34 +1043,11 @@ compress_again:
673 update_used_max(zram, alloced_pages); 1043 update_used_max(zram, alloced_pages);
674 1044
675 if (zram->limit_pages && alloced_pages > zram->limit_pages) { 1045 if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1046 zcomp_stream_put(zram->comp);
676 zs_free(zram->mem_pool, handle); 1047 zs_free(zram->mem_pool, handle);
677 return -ENOMEM; 1048 return -ENOMEM;
678 } 1049 }
679 1050
680 *out_handle = handle;
681 *out_comp_len = comp_len;
682 return 0;
683}
684
685static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
686{
687 int ret;
688 unsigned long handle;
689 unsigned int comp_len;
690 void *src, *dst;
691 struct zcomp_strm *zstrm;
692 struct page *page = bvec->bv_page;
693
694 if (zram_same_page_write(zram, index, page))
695 return 0;
696
697 zstrm = zcomp_stream_get(zram->comp);
698 ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
699 if (ret) {
700 zcomp_stream_put(zram->comp);
701 return ret;
702 }
703
704 dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); 1051 dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
705 1052
706 src = zstrm->buffer; 1053 src = zstrm->buffer;
@@ -712,25 +1059,31 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
712 1059
713 zcomp_stream_put(zram->comp); 1060 zcomp_stream_put(zram->comp);
714 zs_unmap_object(zram->mem_pool, handle); 1061 zs_unmap_object(zram->mem_pool, handle);
715 1062 atomic64_add(comp_len, &zram->stats.compr_data_size);
1063out:
716 /* 1064 /*
717 * Free memory associated with this sector 1065 * Free memory associated with this sector
718 * before overwriting unused sectors. 1066 * before overwriting unused sectors.
719 */ 1067 */
720 zram_slot_lock(zram, index); 1068 zram_slot_lock(zram, index);
721 zram_free_page(zram, index); 1069 zram_free_page(zram, index);
722 zram_set_handle(zram, index, handle); 1070
723 zram_set_obj_size(zram, index, comp_len); 1071 if (flags) {
1072 zram_set_flag(zram, index, flags);
1073 zram_set_element(zram, index, element);
1074 } else {
1075 zram_set_handle(zram, index, handle);
1076 zram_set_obj_size(zram, index, comp_len);
1077 }
724 zram_slot_unlock(zram, index); 1078 zram_slot_unlock(zram, index);
725 1079
726 /* Update stats */ 1080 /* Update stats */
727 atomic64_add(comp_len, &zram->stats.compr_data_size);
728 atomic64_inc(&zram->stats.pages_stored); 1081 atomic64_inc(&zram->stats.pages_stored);
729 return 0; 1082 return ret;
730} 1083}
731 1084
732static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, 1085static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
733 u32 index, int offset) 1086 u32 index, int offset, struct bio *bio)
734{ 1087{
735 int ret; 1088 int ret;
736 struct page *page = NULL; 1089 struct page *page = NULL;
@@ -748,7 +1101,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
748 if (!page) 1101 if (!page)
749 return -ENOMEM; 1102 return -ENOMEM;
750 1103
751 ret = zram_decompress_page(zram, page, index); 1104 ret = __zram_bvec_read(zram, page, index, bio, true);
752 if (ret) 1105 if (ret)
753 goto out; 1106 goto out;
754 1107
@@ -763,7 +1116,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
763 vec.bv_offset = 0; 1116 vec.bv_offset = 0;
764 } 1117 }
765 1118
766 ret = __zram_bvec_write(zram, &vec, index); 1119 ret = __zram_bvec_write(zram, &vec, index, bio);
767out: 1120out:
768 if (is_partial_io(bvec)) 1121 if (is_partial_io(bvec))
769 __free_page(page); 1122 __free_page(page);
@@ -808,8 +1161,13 @@ static void zram_bio_discard(struct zram *zram, u32 index,
808 } 1161 }
809} 1162}
810 1163
1164/*
1165 * Returns errno if it has some problem. Otherwise return 0 or 1.
1166 * Returns 0 if IO request was done synchronously
1167 * Returns 1 if IO request was successfully submitted.
1168 */
811static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, 1169static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
812 int offset, bool is_write) 1170 int offset, bool is_write, struct bio *bio)
813{ 1171{
814 unsigned long start_time = jiffies; 1172 unsigned long start_time = jiffies;
815 int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ; 1173 int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
@@ -820,16 +1178,16 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
820 1178
821 if (!is_write) { 1179 if (!is_write) {
822 atomic64_inc(&zram->stats.num_reads); 1180 atomic64_inc(&zram->stats.num_reads);
823 ret = zram_bvec_read(zram, bvec, index, offset); 1181 ret = zram_bvec_read(zram, bvec, index, offset, bio);
824 flush_dcache_page(bvec->bv_page); 1182 flush_dcache_page(bvec->bv_page);
825 } else { 1183 } else {
826 atomic64_inc(&zram->stats.num_writes); 1184 atomic64_inc(&zram->stats.num_writes);
827 ret = zram_bvec_write(zram, bvec, index, offset); 1185 ret = zram_bvec_write(zram, bvec, index, offset, bio);
828 } 1186 }
829 1187
830 generic_end_io_acct(rw_acct, &zram->disk->part0, start_time); 1188 generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
831 1189
832 if (unlikely(ret)) { 1190 if (unlikely(ret < 0)) {
833 if (!is_write) 1191 if (!is_write)
834 atomic64_inc(&zram->stats.failed_reads); 1192 atomic64_inc(&zram->stats.failed_reads);
835 else 1193 else
@@ -868,7 +1226,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
868 bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, 1226 bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
869 unwritten); 1227 unwritten);
870 if (zram_bvec_rw(zram, &bv, index, offset, 1228 if (zram_bvec_rw(zram, &bv, index, offset,
871 op_is_write(bio_op(bio))) < 0) 1229 op_is_write(bio_op(bio)), bio) < 0)
872 goto out; 1230 goto out;
873 1231
874 bv.bv_offset += bv.bv_len; 1232 bv.bv_offset += bv.bv_len;
@@ -922,16 +1280,18 @@ static void zram_slot_free_notify(struct block_device *bdev,
922static int zram_rw_page(struct block_device *bdev, sector_t sector, 1280static int zram_rw_page(struct block_device *bdev, sector_t sector,
923 struct page *page, bool is_write) 1281 struct page *page, bool is_write)
924{ 1282{
925 int offset, err = -EIO; 1283 int offset, ret;
926 u32 index; 1284 u32 index;
927 struct zram *zram; 1285 struct zram *zram;
928 struct bio_vec bv; 1286 struct bio_vec bv;
929 1287
1288 if (PageTransHuge(page))
1289 return -ENOTSUPP;
930 zram = bdev->bd_disk->private_data; 1290 zram = bdev->bd_disk->private_data;
931 1291
932 if (!valid_io_request(zram, sector, PAGE_SIZE)) { 1292 if (!valid_io_request(zram, sector, PAGE_SIZE)) {
933 atomic64_inc(&zram->stats.invalid_io); 1293 atomic64_inc(&zram->stats.invalid_io);
934 err = -EINVAL; 1294 ret = -EINVAL;
935 goto out; 1295 goto out;
936 } 1296 }
937 1297
@@ -942,7 +1302,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
942 bv.bv_len = PAGE_SIZE; 1302 bv.bv_len = PAGE_SIZE;
943 bv.bv_offset = 0; 1303 bv.bv_offset = 0;
944 1304
945 err = zram_bvec_rw(zram, &bv, index, offset, is_write); 1305 ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
946out: 1306out:
947 /* 1307 /*
948 * If I/O fails, just return error(ie, non-zero) without 1308 * If I/O fails, just return error(ie, non-zero) without
@@ -952,9 +1312,20 @@ out:
952 * bio->bi_end_io does things to handle the error 1312 * bio->bi_end_io does things to handle the error
953 * (e.g., SetPageError, set_page_dirty and extra works). 1313 * (e.g., SetPageError, set_page_dirty and extra works).
954 */ 1314 */
955 if (err == 0) 1315 if (unlikely(ret < 0))
1316 return ret;
1317
1318 switch (ret) {
1319 case 0:
956 page_endio(page, is_write, 0); 1320 page_endio(page, is_write, 0);
957 return err; 1321 break;
1322 case 1:
1323 ret = 0;
1324 break;
1325 default:
1326 WARN_ON(1);
1327 }
1328 return ret;
958} 1329}
959 1330
960static void zram_reset_device(struct zram *zram) 1331static void zram_reset_device(struct zram *zram)
@@ -983,6 +1354,7 @@ static void zram_reset_device(struct zram *zram)
983 zram_meta_free(zram, disksize); 1354 zram_meta_free(zram, disksize);
984 memset(&zram->stats, 0, sizeof(zram->stats)); 1355 memset(&zram->stats, 0, sizeof(zram->stats));
985 zcomp_destroy(comp); 1356 zcomp_destroy(comp);
1357 reset_bdev(zram);
986} 1358}
987 1359
988static ssize_t disksize_store(struct device *dev, 1360static ssize_t disksize_store(struct device *dev,
@@ -1108,6 +1480,9 @@ static DEVICE_ATTR_WO(mem_limit);
1108static DEVICE_ATTR_WO(mem_used_max); 1480static DEVICE_ATTR_WO(mem_used_max);
1109static DEVICE_ATTR_RW(max_comp_streams); 1481static DEVICE_ATTR_RW(max_comp_streams);
1110static DEVICE_ATTR_RW(comp_algorithm); 1482static DEVICE_ATTR_RW(comp_algorithm);
1483#ifdef CONFIG_ZRAM_WRITEBACK
1484static DEVICE_ATTR_RW(backing_dev);
1485#endif
1111 1486
1112static struct attribute *zram_disk_attrs[] = { 1487static struct attribute *zram_disk_attrs[] = {
1113 &dev_attr_disksize.attr, 1488 &dev_attr_disksize.attr,
@@ -1118,6 +1493,9 @@ static struct attribute *zram_disk_attrs[] = {
1118 &dev_attr_mem_used_max.attr, 1493 &dev_attr_mem_used_max.attr,
1119 &dev_attr_max_comp_streams.attr, 1494 &dev_attr_max_comp_streams.attr,
1120 &dev_attr_comp_algorithm.attr, 1495 &dev_attr_comp_algorithm.attr,
1496#ifdef CONFIG_ZRAM_WRITEBACK
1497 &dev_attr_backing_dev.attr,
1498#endif
1121 &dev_attr_io_stat.attr, 1499 &dev_attr_io_stat.attr,
1122 &dev_attr_mm_stat.attr, 1500 &dev_attr_mm_stat.attr,
1123 &dev_attr_debug_stat.attr, 1501 &dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e34e44d02e3e..31762db861e3 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -60,9 +60,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
60 60
61/* Flags for zram pages (table[page_no].value) */ 61/* Flags for zram pages (table[page_no].value) */
62enum zram_pageflags { 62enum zram_pageflags {
 63	 /* Page consists entirely of zeros */	 63	 /* Page consists of the same element */
64 ZRAM_SAME = ZRAM_FLAG_SHIFT, 64 ZRAM_SAME = ZRAM_FLAG_SHIFT,
65 ZRAM_ACCESS, /* page is now accessed */ 65 ZRAM_ACCESS, /* page is now accessed */
66 ZRAM_WB, /* page is stored on backing_device */
66 67
67 __NR_ZRAM_PAGEFLAGS, 68 __NR_ZRAM_PAGEFLAGS,
68}; 69};
@@ -115,5 +116,13 @@ struct zram {
115 * zram is claimed so open request will be failed 116 * zram is claimed so open request will be failed
116 */ 117 */
117 bool claim; /* Protected by bdev->bd_mutex */ 118 bool claim; /* Protected by bdev->bd_mutex */
119#ifdef CONFIG_ZRAM_WRITEBACK
120 struct file *backing_dev;
121 struct block_device *bdev;
122 unsigned int old_block_size;
123 unsigned long *bitmap;
124 unsigned long nr_pages;
125 spinlock_t bitmap_lock;
126#endif
118}; 127};
119#endif 128#endif
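[Editor's note: a hypothetical sketch, not from the patch.] The new fields only make sense together: nr_pages is the size of the backing device in pages, bitmap tracks which of those slots are in use, and bitmap_lock serialises allocation. One way a free slot might be reserved (the helper name and the choice to treat 0 as "no slot" are assumptions for illustration):

    static unsigned long zram_reserve_bdev_slot(struct zram *zram)
    {
            unsigned long blk_idx;

            spin_lock(&zram->bitmap_lock);
            /* skip slot 0 so that 0 can be returned as "nothing free" */
            blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
            if (blk_idx == zram->nr_pages) {
                    spin_unlock(&zram->bitmap_lock);
                    return 0;
            }
            set_bit(blk_idx, zram->bitmap);
            spin_unlock(&zram->bitmap_lock);

            return blk_idx;
    }
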
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a36216bd2a84..e4d4b6b41e26 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4308,10 +4308,10 @@ i915_drop_caches_set(void *data, u64 val)
4308 4308
4309 fs_reclaim_acquire(GFP_KERNEL); 4309 fs_reclaim_acquire(GFP_KERNEL);
4310 if (val & DROP_BOUND) 4310 if (val & DROP_BOUND)
4311 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); 4311 i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_BOUND);
4312 4312
4313 if (val & DROP_UNBOUND) 4313 if (val & DROP_UNBOUND)
4314 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_UNBOUND); 4314 i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_UNBOUND);
4315 4315
4316 if (val & DROP_SHRINK_ALL) 4316 if (val & DROP_SHRINK_ALL)
4317 i915_gem_shrink_all(dev_priv); 4317 i915_gem_shrink_all(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 60267e375e88..bd74641ab7f6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3742,6 +3742,7 @@ i915_gem_object_create_internal(struct drm_i915_private *dev_priv,
3742/* i915_gem_shrinker.c */ 3742/* i915_gem_shrinker.c */
3743unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv, 3743unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv,
3744 unsigned long target, 3744 unsigned long target,
3745 unsigned long *nr_scanned,
3745 unsigned flags); 3746 unsigned flags);
3746#define I915_SHRINK_PURGEABLE 0x1 3747#define I915_SHRINK_PURGEABLE 0x1
3747#define I915_SHRINK_UNBOUND 0x2 3748#define I915_SHRINK_UNBOUND 0x2
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b9e8e0d6e97b..287c6ead95b3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2354,7 +2354,7 @@ rebuild_st:
2354 goto err_sg; 2354 goto err_sg;
2355 } 2355 }
2356 2356
2357 i915_gem_shrink(dev_priv, 2 * page_count, *s++); 2357 i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2358 cond_resched(); 2358 cond_resched();
2359 2359
2360 /* We've tried hard to allocate the memory by reaping 2360 /* We've tried hard to allocate the memory by reaping
@@ -5015,7 +5015,7 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
5015 * the objects as well, see i915_gem_freeze() 5015 * the objects as well, see i915_gem_freeze()
5016 */ 5016 */
5017 5017
5018 i915_gem_shrink(dev_priv, -1UL, I915_SHRINK_UNBOUND); 5018 i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
5019 i915_gem_drain_freed_objects(dev_priv); 5019 i915_gem_drain_freed_objects(dev_priv);
5020 5020
5021 mutex_lock(&dev_priv->drm.struct_mutex); 5021 mutex_lock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index d60f38adc4c4..6c6b8e8592aa 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2062,7 +2062,7 @@ int i915_gem_gtt_prepare_pages(struct drm_i915_gem_object *obj,
2062 */ 2062 */
2063 GEM_BUG_ON(obj->mm.pages == pages); 2063 GEM_BUG_ON(obj->mm.pages == pages);
2064 } while (i915_gem_shrink(to_i915(obj->base.dev), 2064 } while (i915_gem_shrink(to_i915(obj->base.dev),
2065 obj->base.size >> PAGE_SHIFT, 2065 obj->base.size >> PAGE_SHIFT, NULL,
2066 I915_SHRINK_BOUND | 2066 I915_SHRINK_BOUND |
2067 I915_SHRINK_UNBOUND | 2067 I915_SHRINK_UNBOUND |
2068 I915_SHRINK_ACTIVE)); 2068 I915_SHRINK_ACTIVE));
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 77fb39808131..74002b2d1b6f 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -136,6 +136,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
136 * i915_gem_shrink - Shrink buffer object caches 136 * i915_gem_shrink - Shrink buffer object caches
137 * @dev_priv: i915 device 137 * @dev_priv: i915 device
138 * @target: amount of memory to make available, in pages 138 * @target: amount of memory to make available, in pages
139 * @nr_scanned: optional output for number of pages scanned (incremental)
139 * @flags: control flags for selecting cache types 140 * @flags: control flags for selecting cache types
140 * 141 *
141 * This function is the main interface to the shrinker. It will try to release 142 * This function is the main interface to the shrinker. It will try to release
@@ -158,7 +159,9 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
158 */ 159 */
159unsigned long 160unsigned long
160i915_gem_shrink(struct drm_i915_private *dev_priv, 161i915_gem_shrink(struct drm_i915_private *dev_priv,
161 unsigned long target, unsigned flags) 162 unsigned long target,
163 unsigned long *nr_scanned,
164 unsigned flags)
162{ 165{
163 const struct { 166 const struct {
164 struct list_head *list; 167 struct list_head *list;
@@ -169,6 +172,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
169 { NULL, 0 }, 172 { NULL, 0 },
170 }, *phase; 173 }, *phase;
171 unsigned long count = 0; 174 unsigned long count = 0;
175 unsigned long scanned = 0;
172 bool unlock; 176 bool unlock;
173 177
174 if (!shrinker_lock(dev_priv, &unlock)) 178 if (!shrinker_lock(dev_priv, &unlock))
@@ -249,6 +253,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
249 count += obj->base.size >> PAGE_SHIFT; 253 count += obj->base.size >> PAGE_SHIFT;
250 } 254 }
251 mutex_unlock(&obj->mm.lock); 255 mutex_unlock(&obj->mm.lock);
256 scanned += obj->base.size >> PAGE_SHIFT;
252 } 257 }
253 } 258 }
254 list_splice_tail(&still_in_list, phase->list); 259 list_splice_tail(&still_in_list, phase->list);
@@ -261,6 +266,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
261 266
262 shrinker_unlock(dev_priv, unlock); 267 shrinker_unlock(dev_priv, unlock);
263 268
269 if (nr_scanned)
270 *nr_scanned += scanned;
264 return count; 271 return count;
265} 272}
266 273
@@ -283,7 +290,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv)
283 unsigned long freed; 290 unsigned long freed;
284 291
285 intel_runtime_pm_get(dev_priv); 292 intel_runtime_pm_get(dev_priv);
286 freed = i915_gem_shrink(dev_priv, -1UL, 293 freed = i915_gem_shrink(dev_priv, -1UL, NULL,
287 I915_SHRINK_BOUND | 294 I915_SHRINK_BOUND |
288 I915_SHRINK_UNBOUND | 295 I915_SHRINK_UNBOUND |
289 I915_SHRINK_ACTIVE); 296 I915_SHRINK_ACTIVE);
@@ -329,23 +336,28 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
329 unsigned long freed; 336 unsigned long freed;
330 bool unlock; 337 bool unlock;
331 338
339 sc->nr_scanned = 0;
340
332 if (!shrinker_lock(dev_priv, &unlock)) 341 if (!shrinker_lock(dev_priv, &unlock))
333 return SHRINK_STOP; 342 return SHRINK_STOP;
334 343
335 freed = i915_gem_shrink(dev_priv, 344 freed = i915_gem_shrink(dev_priv,
336 sc->nr_to_scan, 345 sc->nr_to_scan,
346 &sc->nr_scanned,
337 I915_SHRINK_BOUND | 347 I915_SHRINK_BOUND |
338 I915_SHRINK_UNBOUND | 348 I915_SHRINK_UNBOUND |
339 I915_SHRINK_PURGEABLE); 349 I915_SHRINK_PURGEABLE);
340 if (freed < sc->nr_to_scan) 350 if (freed < sc->nr_to_scan)
341 freed += i915_gem_shrink(dev_priv, 351 freed += i915_gem_shrink(dev_priv,
342 sc->nr_to_scan - freed, 352 sc->nr_to_scan - sc->nr_scanned,
353 &sc->nr_scanned,
343 I915_SHRINK_BOUND | 354 I915_SHRINK_BOUND |
344 I915_SHRINK_UNBOUND); 355 I915_SHRINK_UNBOUND);
345 if (freed < sc->nr_to_scan && current_is_kswapd()) { 356 if (freed < sc->nr_to_scan && current_is_kswapd()) {
346 intel_runtime_pm_get(dev_priv); 357 intel_runtime_pm_get(dev_priv);
347 freed += i915_gem_shrink(dev_priv, 358 freed += i915_gem_shrink(dev_priv,
348 sc->nr_to_scan - freed, 359 sc->nr_to_scan - sc->nr_scanned,
360 &sc->nr_scanned,
349 I915_SHRINK_ACTIVE | 361 I915_SHRINK_ACTIVE |
350 I915_SHRINK_BOUND | 362 I915_SHRINK_BOUND |
351 I915_SHRINK_UNBOUND); 363 I915_SHRINK_UNBOUND);
@@ -354,7 +366,7 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
354 366
355 shrinker_unlock(dev_priv, unlock); 367 shrinker_unlock(dev_priv, unlock);
356 368
357 return freed; 369 return sc->nr_scanned ? freed : SHRINK_STOP;
358} 370}
359 371
360static bool 372static bool
@@ -453,7 +465,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
453 goto out; 465 goto out;
454 466
455 intel_runtime_pm_get(dev_priv); 467 intel_runtime_pm_get(dev_priv);
456 freed_pages += i915_gem_shrink(dev_priv, -1UL, 468 freed_pages += i915_gem_shrink(dev_priv, -1UL, NULL,
457 I915_SHRINK_BOUND | 469 I915_SHRINK_BOUND |
458 I915_SHRINK_UNBOUND | 470 I915_SHRINK_UNBOUND |
459 I915_SHRINK_ACTIVE | 471 I915_SHRINK_ACTIVE |
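[Editor's note: a generic sketch, not i915 code.] The i915 changes lean on the sc->nr_scanned field introduced elsewhere in this series: scan_objects() reports how many pages it examined, even when it could not free them, and returns SHRINK_STOP only when it made no progress at all. A generic sketch of that contract, with the my_cache_* names invented for the example:

    #include <linux/shrinker.h>

    static bool my_cache_try_free_one(void);        /* hypothetical helper */

    static unsigned long my_cache_scan(struct shrinker *shrinker,
                                       struct shrink_control *sc)
    {
            unsigned long freed = 0;

            sc->nr_scanned = 0;
            while (sc->nr_scanned < sc->nr_to_scan) {
                    sc->nr_scanned++;               /* one object examined */
                    if (my_cache_try_free_one())
                            freed++;
            }

            return sc->nr_scanned ? freed : SHRINK_STOP;
    }
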
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 14323faf8bd9..60491641a8d6 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1241,8 +1241,10 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector,
1241{ 1241{
1242 struct btt *btt = bdev->bd_disk->private_data; 1242 struct btt *btt = bdev->bd_disk->private_data;
1243 int rc; 1243 int rc;
1244 unsigned int len;
1244 1245
1245 rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector); 1246 len = hpage_nr_pages(page) * PAGE_SIZE;
1247 rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
1246 if (rc == 0) 1248 if (rc == 0)
1247 page_endio(page, is_write, 0); 1249 page_endio(page, is_write, 0);
1248 1250
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f7099adaabc0..e9aa453da50c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -80,22 +80,40 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
80static void write_pmem(void *pmem_addr, struct page *page, 80static void write_pmem(void *pmem_addr, struct page *page,
81 unsigned int off, unsigned int len) 81 unsigned int off, unsigned int len)
82{ 82{
83 void *mem = kmap_atomic(page); 83 unsigned int chunk;
84 84 void *mem;
85 memcpy_flushcache(pmem_addr, mem + off, len); 85
86 kunmap_atomic(mem); 86 while (len) {
87 mem = kmap_atomic(page);
88 chunk = min_t(unsigned int, len, PAGE_SIZE);
89 memcpy_flushcache(pmem_addr, mem + off, chunk);
90 kunmap_atomic(mem);
91 len -= chunk;
92 off = 0;
93 page++;
94 pmem_addr += PAGE_SIZE;
95 }
87} 96}
88 97
89static blk_status_t read_pmem(struct page *page, unsigned int off, 98static blk_status_t read_pmem(struct page *page, unsigned int off,
90 void *pmem_addr, unsigned int len) 99 void *pmem_addr, unsigned int len)
91{ 100{
101 unsigned int chunk;
92 int rc; 102 int rc;
93 void *mem = kmap_atomic(page); 103 void *mem;
94 104
95 rc = memcpy_mcsafe(mem + off, pmem_addr, len); 105 while (len) {
96 kunmap_atomic(mem); 106 mem = kmap_atomic(page);
97 if (rc) 107 chunk = min_t(unsigned int, len, PAGE_SIZE);
98 return BLK_STS_IOERR; 108 rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
109 kunmap_atomic(mem);
110 if (rc)
111 return BLK_STS_IOERR;
112 len -= chunk;
113 off = 0;
114 page++;
115 pmem_addr += PAGE_SIZE;
116 }
99 return BLK_STS_OK; 117 return BLK_STS_OK;
100} 118}
101 119
@@ -188,7 +206,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
188 struct pmem_device *pmem = bdev->bd_queue->queuedata; 206 struct pmem_device *pmem = bdev->bd_queue->queuedata;
189 blk_status_t rc; 207 blk_status_t rc;
190 208
191 rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector); 209 rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
210 0, is_write, sector);
192 211
193 /* 212 /*
194 * The ->rw_page interface is subtle and tricky. The core 213 * The ->rw_page interface is subtle and tricky. The core
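[Editor's note: a sketch, slightly generalized from the hunk above.] Both pmem copy helpers now walk the buffer one page at a time because kmap_atomic() can only map a single page, while rw_page() may now be handed a transparent huge page whose length exceeds PAGE_SIZE. The pattern, generalized so that a nonzero starting offset only shortens the first chunk and the destination advances by the actual chunk size; the helper name is invented:

    static void copy_page_range_to_pmem(void *pmem_addr, struct page *page,
                                        unsigned int off, unsigned int len)
    {
            while (len) {
                    unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off);
                    void *mem = kmap_atomic(page);

                    memcpy_flushcache(pmem_addr, mem + off, chunk);
                    kunmap_atomic(mem);

                    len -= chunk;
                    pmem_addr += chunk;
                    off = 0;               /* only the first chunk can be partial */
                    page++;
            }
    }
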
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 103ca5e1267b..64c58eb26159 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
151 return FSCACHE_CHECKAUX_OKAY; 151 return FSCACHE_CHECKAUX_OKAY;
152} 152}
153 153
154static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
155{
156 struct v9fs_inode *v9inode = cookie_netfs_data;
157 struct pagevec pvec;
158 pgoff_t first;
159 int loop, nr_pages;
160
161 pagevec_init(&pvec, 0);
162 first = 0;
163
164 for (;;) {
165 nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
166 first,
167 PAGEVEC_SIZE - pagevec_count(&pvec));
168 if (!nr_pages)
169 break;
170
171 for (loop = 0; loop < nr_pages; loop++)
172 ClearPageFsCache(pvec.pages[loop]);
173
174 first = pvec.pages[nr_pages - 1]->index + 1;
175
176 pvec.nr = nr_pages;
177 pagevec_release(&pvec);
178 cond_resched();
179 }
180}
181
182const struct fscache_cookie_def v9fs_cache_inode_index_def = { 154const struct fscache_cookie_def v9fs_cache_inode_index_def = {
183 .name = "9p.inode", 155 .name = "9p.inode",
184 .type = FSCACHE_COOKIE_TYPE_DATAFILE, 156 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
186 .get_attr = v9fs_cache_inode_get_attr, 158 .get_attr = v9fs_cache_inode_get_attr,
187 .get_aux = v9fs_cache_inode_get_aux, 159 .get_aux = v9fs_cache_inode_get_aux,
188 .check_aux = v9fs_cache_inode_check_aux, 160 .check_aux = v9fs_cache_inode_check_aux,
189 .now_uncached = v9fs_cache_inode_now_uncached,
190}; 161};
191 162
192void v9fs_cache_inode_get_cookie(struct inode *inode) 163void v9fs_cache_inode_get_cookie(struct inode *inode)
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 577763c3d88b..1fe855191261 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
39static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, 39static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
40 const void *buffer, 40 const void *buffer,
41 uint16_t buflen); 41 uint16_t buflen);
42static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
43 42
44struct fscache_netfs afs_cache_netfs = { 43struct fscache_netfs afs_cache_netfs = {
45 .name = "afs", 44 .name = "afs",
@@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = {
75 .get_attr = afs_vnode_cache_get_attr, 74 .get_attr = afs_vnode_cache_get_attr,
76 .get_aux = afs_vnode_cache_get_aux, 75 .get_aux = afs_vnode_cache_get_aux,
77 .check_aux = afs_vnode_cache_check_aux, 76 .check_aux = afs_vnode_cache_check_aux,
78 .now_uncached = afs_vnode_cache_now_uncached,
79}; 77};
80 78
81/* 79/*
@@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
359 _leave(" = SUCCESS"); 357 _leave(" = SUCCESS");
360 return FSCACHE_CHECKAUX_OKAY; 358 return FSCACHE_CHECKAUX_OKAY;
361} 359}
362
363/*
364 * indication the cookie is no longer uncached
365 * - this function is called when the backing store currently caching a cookie
366 * is removed
367 * - the netfs should use this to clean up any markers indicating cached pages
368 * - this is mandatory for any object that may have data
369 */
370static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
371{
372 struct afs_vnode *vnode = cookie_netfs_data;
373 struct pagevec pvec;
374 pgoff_t first;
375 int loop, nr_pages;
376
377 _enter("{%x,%x,%Lx}",
378 vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
379
380 pagevec_init(&pvec, 0);
381 first = 0;
382
383 for (;;) {
384 /* grab a bunch of pages to clean */
385 nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
386 first,
387 PAGEVEC_SIZE - pagevec_count(&pvec));
388 if (!nr_pages)
389 break;
390
391 for (loop = 0; loop < nr_pages; loop++)
392 ClearPageFsCache(pvec.pages[loop]);
393
394 first = pvec.pages[nr_pages - 1]->index + 1;
395
396 pvec.nr = nr_pages;
397 pagevec_release(&pvec);
398 cond_resched();
399 }
400
401 _leave("");
402}
diff --git a/fs/buffer.c b/fs/buffer.c
index 5715dac7821f..50da0e102ca0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1627 struct pagevec pvec; 1627 struct pagevec pvec;
1628 pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); 1628 pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1629 pgoff_t end; 1629 pgoff_t end;
1630 int i; 1630 int i, count;
1631 struct buffer_head *bh; 1631 struct buffer_head *bh;
1632 struct buffer_head *head; 1632 struct buffer_head *head;
1633 1633
1634 end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); 1634 end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1635 pagevec_init(&pvec, 0); 1635 pagevec_init(&pvec, 0);
1636 while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, 1636 while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
1637 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 1637 count = pagevec_count(&pvec);
1638 for (i = 0; i < pagevec_count(&pvec); i++) { 1638 for (i = 0; i < count; i++) {
1639 struct page *page = pvec.pages[i]; 1639 struct page *page = pvec.pages[i];
1640 1640
1641 index = page->index;
1642 if (index > end)
1643 break;
1644 if (!page_has_buffers(page)) 1641 if (!page_has_buffers(page))
1645 continue; 1642 continue;
1646 /* 1643 /*
@@ -1670,7 +1667,9 @@ unlock_page:
1670 } 1667 }
1671 pagevec_release(&pvec); 1668 pagevec_release(&pvec);
1672 cond_resched(); 1669 cond_resched();
1673 index++; 1670 /* End of range already reached? */
1671 if (index > end || !index)
1672 break;
1674 } 1673 }
1675} 1674}
1676EXPORT_SYMBOL(clean_bdev_aliases); 1675EXPORT_SYMBOL(clean_bdev_aliases);
@@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
3549 pagevec_init(&pvec, 0); 3548 pagevec_init(&pvec, 0);
3550 3549
3551 do { 3550 do {
3552 unsigned want, nr_pages, i; 3551 unsigned nr_pages, i;
3553 3552
3554 want = min_t(unsigned, end - index, PAGEVEC_SIZE); 3553 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
3555 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); 3554 end - 1);
3556 if (nr_pages == 0) 3555 if (nr_pages == 0)
3557 break; 3556 break;
3558 3557
@@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
3573 lastoff < page_offset(page)) 3572 lastoff < page_offset(page))
3574 goto check_range; 3573 goto check_range;
3575 3574
3576 /* Searching done if the page index is out of range. */
3577 if (page->index >= end)
3578 goto not_found;
3579
3580 lock_page(page); 3575 lock_page(page);
3581 if (likely(page->mapping == inode->i_mapping) && 3576 if (likely(page->mapping == inode->i_mapping) &&
3582 page_has_buffers(page)) { 3577 page_has_buffers(page)) {
@@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
3589 unlock_page(page); 3584 unlock_page(page);
3590 lastoff = page_offset(page) + PAGE_SIZE; 3585 lastoff = page_offset(page) + PAGE_SIZE;
3591 } 3586 }
3592
3593 /* Searching done if fewer pages returned than wanted. */
3594 if (nr_pages < want)
3595 break;
3596
3597 index = pvec.pages[i - 1]->index + 1;
3598 pagevec_release(&pvec); 3587 pagevec_release(&pvec);
3599 } while (index < end); 3588 } while (index < end);
3600 3589
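[Editor's note: a sketch of the conversion pattern.] The fs/buffer.c conversions (and the ext4 ones further down) all follow the same shape: pagevec_lookup_range() takes the index by reference, advances it past the last page returned, and never returns pages beyond 'end', so the manual "page->index > end" and "fewer pages than wanted" checks disappear. The canonical loop, with process_page() standing in for the caller-specific per-page work:

    #include <linux/pagevec.h>
    #include <linux/pagemap.h>

    static void process_page(struct page *page);    /* caller-specific work */

    static void walk_range(struct address_space *mapping,
                           pgoff_t index, pgoff_t end)
    {
            struct pagevec pvec;
            unsigned int i;

            pagevec_init(&pvec, 0);
            while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
                    for (i = 0; i < pagevec_count(&pvec); i++)
                            process_page(pvec.pages[i]);
                    pagevec_release(&pvec);
                    cond_resched();
                    /* 'index' now points past the last page we saw */
                    if (index > end || !index)
                            break;
            }
    }
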
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 337f88673ed9..174d6e6569a8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
194 return FSCACHE_CHECKAUX_OKAY; 194 return FSCACHE_CHECKAUX_OKAY;
195} 195}
196 196
197static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
198{
199 struct ceph_inode_info* ci = cookie_netfs_data;
200 struct pagevec pvec;
201 pgoff_t first;
202 int loop, nr_pages;
203
204 pagevec_init(&pvec, 0);
205 first = 0;
206
207 dout("ceph inode 0x%p now uncached", ci);
208
209 while (1) {
210 nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
211 PAGEVEC_SIZE - pagevec_count(&pvec));
212
213 if (!nr_pages)
214 break;
215
216 for (loop = 0; loop < nr_pages; loop++)
217 ClearPageFsCache(pvec.pages[loop]);
218
219 first = pvec.pages[nr_pages - 1]->index + 1;
220
221 pvec.nr = nr_pages;
222 pagevec_release(&pvec);
223 cond_resched();
224 }
225}
226
227static const struct fscache_cookie_def ceph_fscache_inode_object_def = { 197static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
228 .name = "CEPH.inode", 198 .name = "CEPH.inode",
229 .type = FSCACHE_COOKIE_TYPE_DATAFILE, 199 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
231 .get_attr = ceph_fscache_inode_get_attr, 201 .get_attr = ceph_fscache_inode_get_attr,
232 .get_aux = ceph_fscache_inode_get_aux, 202 .get_aux = ceph_fscache_inode_get_aux,
233 .check_aux = ceph_fscache_inode_check_aux, 203 .check_aux = ceph_fscache_inode_check_aux,
234 .now_uncached = ceph_fscache_inode_now_uncached,
235}; 204};
236 205
237void ceph_fscache_register_inode_cookie(struct inode *inode) 206void ceph_fscache_register_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 6c665bf4a27c..2c14020e5e1d 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
292 return FSCACHE_CHECKAUX_OKAY; 292 return FSCACHE_CHECKAUX_OKAY;
293} 293}
294 294
295static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
296{
297 struct cifsInodeInfo *cifsi = cookie_netfs_data;
298 struct pagevec pvec;
299 pgoff_t first;
300 int loop, nr_pages;
301
302 pagevec_init(&pvec, 0);
303 first = 0;
304
305 cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
306
307 for (;;) {
308 nr_pages = pagevec_lookup(&pvec,
309 cifsi->vfs_inode.i_mapping, first,
310 PAGEVEC_SIZE - pagevec_count(&pvec));
311 if (!nr_pages)
312 break;
313
314 for (loop = 0; loop < nr_pages; loop++)
315 ClearPageFsCache(pvec.pages[loop]);
316
317 first = pvec.pages[nr_pages - 1]->index + 1;
318
319 pvec.nr = nr_pages;
320 pagevec_release(&pvec);
321 cond_resched();
322 }
323}
324
325const struct fscache_cookie_def cifs_fscache_inode_object_def = { 295const struct fscache_cookie_def cifs_fscache_inode_object_def = {
326 .name = "CIFS.uniqueid", 296 .name = "CIFS.uniqueid",
327 .type = FSCACHE_COOKIE_TYPE_DATAFILE, 297 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = {
329 .get_attr = cifs_fscache_inode_get_attr, 299 .get_attr = cifs_fscache_inode_get_attr,
330 .get_aux = cifs_fscache_inode_get_aux, 300 .get_aux = cifs_fscache_inode_get_aux,
331 .check_aux = cifs_fscache_inode_check_aux, 301 .check_aux = cifs_fscache_inode_check_aux,
332 .now_uncached = cifs_fscache_inode_now_uncached,
333}; 302};
diff --git a/fs/dax.c b/fs/dax.c
index ab925dc6647a..6afcacb3a87b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -42,6 +42,9 @@
42#define DAX_WAIT_TABLE_BITS 12 42#define DAX_WAIT_TABLE_BITS 12
43#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 43#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
44 44
45/* The 'colour' (ie low bits) within a PMD of a page offset. */
46#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
47
45static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 48static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
46 49
47static int __init init_dax_wait_table(void) 50static int __init init_dax_wait_table(void)
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)
54} 57}
55fs_initcall(init_dax_wait_table); 58fs_initcall(init_dax_wait_table);
56 59
60/*
 61	 * We use the lowest available bit in an exceptional entry for locking, one bit
 62	 * for the entry size (PMD) and two more to tell us if the entry is a zero page
 63	 * or an empty entry that is just used for locking. In total, four special bits.
64 *
65 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
66 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
67 * block allocation.
68 */
69#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
70#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
71#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
72#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
73#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
74
75static unsigned long dax_radix_sector(void *entry)
76{
77 return (unsigned long)entry >> RADIX_DAX_SHIFT;
78}
79
80static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
81{
82 return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
83 ((unsigned long)sector << RADIX_DAX_SHIFT) |
84 RADIX_DAX_ENTRY_LOCK);
85}
86
87static unsigned int dax_radix_order(void *entry)
88{
89 if ((unsigned long)entry & RADIX_DAX_PMD)
90 return PMD_SHIFT - PAGE_SHIFT;
91 return 0;
92}
93
57static int dax_is_pmd_entry(void *entry) 94static int dax_is_pmd_entry(void *entry)
58{ 95{
59 return (unsigned long)entry & RADIX_DAX_PMD; 96 return (unsigned long)entry & RADIX_DAX_PMD;
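[Editor's note: a standalone illustration, not kernel code.] The four flag bits sit directly above the exceptional-entry marker and the sector lives in the bits above RADIX_DAX_SHIFT, so encoding and decoding are plain shifts and masks. A small userspace program showing the layout; it assumes RADIX_TREE_EXCEPTIONAL_SHIFT == 2 and uses local stand-in macros rather than the kernel's:

    #include <stdio.h>

    #define EXCEPTIONAL_SHIFT   2                   /* assumed value */
    #define EXCEPTIONAL_ENTRY   2UL
    #define DAX_SHIFT           (EXCEPTIONAL_SHIFT + 4)
    #define DAX_ENTRY_LOCK      (1UL << EXCEPTIONAL_SHIFT)
    #define DAX_PMD             (1UL << (EXCEPTIONAL_SHIFT + 1))

    static unsigned long locked_entry(unsigned long sector, unsigned long flags)
    {
            return EXCEPTIONAL_ENTRY | flags | (sector << DAX_SHIFT) |
                    DAX_ENTRY_LOCK;
    }

    int main(void)
    {
            unsigned long e = locked_entry(0x1234, DAX_PMD);

            printf("entry  = %#lx\n", e);
            printf("sector = %#lx\n", e >> DAX_SHIFT);      /* prints 0x1234 */
            printf("is pmd = %d\n", !!(e & DAX_PMD));       /* prints 1 */
            return 0;
    }
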
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)
66 103
67static int dax_is_zero_entry(void *entry) 104static int dax_is_zero_entry(void *entry)
68{ 105{
69 return (unsigned long)entry & RADIX_DAX_HZP; 106 return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
70} 107}
71 108
72static int dax_is_empty_entry(void *entry) 109static int dax_is_empty_entry(void *entry)
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
98 * the range covered by the PMD map to the same bit lock. 135 * the range covered by the PMD map to the same bit lock.
99 */ 136 */
100 if (dax_is_pmd_entry(entry)) 137 if (dax_is_pmd_entry(entry))
101 index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); 138 index &= ~PG_PMD_COLOUR;
102 139
103 key->mapping = mapping; 140 key->mapping = mapping;
104 key->entry_start = index; 141 key->entry_start = index;
@@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
121} 158}
122 159
123/* 160/*
161 * We do not necessarily hold the mapping->tree_lock when we call this
162 * function so it is possible that 'entry' is no longer a valid item in the
163 * radix tree. This is okay because all we really need to do is to find the
164 * correct waitqueue where tasks might be waiting for that old 'entry' and
165 * wake them.
166 */
167static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
168 pgoff_t index, void *entry, bool wake_all)
169{
170 struct exceptional_entry_key key;
171 wait_queue_head_t *wq;
172
173 wq = dax_entry_waitqueue(mapping, index, entry, &key);
174
175 /*
176 * Checking for locked entry and prepare_to_wait_exclusive() happens
177 * under mapping->tree_lock, ditto for entry handling in our callers.
178 * So at this point all tasks that could have seen our entry locked
179 * must be in the waitqueue and the following check will see them.
180 */
181 if (waitqueue_active(wq))
182 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
183}
184
185/*
124 * Check whether the given slot is locked. The function must be called with 186 * Check whether the given slot is locked. The function must be called with
125 * mapping->tree_lock held 187 * mapping->tree_lock held
126 */ 188 */
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
181 for (;;) { 243 for (;;) {
182 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, 244 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
183 &slot); 245 &slot);
184 if (!entry || !radix_tree_exceptional_entry(entry) || 246 if (!entry ||
247 WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
185 !slot_locked(mapping, slot)) { 248 !slot_locked(mapping, slot)) {
186 if (slotp) 249 if (slotp)
187 *slotp = slot; 250 *slotp = slot;
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
216} 279}
217 280
218static void put_locked_mapping_entry(struct address_space *mapping, 281static void put_locked_mapping_entry(struct address_space *mapping,
219 pgoff_t index, void *entry) 282 pgoff_t index)
220{ 283{
221 if (!radix_tree_exceptional_entry(entry)) { 284 dax_unlock_mapping_entry(mapping, index);
222 unlock_page(entry);
223 put_page(entry);
224 } else {
225 dax_unlock_mapping_entry(mapping, index);
226 }
227} 285}
228 286
229/* 287/*
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
233static void put_unlocked_mapping_entry(struct address_space *mapping, 291static void put_unlocked_mapping_entry(struct address_space *mapping,
234 pgoff_t index, void *entry) 292 pgoff_t index, void *entry)
235{ 293{
236 if (!radix_tree_exceptional_entry(entry)) 294 if (!entry)
237 return; 295 return;
238 296
239 /* We have to wake up next waiter for the radix tree entry lock */ 297 /* We have to wake up next waiter for the radix tree entry lock */
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
241} 299}
242 300
243/* 301/*
244 * Find radix tree entry at given index. If it points to a page, return with 302 * Find radix tree entry at given index. If it points to an exceptional entry,
245 * the page locked. If it points to the exceptional entry, return with the 303 * return it with the radix tree entry locked. If the radix tree doesn't
246 * radix tree entry locked. If the radix tree doesn't contain given index, 304 * contain given index, create an empty exceptional entry for the index and
247 * create empty exceptional entry for the index and return with it locked. 305 * return with it locked.
248 * 306 *
249 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will 307 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
250 * either return that locked entry or will return an error. This error will 308 * either return that locked entry or will return an error. This error will
251 * happen if there are any 4k entries (either zero pages or DAX entries) 309 * happen if there are any 4k entries within the 2MiB range that we are
252 * within the 2MiB range that we are requesting. 310 * requesting.
253 * 311 *
254 * We always favor 4k entries over 2MiB entries. There isn't a flow where we 312 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
255 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB 313 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
@@ -276,18 +334,21 @@ restart:
276 spin_lock_irq(&mapping->tree_lock); 334 spin_lock_irq(&mapping->tree_lock);
277 entry = get_unlocked_mapping_entry(mapping, index, &slot); 335 entry = get_unlocked_mapping_entry(mapping, index, &slot);
278 336
337 if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
338 entry = ERR_PTR(-EIO);
339 goto out_unlock;
340 }
341
279 if (entry) { 342 if (entry) {
280 if (size_flag & RADIX_DAX_PMD) { 343 if (size_flag & RADIX_DAX_PMD) {
281 if (!radix_tree_exceptional_entry(entry) || 344 if (dax_is_pte_entry(entry)) {
282 dax_is_pte_entry(entry)) {
283 put_unlocked_mapping_entry(mapping, index, 345 put_unlocked_mapping_entry(mapping, index,
284 entry); 346 entry);
285 entry = ERR_PTR(-EEXIST); 347 entry = ERR_PTR(-EEXIST);
286 goto out_unlock; 348 goto out_unlock;
287 } 349 }
288 } else { /* trying to grab a PTE entry */ 350 } else { /* trying to grab a PTE entry */
289 if (radix_tree_exceptional_entry(entry) && 351 if (dax_is_pmd_entry(entry) &&
290 dax_is_pmd_entry(entry) &&
291 (dax_is_zero_entry(entry) || 352 (dax_is_zero_entry(entry) ||
292 dax_is_empty_entry(entry))) { 353 dax_is_empty_entry(entry))) {
293 pmd_downgrade = true; 354 pmd_downgrade = true;
@@ -321,7 +382,7 @@ restart:
321 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 382 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
322 if (err) { 383 if (err) {
323 if (pmd_downgrade) 384 if (pmd_downgrade)
324 put_locked_mapping_entry(mapping, index, entry); 385 put_locked_mapping_entry(mapping, index);
325 return ERR_PTR(err); 386 return ERR_PTR(err);
326 } 387 }
327 spin_lock_irq(&mapping->tree_lock); 388 spin_lock_irq(&mapping->tree_lock);
@@ -371,52 +432,12 @@ restart:
371 spin_unlock_irq(&mapping->tree_lock); 432 spin_unlock_irq(&mapping->tree_lock);
372 return entry; 433 return entry;
373 } 434 }
374 /* Normal page in radix tree? */
375 if (!radix_tree_exceptional_entry(entry)) {
376 struct page *page = entry;
377
378 get_page(page);
379 spin_unlock_irq(&mapping->tree_lock);
380 lock_page(page);
381 /* Page got truncated? Retry... */
382 if (unlikely(page->mapping != mapping)) {
383 unlock_page(page);
384 put_page(page);
385 goto restart;
386 }
387 return page;
388 }
389 entry = lock_slot(mapping, slot); 435 entry = lock_slot(mapping, slot);
390 out_unlock: 436 out_unlock:
391 spin_unlock_irq(&mapping->tree_lock); 437 spin_unlock_irq(&mapping->tree_lock);
392 return entry; 438 return entry;
393} 439}
394 440
395/*
396 * We do not necessarily hold the mapping->tree_lock when we call this
397 * function so it is possible that 'entry' is no longer a valid item in the
398 * radix tree. This is okay because all we really need to do is to find the
399 * correct waitqueue where tasks might be waiting for that old 'entry' and
400 * wake them.
401 */
402void dax_wake_mapping_entry_waiter(struct address_space *mapping,
403 pgoff_t index, void *entry, bool wake_all)
404{
405 struct exceptional_entry_key key;
406 wait_queue_head_t *wq;
407
408 wq = dax_entry_waitqueue(mapping, index, entry, &key);
409
410 /*
411 * Checking for locked entry and prepare_to_wait_exclusive() happens
412 * under mapping->tree_lock, ditto for entry handling in our callers.
413 * So at this point all tasks that could have seen our entry locked
414 * must be in the waitqueue and the following check will see them.
415 */
416 if (waitqueue_active(wq))
417 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
418}
419
420static int __dax_invalidate_mapping_entry(struct address_space *mapping, 441static int __dax_invalidate_mapping_entry(struct address_space *mapping,
421 pgoff_t index, bool trunc) 442 pgoff_t index, bool trunc)
422{ 443{
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
426 447
427 spin_lock_irq(&mapping->tree_lock); 448 spin_lock_irq(&mapping->tree_lock);
428 entry = get_unlocked_mapping_entry(mapping, index, NULL); 449 entry = get_unlocked_mapping_entry(mapping, index, NULL);
429 if (!entry || !radix_tree_exceptional_entry(entry)) 450 if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
430 goto out; 451 goto out;
431 if (!trunc && 452 if (!trunc &&
432 (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || 453 (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
468 return __dax_invalidate_mapping_entry(mapping, index, false); 489 return __dax_invalidate_mapping_entry(mapping, index, false);
469} 490}
470 491
471/*
472 * The user has performed a load from a hole in the file. Allocating
473 * a new page in the file would cause excessive storage usage for
474 * workloads with sparse files. We allocate a page cache page instead.
475 * We'll kick it out of the page cache if it's ever written to,
476 * otherwise it will simply fall out of the page cache under memory
477 * pressure without ever having been dirtied.
478 */
479static int dax_load_hole(struct address_space *mapping, void **entry,
480 struct vm_fault *vmf)
481{
482 struct inode *inode = mapping->host;
483 struct page *page;
484 int ret;
485
486 /* Hole page already exists? Return it... */
487 if (!radix_tree_exceptional_entry(*entry)) {
488 page = *entry;
489 goto finish_fault;
490 }
491
492 /* This will replace locked radix tree entry with a hole page */
493 page = find_or_create_page(mapping, vmf->pgoff,
494 vmf->gfp_mask | __GFP_ZERO);
495 if (!page) {
496 ret = VM_FAULT_OOM;
497 goto out;
498 }
499
500finish_fault:
501 vmf->page = page;
502 ret = finish_fault(vmf);
503 vmf->page = NULL;
504 *entry = page;
505 if (!ret) {
506 /* Grab reference for PTE that is now referencing the page */
507 get_page(page);
508 ret = VM_FAULT_NOPAGE;
509 }
510out:
511 trace_dax_load_hole(inode, vmf, ret);
512 return ret;
513}
514
515static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, 492static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
516 sector_t sector, size_t size, struct page *to, 493 sector_t sector, size_t size, struct page *to,
517 unsigned long vaddr) 494 unsigned long vaddr)
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
552 unsigned long flags) 529 unsigned long flags)
553{ 530{
554 struct radix_tree_root *page_tree = &mapping->page_tree; 531 struct radix_tree_root *page_tree = &mapping->page_tree;
555 int error = 0;
556 bool hole_fill = false;
557 void *new_entry; 532 void *new_entry;
558 pgoff_t index = vmf->pgoff; 533 pgoff_t index = vmf->pgoff;
559 534
560 if (vmf->flags & FAULT_FLAG_WRITE) 535 if (vmf->flags & FAULT_FLAG_WRITE)
561 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 536 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
562 537
563 /* Replacing hole page with block mapping? */ 538 if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
564 if (!radix_tree_exceptional_entry(entry)) { 539 /* we are replacing a zero page with block mapping */
565 hole_fill = true; 540 if (dax_is_pmd_entry(entry))
566 /* 541 unmap_mapping_range(mapping,
567 * Unmap the page now before we remove it from page cache below. 542 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
568 * The page is locked so it cannot be faulted in again. 543 PMD_SIZE, 0);
569 */ 544 else /* pte entry */
570 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, 545 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
571 PAGE_SIZE, 0); 546 PAGE_SIZE, 0);
572 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
573 if (error)
574 return ERR_PTR(error);
575 } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
576 /* replacing huge zero page with PMD block mapping */
577 unmap_mapping_range(mapping,
578 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
579 } 547 }
580 548
581 spin_lock_irq(&mapping->tree_lock); 549 spin_lock_irq(&mapping->tree_lock);
582 new_entry = dax_radix_locked_entry(sector, flags); 550 new_entry = dax_radix_locked_entry(sector, flags);
583 551
584 if (hole_fill) { 552 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
585 __delete_from_page_cache(entry, NULL);
586 /* Drop pagecache reference */
587 put_page(entry);
588 error = __radix_tree_insert(page_tree, index,
589 dax_radix_order(new_entry), new_entry);
590 if (error) {
591 new_entry = ERR_PTR(error);
592 goto unlock;
593 }
594 mapping->nrexceptional++;
595 } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
596 /* 553 /*
597 * Only swap our new entry into the radix tree if the current 554 * Only swap our new entry into the radix tree if the current
598 * entry is a zero page or an empty entry. If a normal PTE or 555 * entry is a zero page or an empty entry. If a normal PTE or
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
609 WARN_ON_ONCE(ret != entry); 566 WARN_ON_ONCE(ret != entry);
610 __radix_tree_replace(page_tree, node, slot, 567 __radix_tree_replace(page_tree, node, slot,
611 new_entry, NULL, NULL); 568 new_entry, NULL, NULL);
569 entry = new_entry;
612 } 570 }
571
613 if (vmf->flags & FAULT_FLAG_WRITE) 572 if (vmf->flags & FAULT_FLAG_WRITE)
614 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 573 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
615 unlock: 574
616 spin_unlock_irq(&mapping->tree_lock); 575 spin_unlock_irq(&mapping->tree_lock);
617 if (hole_fill) { 576 return entry;
618 radix_tree_preload_end();
619 /*
620 * We don't need hole page anymore, it has been replaced with
621 * locked radix tree entry now.
622 */
623 if (mapping->a_ops->freepage)
624 mapping->a_ops->freepage(entry);
625 unlock_page(entry);
626 put_page(entry);
627 }
628 return new_entry;
629} 577}
630 578
631static inline unsigned long 579static inline unsigned long
@@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,
727 spin_lock_irq(&mapping->tree_lock); 675 spin_lock_irq(&mapping->tree_lock);
728 entry2 = get_unlocked_mapping_entry(mapping, index, &slot); 676 entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
729 /* Entry got punched out / reallocated? */ 677 /* Entry got punched out / reallocated? */
730 if (!entry2 || !radix_tree_exceptional_entry(entry2)) 678 if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
731 goto put_unlocked; 679 goto put_unlocked;
732 /* 680 /*
733 * Entry got reallocated elsewhere? No need to writeback. We have to 681 * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,
799 trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); 747 trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
800 dax_unlock: 748 dax_unlock:
801 dax_read_unlock(id); 749 dax_read_unlock(id);
802 put_locked_mapping_entry(mapping, index, entry); 750 put_locked_mapping_entry(mapping, index);
803 return ret; 751 return ret;
804 752
805 put_unlocked: 753 put_unlocked:
@@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
874 822
875static int dax_insert_mapping(struct address_space *mapping, 823static int dax_insert_mapping(struct address_space *mapping,
876 struct block_device *bdev, struct dax_device *dax_dev, 824 struct block_device *bdev, struct dax_device *dax_dev,
877 sector_t sector, size_t size, void **entryp, 825 sector_t sector, size_t size, void *entry,
878 struct vm_area_struct *vma, struct vm_fault *vmf) 826 struct vm_area_struct *vma, struct vm_fault *vmf)
879{ 827{
880 unsigned long vaddr = vmf->address; 828 unsigned long vaddr = vmf->address;
881 void *entry = *entryp;
882 void *ret, *kaddr; 829 void *ret, *kaddr;
883 pgoff_t pgoff; 830 pgoff_t pgoff;
884 int id, rc; 831 int id, rc;
@@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,
899 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); 846 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
900 if (IS_ERR(ret)) 847 if (IS_ERR(ret))
901 return PTR_ERR(ret); 848 return PTR_ERR(ret);
902 *entryp = ret;
903 849
904 trace_dax_insert_mapping(mapping->host, vmf, ret); 850 trace_dax_insert_mapping(mapping->host, vmf, ret);
905 return vm_insert_mixed(vma, vaddr, pfn); 851 if (vmf->flags & FAULT_FLAG_WRITE)
852 return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
853 else
854 return vm_insert_mixed(vma, vaddr, pfn);
906} 855}
907 856
908/** 857/*
909 * dax_pfn_mkwrite - handle first write to DAX page 858 * The user has performed a load from a hole in the file. Allocating a new
910 * @vmf: The description of the fault 859 * page in the file would cause excessive storage usage for workloads with
860 * sparse files. Instead we insert a read-only mapping of the 4k zero page.
861 * If this page is ever written to we will re-fault and change the mapping to
862 * point to real DAX storage instead.
911 */ 863 */
912int dax_pfn_mkwrite(struct vm_fault *vmf) 864static int dax_load_hole(struct address_space *mapping, void *entry,
865 struct vm_fault *vmf)
913{ 866{
914 struct file *file = vmf->vma->vm_file;
915 struct address_space *mapping = file->f_mapping;
916 struct inode *inode = mapping->host; 867 struct inode *inode = mapping->host;
917 void *entry, **slot; 868 unsigned long vaddr = vmf->address;
918 pgoff_t index = vmf->pgoff; 869 int ret = VM_FAULT_NOPAGE;
870 struct page *zero_page;
871 void *entry2;
919 872
920 spin_lock_irq(&mapping->tree_lock); 873 zero_page = ZERO_PAGE(0);
921 entry = get_unlocked_mapping_entry(mapping, index, &slot); 874 if (unlikely(!zero_page)) {
922 if (!entry || !radix_tree_exceptional_entry(entry)) { 875 ret = VM_FAULT_OOM;
923 if (entry) 876 goto out;
924 put_unlocked_mapping_entry(mapping, index, entry);
925 spin_unlock_irq(&mapping->tree_lock);
926 trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
927 return VM_FAULT_NOPAGE;
928 } 877 }
929 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); 878
930 entry = lock_slot(mapping, slot); 879 entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
931 spin_unlock_irq(&mapping->tree_lock); 880 RADIX_DAX_ZERO_PAGE);
932 /* 881 if (IS_ERR(entry2)) {
933 * If we race with somebody updating the PTE and finish_mkwrite_fault() 882 ret = VM_FAULT_SIGBUS;
934 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry 883 goto out;
935 * the fault in either case. 884 }
936 */ 885
937 finish_mkwrite_fault(vmf); 886 vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
938 put_locked_mapping_entry(mapping, index, entry); 887out:
939 trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); 888 trace_dax_load_hole(inode, vmf, ret);
940 return VM_FAULT_NOPAGE; 889 return ret;
941} 890}
942EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
943 891
944static bool dax_range_is_aligned(struct block_device *bdev, 892static bool dax_range_is_aligned(struct block_device *bdev,
945 unsigned int offset, unsigned int length) 893 unsigned int offset, unsigned int length)
@@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1059 if (map_len > end - pos) 1007 if (map_len > end - pos)
1060 map_len = end - pos; 1008 map_len = end - pos;
1061 1009
1010 /*
1011 * The userspace address for the memory copy has already been
1012 * validated via access_ok() in either vfs_read() or
1013 * vfs_write(), depending on which operation we are doing.
1014 */
1062 if (iov_iter_rw(iter) == WRITE) 1015 if (iov_iter_rw(iter) == WRITE)
1063 map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, 1016 map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
1064 map_len, iter); 1017 map_len, iter);
@@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1223 major = VM_FAULT_MAJOR; 1176 major = VM_FAULT_MAJOR;
1224 } 1177 }
1225 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, 1178 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
1226 sector, PAGE_SIZE, &entry, vmf->vma, vmf); 1179 sector, PAGE_SIZE, entry, vmf->vma, vmf);
1227 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1180 /* -EBUSY is fine, somebody else faulted on the same PTE */
1228 if (error == -EBUSY) 1181 if (error == -EBUSY)
1229 error = 0; 1182 error = 0;
@@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1231 case IOMAP_UNWRITTEN: 1184 case IOMAP_UNWRITTEN:
1232 case IOMAP_HOLE: 1185 case IOMAP_HOLE:
1233 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1186 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
1234 vmf_ret = dax_load_hole(mapping, &entry, vmf); 1187 vmf_ret = dax_load_hole(mapping, entry, vmf);
1235 goto finish_iomap; 1188 goto finish_iomap;
1236 } 1189 }
1237 /*FALLTHRU*/ 1190 /*FALLTHRU*/
@@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1258 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); 1211 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1259 } 1212 }
1260 unlock_entry: 1213 unlock_entry:
1261 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1214 put_locked_mapping_entry(mapping, vmf->pgoff);
1262 out: 1215 out:
1263 trace_dax_pte_fault_done(inode, vmf, vmf_ret); 1216 trace_dax_pte_fault_done(inode, vmf, vmf_ret);
1264 return vmf_ret; 1217 return vmf_ret;
1265} 1218}
1266 1219
1267#ifdef CONFIG_FS_DAX_PMD 1220#ifdef CONFIG_FS_DAX_PMD
1268/*
1269 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
1270 * more often than one might expect in the below functions.
1271 */
1272#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
1273
1274static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, 1221static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
1275 loff_t pos, void **entryp) 1222 loff_t pos, void *entry)
1276{ 1223{
1277 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1224 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1278 const sector_t sector = dax_iomap_sector(iomap, pos); 1225 const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
1283 void *ret = NULL, *kaddr; 1230 void *ret = NULL, *kaddr;
1284 long length = 0; 1231 long length = 0;
1285 pgoff_t pgoff; 1232 pgoff_t pgoff;
1286 pfn_t pfn; 1233 pfn_t pfn = {};
1287 int id; 1234 int id;
1288 1235
1289 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) 1236 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
@@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
1303 goto unlock_fallback; 1250 goto unlock_fallback;
1304 dax_read_unlock(id); 1251 dax_read_unlock(id);
1305 1252
1306 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, 1253 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
1307 RADIX_DAX_PMD); 1254 RADIX_DAX_PMD);
1308 if (IS_ERR(ret)) 1255 if (IS_ERR(ret))
1309 goto fallback; 1256 goto fallback;
1310 *entryp = ret;
1311 1257
1312 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); 1258 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
1313 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1259 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1321,7 +1267,7 @@ fallback:
1321} 1267}
1322 1268
1323static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, 1269static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1324 void **entryp) 1270 void *entry)
1325{ 1271{
1326 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1272 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1327 unsigned long pmd_addr = vmf->address & PMD_MASK; 1273 unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1336 if (unlikely(!zero_page)) 1282 if (unlikely(!zero_page))
1337 goto fallback; 1283 goto fallback;
1338 1284
1339 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, 1285 ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
1340 RADIX_DAX_PMD | RADIX_DAX_HZP); 1286 RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
1341 if (IS_ERR(ret)) 1287 if (IS_ERR(ret))
1342 goto fallback; 1288 goto fallback;
1343 *entryp = ret;
1344 1289
1345 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1290 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1346 if (!pmd_none(*(vmf->pmd))) { 1291 if (!pmd_none(*(vmf->pmd))) {
@@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1416 goto fallback; 1361 goto fallback;
1417 1362
1418 /* 1363 /*
1419 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1364 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
1420 * PMD or a HZP entry. If it can't (because a 4k page is already in 1365 * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
1421 * the tree, for instance), it will return -EEXIST and we just fall 1366 * is already in the tree, for instance), it will return -EEXIST and
1422 * back to 4k entries. 1367 * we just fall back to 4k entries.
1423 */ 1368 */
1424 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1369 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
1425 if (IS_ERR(entry)) 1370 if (IS_ERR(entry))
@@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1452 1397
1453 switch (iomap.type) { 1398 switch (iomap.type) {
1454 case IOMAP_MAPPED: 1399 case IOMAP_MAPPED:
1455 result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); 1400 result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
1456 break; 1401 break;
1457 case IOMAP_UNWRITTEN: 1402 case IOMAP_UNWRITTEN:
1458 case IOMAP_HOLE: 1403 case IOMAP_HOLE:
1459 if (WARN_ON_ONCE(write)) 1404 if (WARN_ON_ONCE(write))
1460 break; 1405 break;
1461 result = dax_pmd_load_hole(vmf, &iomap, &entry); 1406 result = dax_pmd_load_hole(vmf, &iomap, entry);
1462 break; 1407 break;
1463 default: 1408 default:
1464 WARN_ON_ONCE(1); 1409 WARN_ON_ONCE(1);
@@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1481 &iomap); 1426 &iomap);
1482 } 1427 }
1483 unlock_entry: 1428 unlock_entry:
1484 put_locked_mapping_entry(mapping, pgoff, entry); 1429 put_locked_mapping_entry(mapping, pgoff);
1485 fallback: 1430 fallback:
1486 if (result == VM_FAULT_FALLBACK) { 1431 if (result == VM_FAULT_FALLBACK) {
1487 split_huge_pmd(vma, vmf->pmd, vmf->address); 1432 split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index d34d32bdc944..ff3a3636a5ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf)
107 return ret; 107 return ret;
108} 108}
109 109
110static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
111{
112 struct inode *inode = file_inode(vmf->vma->vm_file);
113 struct ext2_inode_info *ei = EXT2_I(inode);
114 loff_t size;
115 int ret;
116
117 sb_start_pagefault(inode->i_sb);
118 file_update_time(vmf->vma->vm_file);
119 down_read(&ei->dax_sem);
120
121 /* check that the faulting page hasn't raced with truncate */
122 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
123 if (vmf->pgoff >= size)
124 ret = VM_FAULT_SIGBUS;
125 else
126 ret = dax_pfn_mkwrite(vmf);
127
128 up_read(&ei->dax_sem);
129 sb_end_pagefault(inode->i_sb);
130 return ret;
131}
132
133static const struct vm_operations_struct ext2_dax_vm_ops = { 110static const struct vm_operations_struct ext2_dax_vm_ops = {
134 .fault = ext2_dax_fault, 111 .fault = ext2_dax_fault,
135 /* 112 /*
@@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
138 * will always fail and fail back to regular faults. 115 * will always fail and fail back to regular faults.
139 */ 116 */
140 .page_mkwrite = ext2_dax_fault, 117 .page_mkwrite = ext2_dax_fault,
141 .pfn_mkwrite = ext2_dax_pfn_mkwrite, 118 .pfn_mkwrite = ext2_dax_fault,
142}; 119};
143 120
144static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) 121static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
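[Editor's note: a hedged sketch of the resulting pattern.] With dax_pfn_mkwrite() removed, ext2 (above) and ext4 (below) simply point .pfn_mkwrite at their existing fault handler: the write fault re-runs the iomap fault path, which replaces the read-only zero-page entry installed by dax_load_hole() with a writable block mapping via vm_insert_mixed_mkwrite(). For a hypothetical filesystem this ends up looking like the following, assuming the dax_iomap_fault() signature used in this series; the myfs_* names are invented:

    extern const struct iomap_ops myfs_iomap_ops;   /* filesystem-specific */

    static int myfs_dax_fault(struct vm_fault *vmf)
    {
            /* serialise against truncate, update mtime, etc. as needed */
            return dax_iomap_fault(vmf, PE_SIZE_PTE, &myfs_iomap_ops);
    }

    static const struct vm_operations_struct myfs_dax_vm_ops = {
            .fault          = myfs_dax_fault,
            .page_mkwrite   = myfs_dax_fault,
            .pfn_mkwrite    = myfs_dax_fault,
    };
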
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 197653ea6041..57dcaea762c3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -324,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
324 return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); 324 return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
325} 325}
326 326
327/*
328 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
329 * handler we check for races agaist truncate. Note that since we cycle through
330 * i_mmap_sem, we are sure that also any hole punching that began before we
331 * were called is finished by now and so if it included part of the file we
332 * are working on, our pte will get unmapped and the check for pte_same() in
333 * wp_pfn_shared() fails. Thus fault gets retried and things work out as
334 * desired.
335 */
336static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
337{
338 struct inode *inode = file_inode(vmf->vma->vm_file);
339 struct super_block *sb = inode->i_sb;
340 loff_t size;
341 int ret;
342
343 sb_start_pagefault(sb);
344 file_update_time(vmf->vma->vm_file);
345 down_read(&EXT4_I(inode)->i_mmap_sem);
346 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
347 if (vmf->pgoff >= size)
348 ret = VM_FAULT_SIGBUS;
349 else
350 ret = dax_pfn_mkwrite(vmf);
351 up_read(&EXT4_I(inode)->i_mmap_sem);
352 sb_end_pagefault(sb);
353
354 return ret;
355}
356
357static const struct vm_operations_struct ext4_dax_vm_ops = { 327static const struct vm_operations_struct ext4_dax_vm_ops = {
358 .fault = ext4_dax_fault, 328 .fault = ext4_dax_fault,
359 .huge_fault = ext4_dax_huge_fault, 329 .huge_fault = ext4_dax_huge_fault,
360 .page_mkwrite = ext4_dax_fault, 330 .page_mkwrite = ext4_dax_fault,
361 .pfn_mkwrite = ext4_dax_pfn_mkwrite, 331 .pfn_mkwrite = ext4_dax_fault,
362}; 332};
363#else 333#else
364#define ext4_dax_vm_ops ext4_file_vm_ops 334#define ext4_dax_vm_ops ext4_file_vm_ops
@@ -507,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
507 477
508 pagevec_init(&pvec, 0); 478 pagevec_init(&pvec, 0);
509 do { 479 do {
510 int i, num; 480 int i;
511 unsigned long nr_pages; 481 unsigned long nr_pages;
512 482
513 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; 483 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
514 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, 484 &index, end);
515 (pgoff_t)num);
516 if (nr_pages == 0) 485 if (nr_pages == 0)
517 break; 486 break;
518 487
@@ -531,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
531 goto out; 500 goto out;
532 } 501 }
533 502
534 if (page->index > end)
535 goto out;
536
537 lock_page(page); 503 lock_page(page);
538 504
539 if (unlikely(page->mapping != inode->i_mapping)) { 505 if (unlikely(page->mapping != inode->i_mapping)) {
@@ -576,14 +542,10 @@ next:
576 unlock_page(page); 542 unlock_page(page);
577 } 543 }
578 544
579 /* The no. of pages is less than our desired, we are done. */
580 if (nr_pages < num)
581 break;
582
583 index = pvec.pages[i - 1]->index + 1;
584 pagevec_release(&pvec); 545 pagevec_release(&pvec);
585 } while (index <= end); 546 } while (index <= end);
586 547
 548 /* There are no pages up to endoff - that would be a hole in there. */
587 if (whence == SEEK_HOLE && lastoff < endoff) { 549 if (whence == SEEK_HOLE && lastoff < endoff) {
588 found = 1; 550 found = 1;
589 *offset = lastoff; 551 *offset = lastoff;
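[Sketch, not part of the patch: the pagevec conversions in this series (ext4, fscache, hugetlbfs, nilfs2, and the find_get_pages() caller in ramfs below) all reduce to one shape. pagevec_lookup_range() advances the caller's index and never returns pages past 'end', so the manual 'page->index > end' checks and index bookkeeping disappear. The function and variable names below are placeholders.]

	static void example_walk_range(struct address_space *mapping,
				       pgoff_t start, pgoff_t end)
	{
		struct pagevec pvec;
		pgoff_t index = start;
		unsigned nr_pages;
		int i;

		pagevec_init(&pvec, 0);
		while (index <= end) {
			/* Updates 'index' past the last page returned and never
			 * returns pages beyond 'end'. */
			nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
			if (!nr_pages)
				break;
			for (i = 0; i < nr_pages; i++) {
				struct page *page = pvec.pages[i];
				/* per-page work goes here */
			}
			pagevec_release(&pvec);
			cond_resched();
		}
	}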
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 714396760616..e963508ea35f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1720 1720
1721 pagevec_init(&pvec, 0); 1721 pagevec_init(&pvec, 0);
1722 while (index <= end) { 1722 while (index <= end) {
1723 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1723 nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
1724 if (nr_pages == 0) 1724 if (nr_pages == 0)
1725 break; 1725 break;
1726 for (i = 0; i < nr_pages; i++) { 1726 for (i = 0; i < nr_pages; i++) {
1727 struct page *page = pvec.pages[i]; 1727 struct page *page = pvec.pages[i];
1728 if (page->index > end) 1728
1729 break;
1730 BUG_ON(!PageLocked(page)); 1729 BUG_ON(!PageLocked(page));
1731 BUG_ON(PageWriteback(page)); 1730 BUG_ON(PageWriteback(page));
1732 if (invalidate) { 1731 if (invalidate) {
@@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1737 } 1736 }
1738 unlock_page(page); 1737 unlock_page(page);
1739 } 1738 }
1740 index = pvec.pages[nr_pages - 1]->index + 1;
1741 pagevec_release(&pvec); 1739 pagevec_release(&pvec);
1742 } 1740 }
1743} 1741}
@@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2348 2346
2349 pagevec_init(&pvec, 0); 2347 pagevec_init(&pvec, 0);
2350 while (start <= end) { 2348 while (start <= end) {
2351 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, 2349 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
2352 PAGEVEC_SIZE); 2350 &start, end);
2353 if (nr_pages == 0) 2351 if (nr_pages == 0)
2354 break; 2352 break;
2355 for (i = 0; i < nr_pages; i++) { 2353 for (i = 0; i < nr_pages; i++) {
2356 struct page *page = pvec.pages[i]; 2354 struct page *page = pvec.pages[i];
2357 2355
2358 if (page->index > end)
2359 break;
2360 /* Up to 'end' pages must be contiguous */
2361 BUG_ON(page->index != start);
2362 bh = head = page_buffers(page); 2356 bh = head = page_buffers(page);
2363 do { 2357 do {
2364 if (lblk < mpd->map.m_lblk) 2358 if (lblk < mpd->map.m_lblk)
@@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2403 pagevec_release(&pvec); 2397 pagevec_release(&pvec);
2404 return err; 2398 return err;
2405 } 2399 }
2406 start++;
2407 } 2400 }
2408 pagevec_release(&pvec); 2401 pagevec_release(&pvec);
2409 } 2402 }
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c8c4f79c7ce1..0ad3fd3ad0b4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
1178 pagevec_init(&pvec, 0); 1178 pagevec_init(&pvec, 0);
1179 next = 0; 1179 next = 0;
1180 do { 1180 do {
1181 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) 1181 if (!pagevec_lookup(&pvec, mapping, &next))
1182 break; 1182 break;
1183 for (i = 0; i < pagevec_count(&pvec); i++) { 1183 for (i = 0; i < pagevec_count(&pvec); i++) {
1184 struct page *page = pvec.pages[i]; 1184 struct page *page = pvec.pages[i];
1185 next = page->index;
1186 if (PageFsCache(page)) { 1185 if (PageFsCache(page)) {
1187 __fscache_wait_on_page_write(cookie, page); 1186 __fscache_wait_on_page_write(cookie, page);
1188 __fscache_uncache_page(cookie, page); 1187 __fscache_uncache_page(cookie, page);
@@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
1190 } 1189 }
1191 pagevec_release(&pvec); 1190 pagevec_release(&pvec);
1192 cond_resched(); 1191 cond_resched();
1193 } while (++next); 1192 } while (next);
1194 1193
1195 _leave(""); 1194 _leave("");
1196} 1195}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 28d2753be094..7c02b3f738e1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
401 const pgoff_t end = lend >> huge_page_shift(h); 401 const pgoff_t end = lend >> huge_page_shift(h);
402 struct vm_area_struct pseudo_vma; 402 struct vm_area_struct pseudo_vma;
403 struct pagevec pvec; 403 struct pagevec pvec;
404 pgoff_t next; 404 pgoff_t next, index;
405 int i, freed = 0; 405 int i, freed = 0;
406 long lookup_nr = PAGEVEC_SIZE;
407 bool truncate_op = (lend == LLONG_MAX); 406 bool truncate_op = (lend == LLONG_MAX);
408 407
409 memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); 408 memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
@@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
412 next = start; 411 next = start;
413 while (next < end) { 412 while (next < end) {
414 /* 413 /*
415 * Don't grab more pages than the number left in the range.
416 */
417 if (end - next < lookup_nr)
418 lookup_nr = end - next;
419
420 /*
421 * When no more pages are found, we are done. 414 * When no more pages are found, we are done.
422 */ 415 */
423 if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) 416 if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
424 break; 417 break;
425 418
426 for (i = 0; i < pagevec_count(&pvec); ++i) { 419 for (i = 0; i < pagevec_count(&pvec); ++i) {
427 struct page *page = pvec.pages[i]; 420 struct page *page = pvec.pages[i];
428 u32 hash; 421 u32 hash;
429 422
430 /* 423 index = page->index;
431 * The page (index) could be beyond end. This is
432 * only possible in the punch hole case as end is
433 * max page offset in the truncate case.
434 */
435 next = page->index;
436 if (next >= end)
437 break;
438
439 hash = hugetlb_fault_mutex_hash(h, current->mm, 424 hash = hugetlb_fault_mutex_hash(h, current->mm,
440 &pseudo_vma, 425 &pseudo_vma,
441 mapping, next, 0); 426 mapping, index, 0);
442 mutex_lock(&hugetlb_fault_mutex_table[hash]); 427 mutex_lock(&hugetlb_fault_mutex_table[hash]);
443 428
444 /* 429 /*
@@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
455 440
456 i_mmap_lock_write(mapping); 441 i_mmap_lock_write(mapping);
457 hugetlb_vmdelete_list(&mapping->i_mmap, 442 hugetlb_vmdelete_list(&mapping->i_mmap,
458 next * pages_per_huge_page(h), 443 index * pages_per_huge_page(h),
459 (next + 1) * pages_per_huge_page(h)); 444 (index + 1) * pages_per_huge_page(h));
460 i_mmap_unlock_write(mapping); 445 i_mmap_unlock_write(mapping);
461 } 446 }
462 447
@@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
475 freed++; 460 freed++;
476 if (!truncate_op) { 461 if (!truncate_op) {
477 if (unlikely(hugetlb_unreserve_pages(inode, 462 if (unlikely(hugetlb_unreserve_pages(inode,
478 next, next + 1, 1))) 463 index, index + 1, 1)))
479 hugetlb_fix_reserve_counts(inode); 464 hugetlb_fix_reserve_counts(inode);
480 } 465 }
481 466
482 unlock_page(page); 467 unlock_page(page);
483 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 468 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
484 } 469 }
485 ++next;
486 huge_pagevec_release(&pvec); 470 huge_pagevec_release(&pvec);
487 cond_resched(); 471 cond_resched();
488 } 472 }
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 777b055063f6..3025fe8584a0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
252} 252}
253 253
254/* 254/*
255 * Indication from FS-Cache that the cookie is no longer cached
256 * - This function is called when the backing store currently caching a cookie
257 * is removed
258 * - The netfs should use this to clean up any markers indicating cached pages
259 * - This is mandatory for any object that may have data
260 */
261static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
262{
263 struct nfs_inode *nfsi = cookie_netfs_data;
264 struct pagevec pvec;
265 pgoff_t first;
266 int loop, nr_pages;
267
268 pagevec_init(&pvec, 0);
269 first = 0;
270
271 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
272
273 for (;;) {
274 /* grab a bunch of pages to unmark */
275 nr_pages = pagevec_lookup(&pvec,
276 nfsi->vfs_inode.i_mapping,
277 first,
278 PAGEVEC_SIZE - pagevec_count(&pvec));
279 if (!nr_pages)
280 break;
281
282 for (loop = 0; loop < nr_pages; loop++)
283 ClearPageFsCache(pvec.pages[loop]);
284
285 first = pvec.pages[nr_pages - 1]->index + 1;
286
287 pvec.nr = nr_pages;
288 pagevec_release(&pvec);
289 cond_resched();
290 }
291}
292
293/*
294 * Get an extra reference on a read context. 255 * Get an extra reference on a read context.
295 * - This function can be absent if the completion function doesn't require a 256 * - This function can be absent if the completion function doesn't require a
296 * context. 257 * context.
@@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = {
330 .get_attr = nfs_fscache_inode_get_attr, 291 .get_attr = nfs_fscache_inode_get_attr,
331 .get_aux = nfs_fscache_inode_get_aux, 292 .get_aux = nfs_fscache_inode_get_aux,
332 .check_aux = nfs_fscache_inode_check_aux, 293 .check_aux = nfs_fscache_inode_check_aux,
333 .now_uncached = nfs_fscache_inode_now_uncached,
334 .get_context = nfs_fh_get_context, 294 .get_context = nfs_fh_get_context,
335 .put_context = nfs_fh_put_context, 295 .put_context = nfs_fh_put_context,
336}; 296};
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index f11a3ad2df0c..8616c46d33da 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap,
312 312
313 pagevec_init(&pvec, 0); 313 pagevec_init(&pvec, 0);
314repeat: 314repeat:
315 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); 315 n = pagevec_lookup(&pvec, smap, &index);
316 if (!n) 316 if (!n)
317 return; 317 return;
318 index = pvec.pages[n - 1]->index + 1;
319 318
320 for (i = 0; i < pagevec_count(&pvec); i++) { 319 for (i = 0; i < pagevec_count(&pvec); i++) {
321 struct page *page = pvec.pages[i], *dpage; 320 struct page *page = pvec.pages[i], *dpage;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e50a387959bf..40b5cc97f7b0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -221,7 +221,7 @@ out:
221/* 221/*
222 * Set the access or default ACL of an inode. 222 * Set the access or default ACL of an inode.
223 */ 223 */
224int ocfs2_set_acl(handle_t *handle, 224static int ocfs2_set_acl(handle_t *handle,
225 struct inode *inode, 225 struct inode *inode,
226 struct buffer_head *di_bh, 226 struct buffer_head *di_bh,
227 int type, 227 int type,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 2783a75b3999..7be0bb756286 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -28,13 +28,6 @@ struct ocfs2_acl_entry {
28 28
29struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); 29struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
30int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); 30int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
31int ocfs2_set_acl(handle_t *handle,
32 struct inode *inode,
33 struct buffer_head *di_bh,
34 int type,
35 struct posix_acl *acl,
36 struct ocfs2_alloc_context *meta_ac,
37 struct ocfs2_alloc_context *data_ac);
38extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); 31extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
39extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 32extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
40 struct buffer_head *, struct buffer_head *, 33 struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fb15a96df0b6..a177eae3aa1a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
955/* 955/*
956 * How many free extents have we got before we need more meta data? 956 * How many free extents have we got before we need more meta data?
957 */ 957 */
958int ocfs2_num_free_extents(struct ocfs2_super *osb, 958int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
959 struct ocfs2_extent_tree *et)
960{ 959{
961 int retval; 960 int retval;
962 struct ocfs2_extent_list *el = NULL; 961 struct ocfs2_extent_list *el = NULL;
@@ -1933,14 +1932,12 @@ out:
1933 * the new changes. 1932 * the new changes.
1934 * 1933 *
1935 * left_rec: the record on the left. 1934 * left_rec: the record on the left.
1936 * left_child_el: is the child list pointed to by left_rec
1937 * right_rec: the record to the right of left_rec 1935 * right_rec: the record to the right of left_rec
1938 * right_child_el: is the child list pointed to by right_rec 1936 * right_child_el: is the child list pointed to by right_rec
1939 * 1937 *
1940 * By definition, this only works on interior nodes. 1938 * By definition, this only works on interior nodes.
1941 */ 1939 */
1942static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1940static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1943 struct ocfs2_extent_list *left_child_el,
1944 struct ocfs2_extent_rec *right_rec, 1941 struct ocfs2_extent_rec *right_rec,
1945 struct ocfs2_extent_list *right_child_el) 1942 struct ocfs2_extent_list *right_child_el)
1946{ 1943{
@@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
2003 */ 2000 */
2004 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 2001 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2005 2002
2006 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 2003 ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
2007 &root_el->l_recs[i + 1], right_el); 2004 &root_el->l_recs[i + 1], right_el);
2008} 2005}
2009 2006
@@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2060 el = right_path->p_node[i].el; 2057 el = right_path->p_node[i].el;
2061 right_rec = &el->l_recs[0]; 2058 right_rec = &el->l_recs[0];
2062 2059
2063 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2060 ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
2064 right_el);
2065 2061
2066 ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2062 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2067 ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 2063 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
@@ -2509,7 +2505,7 @@ out_ret_path:
2509 2505
2510static int ocfs2_update_edge_lengths(handle_t *handle, 2506static int ocfs2_update_edge_lengths(handle_t *handle,
2511 struct ocfs2_extent_tree *et, 2507 struct ocfs2_extent_tree *et,
2512 int subtree_index, struct ocfs2_path *path) 2508 struct ocfs2_path *path)
2513{ 2509{
2514 int i, idx, ret; 2510 int i, idx, ret;
2515 struct ocfs2_extent_rec *rec; 2511 struct ocfs2_extent_rec *rec;
@@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2755 if (del_right_subtree) { 2751 if (del_right_subtree) {
2756 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2752 ocfs2_unlink_subtree(handle, et, left_path, right_path,
2757 subtree_index, dealloc); 2753 subtree_index, dealloc);
2758 ret = ocfs2_update_edge_lengths(handle, et, subtree_index, 2754 ret = ocfs2_update_edge_lengths(handle, et, left_path);
2759 left_path);
2760 if (ret) { 2755 if (ret) {
2761 mlog_errno(ret); 2756 mlog_errno(ret);
2762 goto out; 2757 goto out;
@@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
3060 3055
3061 ocfs2_unlink_subtree(handle, et, left_path, path, 3056 ocfs2_unlink_subtree(handle, et, left_path, path,
3062 subtree_index, dealloc); 3057 subtree_index, dealloc);
3063 ret = ocfs2_update_edge_lengths(handle, et, subtree_index, 3058 ret = ocfs2_update_edge_lengths(handle, et, left_path);
3064 left_path);
3065 if (ret) { 3059 if (ret) {
3066 mlog_errno(ret); 3060 mlog_errno(ret);
3067 goto out; 3061 goto out;
@@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4790 if (mark_unwritten) 4784 if (mark_unwritten)
4791 flags = OCFS2_EXT_UNWRITTEN; 4785 flags = OCFS2_EXT_UNWRITTEN;
4792 4786
4793 free_extents = ocfs2_num_free_extents(osb, et); 4787 free_extents = ocfs2_num_free_extents(et);
4794 if (free_extents < 0) { 4788 if (free_extents < 0) {
4795 status = free_extents; 4789 status = free_extents;
4796 mlog_errno(status); 4790 mlog_errno(status);
@@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5668 5662
5669 *ac = NULL; 5663 *ac = NULL;
5670 5664
5671 num_free_extents = ocfs2_num_free_extents(osb, et); 5665 num_free_extents = ocfs2_num_free_extents(et);
5672 if (num_free_extents < 0) { 5666 if (num_free_extents < 0) {
5673 ret = num_free_extents; 5667 ret = num_free_extents;
5674 mlog_errno(ret); 5668 mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 4a5152ec88a3..27b75cf32cfa 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
144 struct ocfs2_cached_dealloc_ctxt *dealloc, 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc, bool refcount_tree_locked); 145 u64 refcount_loc, bool refcount_tree_locked);
146 146
147int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_extent_tree *et);
148 struct ocfs2_extent_tree *et);
149 148
150/* 149/*
151 * how many new metadata chunks would an allocation need at maximum? 150 * how many new metadata chunks would an allocation need at maximum?
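[Sketch, not part of the patch: with the ocfs2_super argument dropped, callers only need an initialized extent tree. The converted call sites in this diff all reduce to the pattern below; 'inode' and 'di_bh' are placeholders.]

	struct ocfs2_extent_tree et;
	int free_extents;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	free_extents = ocfs2_num_free_extents(&et);	/* no ocfs2_super argument any more */
	if (free_extents < 0)
		mlog_errno(free_extents);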
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ffe003982d95..56ac07cd35f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
505 } 505 }
506} 506}
507 507
508static void o2hb_wait_on_io(struct o2hb_region *reg, 508static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
509 struct o2hb_bio_wait_ctxt *wc)
510{ 509{
511 o2hb_bio_wait_dec(wc, 1); 510 o2hb_bio_wait_dec(wc, 1);
512 wait_for_completion(&wc->wc_io_complete); 511 wait_for_completion(&wc->wc_io_complete);
@@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
608 status = 0; 607 status = 0;
609 608
610bail_and_wait: 609bail_and_wait:
611 o2hb_wait_on_io(reg, &wc); 610 o2hb_wait_on_io(&wc);
612 if (wc.wc_error && !status) 611 if (wc.wc_error && !status)
613 status = wc.wc_error; 612 status = wc.wc_error;
614 613
@@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
1162 * before we can go to steady state. This ensures that 1161 * before we can go to steady state. This ensures that
1163 * people we find in our steady state have seen us. 1162 * people we find in our steady state have seen us.
1164 */ 1163 */
1165 o2hb_wait_on_io(reg, &write_wc); 1164 o2hb_wait_on_io(&write_wc);
1166 if (write_wc.wc_error) { 1165 if (write_wc.wc_error) {
1167 /* Do not re-arm the write timeout on I/O error - we 1166 /* Do not re-arm the write timeout on I/O error - we
1168 * can't be sure that the new block ever made it to 1167 * can't be sure that the new block ever made it to
@@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data)
1275 o2hb_prepare_block(reg, 0); 1274 o2hb_prepare_block(reg, 0);
1276 ret = o2hb_issue_node_write(reg, &write_wc); 1275 ret = o2hb_issue_node_write(reg, &write_wc);
1277 if (ret == 0) 1276 if (ret == 0)
1278 o2hb_wait_on_io(reg, &write_wc); 1277 o2hb_wait_on_io(&write_wc);
1279 else 1278 else
1280 mlog_errno(ret); 1279 mlog_errno(ret);
1281 } 1280 }
@@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid,
2576} 2575}
2577EXPORT_SYMBOL_GPL(o2hb_unregister_callback); 2576EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
2578 2577
2579int o2hb_check_node_heartbeating(u8 node_num)
2580{
2581 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2582
2583 o2hb_fill_node_map(testing_map, sizeof(testing_map));
2584 if (!test_bit(node_num, testing_map)) {
2585 mlog(ML_HEARTBEAT,
2586 "node (%u) does not have heartbeating enabled.\n",
2587 node_num);
2588 return 0;
2589 }
2590
2591 return 1;
2592}
2593EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2594
2595int o2hb_check_node_heartbeating_no_sem(u8 node_num) 2578int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2596{ 2579{
2597 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2580 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
2626} 2609}
2627EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); 2610EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
2628 2611
2629/* Makes sure our local node is configured with a node number, and is
2630 * heartbeating. */
2631int o2hb_check_local_node_heartbeating(void)
2632{
2633 u8 node_num;
2634
2635 /* if this node was set then we have networking */
2636 node_num = o2nm_this_node();
2637 if (node_num == O2NM_MAX_NODES) {
2638 mlog(ML_HEARTBEAT, "this node has not been configured.\n");
2639 return 0;
2640 }
2641
2642 return o2hb_check_node_heartbeating(node_num);
2643}
2644EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
2645
2646/* 2612/*
2647 * this is just a hack until we get the plumbing which flips file systems 2613 * this is just a hack until we get the plumbing which flips file systems
2648 * read only and drops the hb ref instead of killing the node dead. 2614 * read only and drops the hb ref instead of killing the node dead.
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3ecb9f337b7d..febe6312ceff 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3249 spin_unlock(&OCFS2_I(dir)->ip_lock); 3249 spin_unlock(&OCFS2_I(dir)->ip_lock);
3250 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), 3250 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
3251 parent_fe_bh); 3251 parent_fe_bh);
3252 num_free_extents = ocfs2_num_free_extents(osb, &et); 3252 num_free_extents = ocfs2_num_free_extents(&et);
3253 if (num_free_extents < 0) { 3253 if (num_free_extents < 0) {
3254 status = num_free_extents; 3254 status = num_free_extents;
3255 mlog_errno(status); 3255 mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 66e59d3163ea..6e41fc8fabbe 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -713,13 +713,6 @@ leave:
713 return status; 713 return status;
714} 714}
715 715
716int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
717 u32 clusters_to_add, int mark_unwritten)
718{
719 return __ocfs2_extend_allocation(inode, logical_start,
720 clusters_to_add, mark_unwritten);
721}
722
723/* 716/*
724 * While a write will already be ordering the data, a truncate will not. 717 * While a write will already be ordering the data, a truncate will not.
725 * Thus, we need to explicitly order the zeroed pages. 718 * Thus, we need to explicitly order the zeroed pages.
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d5e5fa7f0743..36304434eacf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1348 ocfs2_schedule_truncate_log_flush(osb, 0); 1348 ocfs2_schedule_truncate_log_flush(osb, 0);
1349 1349
1350 osb->local_alloc_copy = NULL; 1350 osb->local_alloc_copy = NULL;
1351 osb->dirty = 0;
1352 1351
1353 /* queue to recover orphan slots for all offline slots */ 1352 /* queue to recover orphan slots for all offline slots */
1354 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1353 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index e52a2852d50d..7eb3b0a6347e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
175 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; 175 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
177 177
178 num_free_extents = ocfs2_num_free_extents(osb, et); 178 num_free_extents = ocfs2_num_free_extents(et);
179 if (num_free_extents < 0) { 179 if (num_free_extents < 0) {
180 ret = num_free_extents; 180 ret = num_free_extents;
181 mlog_errno(ret); 181 mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 0c39d71c67a1..9a50f222ac97 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -320,7 +320,6 @@ struct ocfs2_super
320 u64 system_dir_blkno; 320 u64 system_dir_blkno;
321 u64 bitmap_blkno; 321 u64 bitmap_blkno;
322 u32 bitmap_cpg; 322 u32 bitmap_cpg;
323 u8 *uuid;
324 char *uuid_str; 323 char *uuid_str;
325 u32 uuid_hash; 324 u32 uuid_hash;
326 u8 *vol_label; 325 u8 *vol_label;
@@ -388,9 +387,8 @@ struct ocfs2_super
388 unsigned int osb_resv_level; 387 unsigned int osb_resv_level;
389 unsigned int osb_dir_resv_level; 388 unsigned int osb_dir_resv_level;
390 389
391 /* Next three fields are for local node slot recovery during 390 /* Next two fields are for local node slot recovery during
392 * mount. */ 391 * mount. */
393 int dirty;
394 struct ocfs2_dinode *local_alloc_copy; 392 struct ocfs2_dinode *local_alloc_copy;
395 struct ocfs2_quota_recovery *quota_rec; 393 struct ocfs2_quota_recovery *quota_rec;
396 394
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index f8933cb53d68..ab156e35ec00 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2851 int *credits) 2851 int *credits)
2852{ 2852{
2853 int ret = 0, meta_add = 0; 2853 int ret = 0, meta_add = 0;
2854 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); 2854 int num_free_extents = ocfs2_num_free_extents(et);
2855 2855
2856 if (num_free_extents < 0) { 2856 if (num_free_extents < 0) {
2857 ret = num_free_extents; 2857 ret = num_free_extents;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6ad3533940ba..71f22c8fbffd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode,
2700 2700
2701 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 2701 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2702 2702
2703 num_free_extents = ocfs2_num_free_extents(osb, et); 2703 num_free_extents = ocfs2_num_free_extents(et);
2704 if (num_free_extents < 0) { 2704 if (num_free_extents < 0) {
2705 ret = num_free_extents; 2705 ret = num_free_extents;
2706 mlog_errno(ret); 2706 mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83005f486451..3f936be379a9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2486 if (dirty) { 2486 if (dirty) {
2487 /* Recovery will be completed after we've mounted the 2487 /* Recovery will be completed after we've mounted the
2488 * rest of the volume. */ 2488 * rest of the volume. */
2489 osb->dirty = 1;
2490 osb->local_alloc_copy = local_alloc; 2489 osb->local_alloc_copy = local_alloc;
2491 local_alloc = NULL; 2490 local_alloc = NULL;
2492 } 2491 }
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f70c3778d600..5fdf269ba82e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
6800 *credits += 1; 6800 *credits += 1;
6801 6801
6802 /* count in the xattr tree change. */ 6802 /* count in the xattr tree change. */
6803 num_free_extents = ocfs2_num_free_extents(osb, xt_et); 6803 num_free_extents = ocfs2_num_free_extents(xt_et);
6804 if (num_free_extents < 0) { 6804 if (num_free_extents < 0) {
6805 ret = num_free_extents; 6805 ret = num_free_extents;
6806 mlog_errno(ret); 6806 mlog_errno(ret);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98fd8f6df851..e5d89a0d0b8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2931#ifdef CONFIG_PROC_PAGE_MONITOR 2931#ifdef CONFIG_PROC_PAGE_MONITOR
2932 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2932 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2933 REG("smaps", S_IRUGO, proc_pid_smaps_operations), 2933 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
2934 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
2934 REG("pagemap", S_IRUSR, proc_pagemap_operations), 2935 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2935#endif 2936#endif
2936#ifdef CONFIG_SECURITY 2937#ifdef CONFIG_SECURITY
@@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
3324#ifdef CONFIG_PROC_PAGE_MONITOR 3325#ifdef CONFIG_PROC_PAGE_MONITOR
3325 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3326 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3326 REG("smaps", S_IRUGO, proc_tid_smaps_operations), 3327 REG("smaps", S_IRUGO, proc_tid_smaps_operations),
3328 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
3327 REG("pagemap", S_IRUSR, proc_pagemap_operations), 3329 REG("pagemap", S_IRUSR, proc_pagemap_operations),
3328#endif 3330#endif
3329#ifdef CONFIG_SECURITY 3331#ifdef CONFIG_SECURITY
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2b89071630..2cbfcd32e884 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
269/* 269/*
270 * task_[no]mmu.c 270 * task_[no]mmu.c
271 */ 271 */
272struct mem_size_stats;
272struct proc_maps_private { 273struct proc_maps_private {
273 struct inode *inode; 274 struct inode *inode;
274 struct task_struct *task; 275 struct task_struct *task;
275 struct mm_struct *mm; 276 struct mm_struct *mm;
277 struct mem_size_stats *rollup;
276#ifdef CONFIG_MMU 278#ifdef CONFIG_MMU
277 struct vm_area_struct *tail_vma; 279 struct vm_area_struct *tail_vma;
278#endif 280#endif
@@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
288extern const struct file_operations proc_pid_numa_maps_operations; 290extern const struct file_operations proc_pid_numa_maps_operations;
289extern const struct file_operations proc_tid_numa_maps_operations; 291extern const struct file_operations proc_tid_numa_maps_operations;
290extern const struct file_operations proc_pid_smaps_operations; 292extern const struct file_operations proc_pid_smaps_operations;
293extern const struct file_operations proc_pid_smaps_rollup_operations;
291extern const struct file_operations proc_tid_smaps_operations; 294extern const struct file_operations proc_tid_smaps_operations;
292extern const struct file_operations proc_clear_refs_operations; 295extern const struct file_operations proc_clear_refs_operations;
293extern const struct file_operations proc_pagemap_operations; 296extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 509a61668d90..cdd979724c74 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
80 show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); 80 show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
81 show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); 81 show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
82 show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); 82 show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
83 show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); 83 show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
84 84
85#ifdef CONFIG_HIGHMEM 85#ifdef CONFIG_HIGHMEM
86 show_val_kb(m, "HighTotal: ", i.totalhigh); 86 show_val_kb(m, "HighTotal: ", i.totalhigh);
@@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
114 show_val_kb(m, "SUnreclaim: ", 114 show_val_kb(m, "SUnreclaim: ",
115 global_node_page_state(NR_SLAB_UNRECLAIMABLE)); 115 global_node_page_state(NR_SLAB_UNRECLAIMABLE));
116 seq_printf(m, "KernelStack: %8lu kB\n", 116 seq_printf(m, "KernelStack: %8lu kB\n",
117 global_page_state(NR_KERNEL_STACK_KB)); 117 global_zone_page_state(NR_KERNEL_STACK_KB));
118 show_val_kb(m, "PageTables: ", 118 show_val_kb(m, "PageTables: ",
119 global_page_state(NR_PAGETABLE)); 119 global_zone_page_state(NR_PAGETABLE));
120#ifdef CONFIG_QUICKLIST 120#ifdef CONFIG_QUICKLIST
121 show_val_kb(m, "Quicklists: ", quicklist_total_size()); 121 show_val_kb(m, "Quicklists: ", quicklist_total_size());
122#endif 122#endif
@@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
124 show_val_kb(m, "NFS_Unstable: ", 124 show_val_kb(m, "NFS_Unstable: ",
125 global_node_page_state(NR_UNSTABLE_NFS)); 125 global_node_page_state(NR_UNSTABLE_NFS));
126 show_val_kb(m, "Bounce: ", 126 show_val_kb(m, "Bounce: ",
127 global_page_state(NR_BOUNCE)); 127 global_zone_page_state(NR_BOUNCE));
128 show_val_kb(m, "WritebackTmp: ", 128 show_val_kb(m, "WritebackTmp: ",
129 global_node_page_state(NR_WRITEBACK_TEMP)); 129 global_node_page_state(NR_WRITEBACK_TEMP));
130 show_val_kb(m, "CommitLimit: ", vm_commit_limit()); 130 show_val_kb(m, "CommitLimit: ", vm_commit_limit());
@@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
151#ifdef CONFIG_CMA 151#ifdef CONFIG_CMA
152 show_val_kb(m, "CmaTotal: ", totalcma_pages); 152 show_val_kb(m, "CmaTotal: ", totalcma_pages);
153 show_val_kb(m, "CmaFree: ", 153 show_val_kb(m, "CmaFree: ",
154 global_page_state(NR_FREE_CMA_PAGES)); 154 global_zone_page_state(NR_FREE_CMA_PAGES));
155#endif 155#endif
156 156
157 hugetlb_report_meminfo(m); 157 hugetlb_report_meminfo(m);
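[Sketch, not part of the patch: the global_page_state() -> global_zone_page_state() rename makes the zone/node split explicit at call sites. Zone counters such as NR_MLOCK go through the zone helper, while node counters keep using global_node_page_state(); the pair below assumes the 4.14-era classification of these counters.]

	unsigned long nr_mlocked = global_zone_page_state(NR_MLOCK);		/* per-zone counter */
	unsigned long nr_wb_tmp  = global_node_page_state(NR_WRITEBACK_TEMP);	/* per-node counter */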
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..a290966f91ec 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
253 if (priv->mm) 253 if (priv->mm)
254 mmdrop(priv->mm); 254 mmdrop(priv->mm);
255 255
256 kfree(priv->rollup);
256 return seq_release_private(inode, file); 257 return seq_release_private(inode, file);
257} 258}
258 259
@@ -279,6 +280,23 @@ static int is_stack(struct proc_maps_private *priv,
279 vma->vm_end >= vma->vm_mm->start_stack; 280 vma->vm_end >= vma->vm_mm->start_stack;
280} 281}
281 282
283static void show_vma_header_prefix(struct seq_file *m,
284 unsigned long start, unsigned long end,
285 vm_flags_t flags, unsigned long long pgoff,
286 dev_t dev, unsigned long ino)
287{
288 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
289 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
290 start,
291 end,
292 flags & VM_READ ? 'r' : '-',
293 flags & VM_WRITE ? 'w' : '-',
294 flags & VM_EXEC ? 'x' : '-',
295 flags & VM_MAYSHARE ? 's' : 'p',
296 pgoff,
297 MAJOR(dev), MINOR(dev), ino);
298}
299
282static void 300static void
283show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) 301show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
284{ 302{
@@ -301,17 +319,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
301 319
302 start = vma->vm_start; 320 start = vma->vm_start;
303 end = vma->vm_end; 321 end = vma->vm_end;
304 322 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
305 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
306 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
307 start,
308 end,
309 flags & VM_READ ? 'r' : '-',
310 flags & VM_WRITE ? 'w' : '-',
311 flags & VM_EXEC ? 'x' : '-',
312 flags & VM_MAYSHARE ? 's' : 'p',
313 pgoff,
314 MAJOR(dev), MINOR(dev), ino);
315 323
316 /* 324 /*
317 * Print the dentry name for named mappings, and a 325 * Print the dentry name for named mappings, and a
@@ -430,6 +438,7 @@ const struct file_operations proc_tid_maps_operations = {
430 438
431#ifdef CONFIG_PROC_PAGE_MONITOR 439#ifdef CONFIG_PROC_PAGE_MONITOR
432struct mem_size_stats { 440struct mem_size_stats {
441 bool first;
433 unsigned long resident; 442 unsigned long resident;
434 unsigned long shared_clean; 443 unsigned long shared_clean;
435 unsigned long shared_dirty; 444 unsigned long shared_dirty;
@@ -443,7 +452,9 @@ struct mem_size_stats {
443 unsigned long swap; 452 unsigned long swap;
444 unsigned long shared_hugetlb; 453 unsigned long shared_hugetlb;
445 unsigned long private_hugetlb; 454 unsigned long private_hugetlb;
455 unsigned long first_vma_start;
446 u64 pss; 456 u64 pss;
457 u64 pss_locked;
447 u64 swap_pss; 458 u64 swap_pss;
448 bool check_shmem_swap; 459 bool check_shmem_swap;
449}; 460};
@@ -652,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
652 [ilog2(VM_NORESERVE)] = "nr", 663 [ilog2(VM_NORESERVE)] = "nr",
653 [ilog2(VM_HUGETLB)] = "ht", 664 [ilog2(VM_HUGETLB)] = "ht",
654 [ilog2(VM_ARCH_1)] = "ar", 665 [ilog2(VM_ARCH_1)] = "ar",
666 [ilog2(VM_WIPEONFORK)] = "wf",
655 [ilog2(VM_DONTDUMP)] = "dd", 667 [ilog2(VM_DONTDUMP)] = "dd",
656#ifdef CONFIG_MEM_SOFT_DIRTY 668#ifdef CONFIG_MEM_SOFT_DIRTY
657 [ilog2(VM_SOFTDIRTY)] = "sd", 669 [ilog2(VM_SOFTDIRTY)] = "sd",
@@ -719,18 +731,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
719 731
720static int show_smap(struct seq_file *m, void *v, int is_pid) 732static int show_smap(struct seq_file *m, void *v, int is_pid)
721{ 733{
734 struct proc_maps_private *priv = m->private;
722 struct vm_area_struct *vma = v; 735 struct vm_area_struct *vma = v;
723 struct mem_size_stats mss; 736 struct mem_size_stats mss_stack;
737 struct mem_size_stats *mss;
724 struct mm_walk smaps_walk = { 738 struct mm_walk smaps_walk = {
725 .pmd_entry = smaps_pte_range, 739 .pmd_entry = smaps_pte_range,
726#ifdef CONFIG_HUGETLB_PAGE 740#ifdef CONFIG_HUGETLB_PAGE
727 .hugetlb_entry = smaps_hugetlb_range, 741 .hugetlb_entry = smaps_hugetlb_range,
728#endif 742#endif
729 .mm = vma->vm_mm, 743 .mm = vma->vm_mm,
730 .private = &mss,
731 }; 744 };
745 int ret = 0;
746 bool rollup_mode;
747 bool last_vma;
748
749 if (priv->rollup) {
750 rollup_mode = true;
751 mss = priv->rollup;
752 if (mss->first) {
753 mss->first_vma_start = vma->vm_start;
754 mss->first = false;
755 }
756 last_vma = !m_next_vma(priv, vma);
757 } else {
758 rollup_mode = false;
759 memset(&mss_stack, 0, sizeof(mss_stack));
760 mss = &mss_stack;
761 }
732 762
733 memset(&mss, 0, sizeof mss); 763 smaps_walk.private = mss;
734 764
735#ifdef CONFIG_SHMEM 765#ifdef CONFIG_SHMEM
736 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { 766 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
@@ -748,9 +778,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
748 778
749 if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || 779 if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
750 !(vma->vm_flags & VM_WRITE)) { 780 !(vma->vm_flags & VM_WRITE)) {
751 mss.swap = shmem_swapped; 781 mss->swap = shmem_swapped;
752 } else { 782 } else {
753 mss.check_shmem_swap = true; 783 mss->check_shmem_swap = true;
754 smaps_walk.pte_hole = smaps_pte_hole; 784 smaps_walk.pte_hole = smaps_pte_hole;
755 } 785 }
756 } 786 }
@@ -758,54 +788,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
758 788
759 /* mmap_sem is held in m_start */ 789 /* mmap_sem is held in m_start */
760 walk_page_vma(vma, &smaps_walk); 790 walk_page_vma(vma, &smaps_walk);
791 if (vma->vm_flags & VM_LOCKED)
792 mss->pss_locked += mss->pss;
793
794 if (!rollup_mode) {
795 show_map_vma(m, vma, is_pid);
796 } else if (last_vma) {
797 show_vma_header_prefix(
798 m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
799 seq_pad(m, ' ');
800 seq_puts(m, "[rollup]\n");
801 } else {
802 ret = SEQ_SKIP;
803 }
761 804
762 show_map_vma(m, vma, is_pid); 805 if (!rollup_mode)
763 806 seq_printf(m,
764 seq_printf(m, 807 "Size: %8lu kB\n"
765 "Size: %8lu kB\n" 808 "KernelPageSize: %8lu kB\n"
766 "Rss: %8lu kB\n" 809 "MMUPageSize: %8lu kB\n",
767 "Pss: %8lu kB\n" 810 (vma->vm_end - vma->vm_start) >> 10,
768 "Shared_Clean: %8lu kB\n" 811 vma_kernel_pagesize(vma) >> 10,
769 "Shared_Dirty: %8lu kB\n" 812 vma_mmu_pagesize(vma) >> 10);
770 "Private_Clean: %8lu kB\n" 813
771 "Private_Dirty: %8lu kB\n" 814
772 "Referenced: %8lu kB\n" 815 if (!rollup_mode || last_vma)
773 "Anonymous: %8lu kB\n" 816 seq_printf(m,
774 "LazyFree: %8lu kB\n" 817 "Rss: %8lu kB\n"
775 "AnonHugePages: %8lu kB\n" 818 "Pss: %8lu kB\n"
776 "ShmemPmdMapped: %8lu kB\n" 819 "Shared_Clean: %8lu kB\n"
777 "Shared_Hugetlb: %8lu kB\n" 820 "Shared_Dirty: %8lu kB\n"
778 "Private_Hugetlb: %7lu kB\n" 821 "Private_Clean: %8lu kB\n"
779 "Swap: %8lu kB\n" 822 "Private_Dirty: %8lu kB\n"
780 "SwapPss: %8lu kB\n" 823 "Referenced: %8lu kB\n"
781 "KernelPageSize: %8lu kB\n" 824 "Anonymous: %8lu kB\n"
782 "MMUPageSize: %8lu kB\n" 825 "LazyFree: %8lu kB\n"
783 "Locked: %8lu kB\n", 826 "AnonHugePages: %8lu kB\n"
784 (vma->vm_end - vma->vm_start) >> 10, 827 "ShmemPmdMapped: %8lu kB\n"
785 mss.resident >> 10, 828 "Shared_Hugetlb: %8lu kB\n"
786 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 829 "Private_Hugetlb: %7lu kB\n"
787 mss.shared_clean >> 10, 830 "Swap: %8lu kB\n"
788 mss.shared_dirty >> 10, 831 "SwapPss: %8lu kB\n"
789 mss.private_clean >> 10, 832 "Locked: %8lu kB\n",
790 mss.private_dirty >> 10, 833 mss->resident >> 10,
791 mss.referenced >> 10, 834 (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
792 mss.anonymous >> 10, 835 mss->shared_clean >> 10,
793 mss.lazyfree >> 10, 836 mss->shared_dirty >> 10,
794 mss.anonymous_thp >> 10, 837 mss->private_clean >> 10,
795 mss.shmem_thp >> 10, 838 mss->private_dirty >> 10,
796 mss.shared_hugetlb >> 10, 839 mss->referenced >> 10,
797 mss.private_hugetlb >> 10, 840 mss->anonymous >> 10,
798 mss.swap >> 10, 841 mss->lazyfree >> 10,
799 (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), 842 mss->anonymous_thp >> 10,
800 vma_kernel_pagesize(vma) >> 10, 843 mss->shmem_thp >> 10,
801 vma_mmu_pagesize(vma) >> 10, 844 mss->shared_hugetlb >> 10,
802 (vma->vm_flags & VM_LOCKED) ? 845 mss->private_hugetlb >> 10,
803 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 846 mss->swap >> 10,
804 847 (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
805 arch_show_smap(m, vma); 848 (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
806 show_smap_vma_flags(m, vma); 849
850 if (!rollup_mode) {
851 arch_show_smap(m, vma);
852 show_smap_vma_flags(m, vma);
853 }
807 m_cache_vma(m, vma); 854 m_cache_vma(m, vma);
808 return 0; 855 return ret;
809} 856}
810 857
811static int show_pid_smap(struct seq_file *m, void *v) 858static int show_pid_smap(struct seq_file *m, void *v)
@@ -837,6 +884,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
837 return do_maps_open(inode, file, &proc_pid_smaps_op); 884 return do_maps_open(inode, file, &proc_pid_smaps_op);
838} 885}
839 886
887static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
888{
889 struct seq_file *seq;
890 struct proc_maps_private *priv;
891 int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
892
893 if (ret < 0)
894 return ret;
895 seq = file->private_data;
896 priv = seq->private;
897 priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
898 if (!priv->rollup) {
899 proc_map_release(inode, file);
900 return -ENOMEM;
901 }
902 priv->rollup->first = true;
903 return 0;
904}
905
840static int tid_smaps_open(struct inode *inode, struct file *file) 906static int tid_smaps_open(struct inode *inode, struct file *file)
841{ 907{
842 return do_maps_open(inode, file, &proc_tid_smaps_op); 908 return do_maps_open(inode, file, &proc_tid_smaps_op);
@@ -849,6 +915,13 @@ const struct file_operations proc_pid_smaps_operations = {
849 .release = proc_map_release, 915 .release = proc_map_release,
850}; 916};
851 917
918const struct file_operations proc_pid_smaps_rollup_operations = {
919 .open = pid_smaps_rollup_open,
920 .read = seq_read,
921 .llseek = seq_lseek,
922 .release = proc_map_release,
923};
924
852const struct file_operations proc_tid_smaps_operations = { 925const struct file_operations proc_tid_smaps_operations = {
853 .open = tid_smaps_open, 926 .open = tid_smaps_open,
854 .read = seq_read, 927 .read = seq_read,
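[Sketch, not part of the patch: /proc/<pid>/smaps_rollup reports the same fields as smaps but pre-summed over all VMAs, so consumers that only want totals avoid parsing one entry per mapping. A user-space example of reading the process-wide Pss; the parsing below is illustrative, not defined by the patch.]

	#include <stdio.h>
	#include <sys/types.h>

	/* Prints the process-wide Pss total from smaps_rollup, if the file exists. */
	static int print_pss_rollup(pid_t pid)
	{
		char path[64], line[256];
		unsigned long kb;
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/smaps_rollup", (int)pid);
		f = fopen(path, "r");
		if (!f)
			return -1;	/* kernel without this patch, or no permission */
		while (fgets(line, sizeof(line), f)) {
			if (sscanf(line, "Pss: %lu kB", &kb) == 1)
				printf("Pss: %lu kB\n", kb);
		}
		fclose(f);
		return 0;
	}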
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2ef7ce75c062..3ac1f2387083 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
228 if (!pages) 228 if (!pages)
229 goto out_free; 229 goto out_free;
230 230
231 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); 231 nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
232 if (nr != lpages) 232 if (nr != lpages)
233 goto out_free_pages; /* leave if some pages were missing */ 233 goto out_free_pages; /* leave if some pages were missing */
234 234
diff --git a/fs/sync.c b/fs/sync.c
index 27d6b8bbcb6a..2e3fd7d94d2d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
335 goto out_put; 335 goto out_put;
336 336
337 mapping = f.file->f_mapping; 337 mapping = f.file->f_mapping;
338 if (!mapping) {
339 ret = -EINVAL;
340 goto out_put;
341 }
342
343 ret = 0; 338 ret = 0;
344 if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { 339 if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
345 ret = file_fdatawait_range(f.file, offset, endbyte); 340 ret = file_fdatawait_range(f.file, offset, endbyte);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 886085b47c75..5419e7da82ba 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
178 178
179static inline struct uffd_msg userfault_msg(unsigned long address, 179static inline struct uffd_msg userfault_msg(unsigned long address,
180 unsigned int flags, 180 unsigned int flags,
181 unsigned long reason) 181 unsigned long reason,
182 unsigned int features)
182{ 183{
183 struct uffd_msg msg; 184 struct uffd_msg msg;
184 msg_init(&msg); 185 msg_init(&msg);
@@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
202 * write protect fault. 203 * write protect fault.
203 */ 204 */
204 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; 205 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
206 if (features & UFFD_FEATURE_THREAD_ID)
207 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
205 return msg; 208 return msg;
206} 209}
207 210
@@ -370,6 +373,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
370 VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); 373 VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
371 VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); 374 VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
372 375
376 if (ctx->features & UFFD_FEATURE_SIGBUS)
377 goto out;
378
373 /* 379 /*
374 * If it's already released don't get it. This avoids to loop 380 * If it's already released don't get it. This avoids to loop
375 * in __get_user_pages if userfaultfd_release waits on the 381 * in __get_user_pages if userfaultfd_release waits on the
@@ -419,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
419 425
420 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 426 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
421 uwq.wq.private = current; 427 uwq.wq.private = current;
422 uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); 428 uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
429 ctx->features);
423 uwq.ctx = ctx; 430 uwq.ctx = ctx;
424 uwq.waken = false; 431 uwq.waken = false;
425 432
@@ -1194,7 +1201,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1194 struct uffdio_register __user *user_uffdio_register; 1201 struct uffdio_register __user *user_uffdio_register;
1195 unsigned long vm_flags, new_flags; 1202 unsigned long vm_flags, new_flags;
1196 bool found; 1203 bool found;
1197 bool non_anon_pages; 1204 bool basic_ioctls;
1198 unsigned long start, end, vma_end; 1205 unsigned long start, end, vma_end;
1199 1206
1200 user_uffdio_register = (struct uffdio_register __user *) arg; 1207 user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1260,7 +1267,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1260 * Search for not compatible vmas. 1267 * Search for not compatible vmas.
1261 */ 1268 */
1262 found = false; 1269 found = false;
1263 non_anon_pages = false; 1270 basic_ioctls = false;
1264 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { 1271 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
1265 cond_resched(); 1272 cond_resched();
1266 1273
@@ -1299,8 +1306,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1299 /* 1306 /*
1300 * Note vmas containing huge pages 1307 * Note vmas containing huge pages
1301 */ 1308 */
1302 if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) 1309 if (is_vm_hugetlb_page(cur))
1303 non_anon_pages = true; 1310 basic_ioctls = true;
1304 1311
1305 found = true; 1312 found = true;
1306 } 1313 }
@@ -1371,7 +1378,7 @@ out_unlock:
1371 * userland which ioctls methods are guaranteed to 1378 * userland which ioctls methods are guaranteed to
1372 * succeed on this range. 1379 * succeed on this range.
1373 */ 1380 */
1374 if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : 1381 if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1375 UFFD_API_RANGE_IOCTLS, 1382 UFFD_API_RANGE_IOCTLS,
1376 &user_uffdio_register->ioctls)) 1383 &user_uffdio_register->ioctls))
1377 ret = -EFAULT; 1384 ret = -EFAULT;
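[Sketch, not part of the patch: both userfaultfd changes are opt-in at UFFDIO_API time. UFFD_FEATURE_THREAD_ID adds the faulting thread's pid to fault messages (msg.arg.pagefault.feat.ptid, as set above), and UFFD_FEATURE_SIGBUS makes missing-page faults in registered ranges raise SIGBUS instead of queueing an event. The handshake below assumes these feature bits are exported by the uapi header in this series.]

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	/* Returns a userfaultfd with the requested features enabled, or -1 on
	 * failure (e.g. a kernel that does not know the requested bits). */
	static int uffd_open(unsigned long long features)
	{
		struct uffdio_api api = { .api = UFFD_API, .features = features };
		int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

		if (fd < 0)
			return -1;
		if (ioctl(fd, UFFDIO_API, &api)) {
			close(fd);
			return -1;
		}
		return fd;
	}

	/* Usage: int fd = uffd_open(UFFD_FEATURE_THREAD_ID | UFFD_FEATURE_SIGBUS); */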
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0debbc7e3f03..ec3e44fcf771 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1101,7 +1101,7 @@ xfs_filemap_pfn_mkwrite(
1101 if (vmf->pgoff >= size) 1101 if (vmf->pgoff >= size)
1102 ret = VM_FAULT_SIGBUS; 1102 ret = VM_FAULT_SIGBUS;
1103 else if (IS_DAX(inode)) 1103 else if (IS_DAX(inode))
1104 ret = dax_pfn_mkwrite(vmf); 1104 ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
1105 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1105 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1106 sb_end_pagefault(inode->i_sb); 1106 sb_end_pagefault(inode->i_sb);
1107 return ret; 1107 return ret;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7b1cf4ba0902..1f0720de8990 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -38,7 +38,15 @@
38#define BIO_BUG_ON 38#define BIO_BUG_ON
39#endif 39#endif
40 40
41#ifdef CONFIG_THP_SWAP
42#if HPAGE_PMD_NR > 256
43#define BIO_MAX_PAGES HPAGE_PMD_NR
44#else
41#define BIO_MAX_PAGES 256 45#define BIO_MAX_PAGES 256
46#endif
47#else
48#define BIO_MAX_PAGES 256
49#endif
42 50
43#define bio_prio(bio) (bio)->bi_ioprio 51#define bio_prio(bio) (bio)->bi_ioprio
44#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) 52#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index df97b7af7e2c..eb0bff6f1eab 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -89,34 +89,6 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
89void dax_write_cache(struct dax_device *dax_dev, bool wc); 89void dax_write_cache(struct dax_device *dax_dev, bool wc);
90bool dax_write_cache_enabled(struct dax_device *dax_dev); 90bool dax_write_cache_enabled(struct dax_device *dax_dev);
91 91
92/*
93 * We use lowest available bit in exceptional entry for locking, one bit for
94 * the entry size (PMD) and two more to tell us if the entry is a huge zero
95 * page (HZP) or an empty entry that is just used for locking. In total four
96 * special bits.
97 *
98 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
99 * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
100 * block allocation.
101 */
102#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
103#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
104#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
105#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
106#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
107
108static inline unsigned long dax_radix_sector(void *entry)
109{
110 return (unsigned long)entry >> RADIX_DAX_SHIFT;
111}
112
113static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
114{
115 return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
116 ((unsigned long)sector << RADIX_DAX_SHIFT) |
117 RADIX_DAX_ENTRY_LOCK);
118}
119
120ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 92ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
121 const struct iomap_ops *ops); 93 const struct iomap_ops *ops);
122int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 94int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
@@ -124,8 +96,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
124int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 96int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
125int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 97int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
126 pgoff_t index); 98 pgoff_t index);
127void dax_wake_mapping_entry_waiter(struct address_space *mapping,
128 pgoff_t index, void *entry, bool wake_all);
129 99
130#ifdef CONFIG_FS_DAX 100#ifdef CONFIG_FS_DAX
131int __dax_zero_page_range(struct block_device *bdev, 101int __dax_zero_page_range(struct block_device *bdev,
@@ -140,21 +110,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
140} 110}
141#endif 111#endif
142 112
143#ifdef CONFIG_FS_DAX_PMD
144static inline unsigned int dax_radix_order(void *entry)
145{
146 if ((unsigned long)entry & RADIX_DAX_PMD)
147 return PMD_SHIFT - PAGE_SHIFT;
148 return 0;
149}
150#else
151static inline unsigned int dax_radix_order(void *entry)
152{
153 return 0;
154}
155#endif
156int dax_pfn_mkwrite(struct vm_fault *vmf);
157
158static inline bool dax_mapping(struct address_space *mapping) 113static inline bool dax_mapping(struct address_space *mapping)
159{ 114{
160 return mapping->host && IS_DAX(mapping->host); 115 return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b744a3456c5..c57002ae6520 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1269,8 +1269,6 @@ extern void f_delown(struct file *filp);
1269extern pid_t f_getown(struct file *filp); 1269extern pid_t f_getown(struct file *filp);
1270extern int send_sigurg(struct fown_struct *fown); 1270extern int send_sigurg(struct fown_struct *fown);
1271 1271
1272struct mm_struct;
1273
1274/* 1272/*
1275 * Umount options 1273 * Umount options
1276 */ 1274 */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 115bb81912cc..f4ff47d4a893 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -143,15 +143,6 @@ struct fscache_cookie_def {
143 void (*mark_page_cached)(void *cookie_netfs_data, 143 void (*mark_page_cached)(void *cookie_netfs_data,
144 struct address_space *mapping, 144 struct address_space *mapping,
145 struct page *page); 145 struct page *page);
146
147 /* indicate the cookie is no longer cached
148 * - this function is called when the backing store currently caching
149 * a cookie is removed
150 * - the netfs should use this to clean up any markers indicating
151 * cached pages
152 * - this is mandatory for any object that may have data
153 */
154 void (*now_uncached)(void *cookie_netfs_data);
155}; 146};
156 147
157/* 148/*
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9b15a4bcfa77..69966c461d1c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -488,8 +488,9 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
488void __unlock_page_memcg(struct mem_cgroup *memcg); 488void __unlock_page_memcg(struct mem_cgroup *memcg);
489void unlock_page_memcg(struct page *page); 489void unlock_page_memcg(struct page *page);
490 490
491/* idx can be of type enum memcg_stat_item or node_stat_item */
491static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, 492static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
492 enum memcg_stat_item idx) 493 int idx)
493{ 494{
494 long val = 0; 495 long val = 0;
495 int cpu; 496 int cpu;
@@ -503,15 +504,17 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
503 return val; 504 return val;
504} 505}
505 506
507/* idx can be of type enum memcg_stat_item or node_stat_item */
506static inline void __mod_memcg_state(struct mem_cgroup *memcg, 508static inline void __mod_memcg_state(struct mem_cgroup *memcg,
507 enum memcg_stat_item idx, int val) 509 int idx, int val)
508{ 510{
509 if (!mem_cgroup_disabled()) 511 if (!mem_cgroup_disabled())
510 __this_cpu_add(memcg->stat->count[idx], val); 512 __this_cpu_add(memcg->stat->count[idx], val);
511} 513}
512 514
515/* idx can be of type enum memcg_stat_item or node_stat_item */
513static inline void mod_memcg_state(struct mem_cgroup *memcg, 516static inline void mod_memcg_state(struct mem_cgroup *memcg,
514 enum memcg_stat_item idx, int val) 517 int idx, int val)
515{ 518{
516 if (!mem_cgroup_disabled()) 519 if (!mem_cgroup_disabled())
517 this_cpu_add(memcg->stat->count[idx], val); 520 this_cpu_add(memcg->stat->count[idx], val);
@@ -535,14 +538,14 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
535 * Kernel pages are an exception to this, since they'll never move. 538 * Kernel pages are an exception to this, since they'll never move.
536 */ 539 */
537static inline void __mod_memcg_page_state(struct page *page, 540static inline void __mod_memcg_page_state(struct page *page,
538 enum memcg_stat_item idx, int val) 541 int idx, int val)
539{ 542{
540 if (page->mem_cgroup) 543 if (page->mem_cgroup)
541 __mod_memcg_state(page->mem_cgroup, idx, val); 544 __mod_memcg_state(page->mem_cgroup, idx, val);
542} 545}
543 546
544static inline void mod_memcg_page_state(struct page *page, 547static inline void mod_memcg_page_state(struct page *page,
545 enum memcg_stat_item idx, int val) 548 int idx, int val)
546{ 549{
547 if (page->mem_cgroup) 550 if (page->mem_cgroup)
548 mod_memcg_state(page->mem_cgroup, idx, val); 551 mod_memcg_state(page->mem_cgroup, idx, val);
@@ -632,8 +635,9 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
632 this_cpu_add(memcg->stat->events[idx], count); 635 this_cpu_add(memcg->stat->events[idx], count);
633} 636}
634 637
638/* idx can be of type enum memcg_stat_item or node_stat_item */
635static inline void count_memcg_page_event(struct page *page, 639static inline void count_memcg_page_event(struct page *page,
636 enum memcg_stat_item idx) 640 int idx)
637{ 641{
638 if (page->mem_cgroup) 642 if (page->mem_cgroup)
639 count_memcg_events(page->mem_cgroup, idx, 1); 643 count_memcg_events(page->mem_cgroup, idx, 1);
@@ -846,31 +850,31 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
846} 850}
847 851
848static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, 852static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
849 enum memcg_stat_item idx) 853 int idx)
850{ 854{
851 return 0; 855 return 0;
852} 856}
853 857
854static inline void __mod_memcg_state(struct mem_cgroup *memcg, 858static inline void __mod_memcg_state(struct mem_cgroup *memcg,
855 enum memcg_stat_item idx, 859 int idx,
856 int nr) 860 int nr)
857{ 861{
858} 862}
859 863
860static inline void mod_memcg_state(struct mem_cgroup *memcg, 864static inline void mod_memcg_state(struct mem_cgroup *memcg,
861 enum memcg_stat_item idx, 865 int idx,
862 int nr) 866 int nr)
863{ 867{
864} 868}
865 869
866static inline void __mod_memcg_page_state(struct page *page, 870static inline void __mod_memcg_page_state(struct page *page,
867 enum memcg_stat_item idx, 871 int idx,
868 int nr) 872 int nr)
869{ 873{
870} 874}
871 875
872static inline void mod_memcg_page_state(struct page *page, 876static inline void mod_memcg_page_state(struct page *page,
873 enum memcg_stat_item idx, 877 int idx,
874 int nr) 878 int nr)
875{ 879{
876} 880}
@@ -924,7 +928,7 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
924} 928}
925 929
926static inline void count_memcg_page_event(struct page *page, 930static inline void count_memcg_page_event(struct page *page,
927 enum memcg_stat_item idx) 931 int idx)
928{ 932{
929} 933}
930 934
@@ -934,26 +938,30 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
934} 938}
935#endif /* CONFIG_MEMCG */ 939#endif /* CONFIG_MEMCG */
936 940
941/* idx can be of type enum memcg_stat_item or node_stat_item */
937static inline void __inc_memcg_state(struct mem_cgroup *memcg, 942static inline void __inc_memcg_state(struct mem_cgroup *memcg,
938 enum memcg_stat_item idx) 943 int idx)
939{ 944{
940 __mod_memcg_state(memcg, idx, 1); 945 __mod_memcg_state(memcg, idx, 1);
941} 946}
942 947
948/* idx can be of type enum memcg_stat_item or node_stat_item */
943static inline void __dec_memcg_state(struct mem_cgroup *memcg, 949static inline void __dec_memcg_state(struct mem_cgroup *memcg,
944 enum memcg_stat_item idx) 950 int idx)
945{ 951{
946 __mod_memcg_state(memcg, idx, -1); 952 __mod_memcg_state(memcg, idx, -1);
947} 953}
948 954
955/* idx can be of type enum memcg_stat_item or node_stat_item */
949static inline void __inc_memcg_page_state(struct page *page, 956static inline void __inc_memcg_page_state(struct page *page,
950 enum memcg_stat_item idx) 957 int idx)
951{ 958{
952 __mod_memcg_page_state(page, idx, 1); 959 __mod_memcg_page_state(page, idx, 1);
953} 960}
954 961
962/* idx can be of type enum memcg_stat_item or node_stat_item */
955static inline void __dec_memcg_page_state(struct page *page, 963static inline void __dec_memcg_page_state(struct page *page,
956 enum memcg_stat_item idx) 964 int idx)
957{ 965{
958 __mod_memcg_page_state(page, idx, -1); 966 __mod_memcg_page_state(page, idx, -1);
959} 967}
@@ -982,26 +990,30 @@ static inline void __dec_lruvec_page_state(struct page *page,
982 __mod_lruvec_page_state(page, idx, -1); 990 __mod_lruvec_page_state(page, idx, -1);
983} 991}
984 992
993/* idx can be of type enum memcg_stat_item or node_stat_item */
985static inline void inc_memcg_state(struct mem_cgroup *memcg, 994static inline void inc_memcg_state(struct mem_cgroup *memcg,
986 enum memcg_stat_item idx) 995 int idx)
987{ 996{
988 mod_memcg_state(memcg, idx, 1); 997 mod_memcg_state(memcg, idx, 1);
989} 998}
990 999
1000/* idx can be of type enum memcg_stat_item or node_stat_item */
991static inline void dec_memcg_state(struct mem_cgroup *memcg, 1001static inline void dec_memcg_state(struct mem_cgroup *memcg,
992 enum memcg_stat_item idx) 1002 int idx)
993{ 1003{
994 mod_memcg_state(memcg, idx, -1); 1004 mod_memcg_state(memcg, idx, -1);
995} 1005}
996 1006
1007/* idx can be of type enum memcg_stat_item or node_stat_item */
997static inline void inc_memcg_page_state(struct page *page, 1008static inline void inc_memcg_page_state(struct page *page,
998 enum memcg_stat_item idx) 1009 int idx)
999{ 1010{
1000 mod_memcg_page_state(page, idx, 1); 1011 mod_memcg_page_state(page, idx, 1);
1001} 1012}
1002 1013
1014/* idx can be of type enum memcg_stat_item or node_stat_item */
1003static inline void dec_memcg_page_state(struct page *page, 1015static inline void dec_memcg_page_state(struct page *page,
1004 enum memcg_stat_item idx) 1016 int idx)
1005{ 1017{
1006 mod_memcg_page_state(page, idx, -1); 1018 mod_memcg_page_state(page, idx, -1);
1007} 1019}
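Since idx is now a plain int, the same memcg accessors accept indices from either statistics namespace. For illustration only (both items already exist elsewhere in the kernel; the surrounding memcg/page context is assumed):

    /* enum memcg_stat_item index */
    mod_memcg_state(memcg, MEMCG_SOCK, 1);
    /* enum node_stat_item index, via the page-based helper */
    mod_memcg_page_state(page, NR_FILE_MAPPED, 1);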
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c8a5056a5ae0..5e6e4cc36ff4 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -319,6 +319,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
319 unsigned long pnum); 319 unsigned long pnum);
320extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, 320extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
321 int online_type); 321 int online_type);
322extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, 322extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
323 unsigned long nr_pages); 323 unsigned long nr_pages);
324#endif /* __LINUX_MEMORY_HOTPLUG_H */ 324#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c1f6c95f3496..39db8e54c5d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp);
189#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 189#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
190#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 190#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
191#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ 191#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
192#define VM_ARCH_2 0x02000000 192#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
193#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ 193#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
194 194
195#ifdef CONFIG_MEM_SOFT_DIRTY 195#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -208,10 +208,12 @@ extern unsigned int kobjsize(const void *objp);
208#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ 208#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
209#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ 209#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
210#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ 210#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
211#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
211#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) 212#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
212#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) 213#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
213#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) 214#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
214#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) 215#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
216#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
215#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ 217#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
216 218
217#if defined(CONFIG_X86) 219#if defined(CONFIG_X86)
@@ -235,9 +237,11 @@ extern unsigned int kobjsize(const void *objp);
235# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ 237# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
236#endif 238#endif
237 239
238#if defined(CONFIG_X86) 240#if defined(CONFIG_X86_INTEL_MPX)
239/* MPX specific bounds table or bounds directory */ 241/* MPX specific bounds table or bounds directory */
240# define VM_MPX VM_ARCH_2 242# define VM_MPX VM_HIGH_ARCH_BIT_4
243#else
244# define VM_MPX VM_NONE
241#endif 245#endif
242 246
243#ifndef VM_GROWSUP 247#ifndef VM_GROWSUP
@@ -2294,6 +2298,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2294 unsigned long pfn, pgprot_t pgprot); 2298 unsigned long pfn, pgprot_t pgprot);
2295int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2299int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2296 pfn_t pfn); 2300 pfn_t pfn);
2301int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
2302 pfn_t pfn);
2297int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 2303int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
2298 2304
2299 2305
@@ -2506,7 +2512,7 @@ enum mf_action_page_type {
2506 2512
2507#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 2513#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
2508extern void clear_huge_page(struct page *page, 2514extern void clear_huge_page(struct page *page,
2509 unsigned long addr, 2515 unsigned long addr_hint,
2510 unsigned int pages_per_huge_page); 2516 unsigned int pages_per_huge_page);
2511extern void copy_user_huge_page(struct page *dst, struct page *src, 2517extern void copy_user_huge_page(struct page *dst, struct page *src,
2512 unsigned long addr, struct vm_area_struct *vma, 2518 unsigned long addr, struct vm_area_struct *vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 57378c7cb5f8..f45ad815b7d7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,6 +335,7 @@ struct vm_area_struct {
335 struct file * vm_file; /* File we map to (can be NULL). */ 335 struct file * vm_file; /* File we map to (can be NULL). */
336 void * vm_private_data; /* was vm_pte (shared mem) */ 336 void * vm_private_data; /* was vm_pte (shared mem) */
337 337
338 atomic_long_t swap_readahead_info;
338#ifndef CONFIG_MMU 339#ifndef CONFIG_MMU
339 struct vm_region *vm_region; /* NOMMU mapping region */ 340 struct vm_region *vm_region; /* NOMMU mapping region */
340#endif 341#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b3f6ce..e7e92c8f4883 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -770,8 +770,7 @@ static inline bool is_dev_zone(const struct zone *zone)
770 770
771#include <linux/memory_hotplug.h> 771#include <linux/memory_hotplug.h>
772 772
773extern struct mutex zonelists_mutex; 773void build_all_zonelists(pg_data_t *pgdat);
774void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
775void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); 774void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
776bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 775bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
777 int classzone_idx, unsigned int alloc_flags, 776 int classzone_idx, unsigned int alloc_flags,
@@ -896,7 +895,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
896extern int numa_zonelist_order_handler(struct ctl_table *, int, 895extern int numa_zonelist_order_handler(struct ctl_table *, int,
897 void __user *, size_t *, loff_t *); 896 void __user *, size_t *, loff_t *);
898extern char numa_zonelist_order[]; 897extern char numa_zonelist_order[];
899#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ 898#define NUMA_ZONELIST_ORDER_LEN 16
900 899
901#ifndef CONFIG_NEED_MULTIPLE_NODES 900#ifndef CONFIG_NEED_MULTIPLE_NODES
902 901
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d33e3280c8ad..ba2d470d2d0a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
303 * Only test-and-set exist for PG_writeback. The unconditional operators are 303 * Only test-and-set exist for PG_writeback. The unconditional operators are
304 * risky: they bypass page accounting. 304 * risky: they bypass page accounting.
305 */ 305 */
306TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) 306TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
307 TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) 307 TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
308PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) 308PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
309 309
310/* PG_readahead is only used for reads; PG_reclaim is only for writes */ 310/* PG_readahead is only used for reads; PG_reclaim is only for writes */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 79b36f57c3ba..5bbd6780f205 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -353,8 +353,16 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
353unsigned find_get_entries(struct address_space *mapping, pgoff_t start, 353unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
354 unsigned int nr_entries, struct page **entries, 354 unsigned int nr_entries, struct page **entries,
355 pgoff_t *indices); 355 pgoff_t *indices);
356unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 356unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
357 unsigned int nr_pages, struct page **pages); 357 pgoff_t end, unsigned int nr_pages,
358 struct page **pages);
359static inline unsigned find_get_pages(struct address_space *mapping,
360 pgoff_t *start, unsigned int nr_pages,
361 struct page **pages)
362{
363 return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
364 pages);
365}
358unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, 366unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
359 unsigned int nr_pages, struct page **pages); 367 unsigned int nr_pages, struct page **pages);
360unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 368unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index b45d391b4540..4dcd5506f1ed 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,8 +27,16 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
27 pgoff_t start, unsigned nr_entries, 27 pgoff_t start, unsigned nr_entries,
28 pgoff_t *indices); 28 pgoff_t *indices);
29void pagevec_remove_exceptionals(struct pagevec *pvec); 29void pagevec_remove_exceptionals(struct pagevec *pvec);
30unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 30unsigned pagevec_lookup_range(struct pagevec *pvec,
31 pgoff_t start, unsigned nr_pages); 31 struct address_space *mapping,
32 pgoff_t *start, pgoff_t end);
33static inline unsigned pagevec_lookup(struct pagevec *pvec,
34 struct address_space *mapping,
35 pgoff_t *start)
36{
37 return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
38}
39
32unsigned pagevec_lookup_tag(struct pagevec *pvec, 40unsigned pagevec_lookup_tag(struct pagevec *pvec,
33 struct address_space *mapping, pgoff_t *index, int tag, 41 struct address_space *mapping, pgoff_t *index, int tag,
34 unsigned nr_pages); 42 unsigned nr_pages);
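pagevec_lookup() is now a wrapper around pagevec_lookup_range() that scans to the end of the mapping and advances *start past the last page returned, so a range walk becomes a loop that stops when the pagevec comes back empty. A hedged sketch of such a caller ('mapping', 'end' and process_page() are placeholders, not part of this patch):

    struct pagevec pvec;
    pgoff_t index = 0;
    int i;

    pagevec_init(&pvec, 0);
    while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
            for (i = 0; i < pagevec_count(&pvec); i++)
                    process_page(pvec.pages[i]);
            pagevec_release(&pvec);
    }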
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2b0a281f9d26..3a19c253bdb1 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
84 84
85/* mmput gets rid of the mappings and all user-space */ 85/* mmput gets rid of the mappings and all user-space */
86extern void mmput(struct mm_struct *); 86extern void mmput(struct mm_struct *);
87#ifdef CONFIG_MMU
88/* same as above but performs the slow path from the async context. Can
89 * be called from the atomic context as well
90 */
91extern void mmput_async(struct mm_struct *);
92#endif
93 87
94/* Grab a reference to a task's mm, if it is not already going away */ 88/* Grab a reference to a task's mm, if it is not already going away */
95extern struct mm_struct *get_task_mm(struct task_struct *task); 89extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 0fb7061ec54c..21a5e6c43385 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -27,23 +27,6 @@ struct shmid_kernel /* private to the kernel */
27/* shm_mode upper byte flags */ 27/* shm_mode upper byte flags */
28#define SHM_DEST 01000 /* segment will be destroyed on last detach */ 28#define SHM_DEST 01000 /* segment will be destroyed on last detach */
29#define SHM_LOCKED 02000 /* segment will not be swapped */ 29#define SHM_LOCKED 02000 /* segment will not be swapped */
30#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
31#define SHM_NORESERVE 010000 /* don't check for reservations */
32
33/* Bits [26:31] are reserved */
34
35/*
36 * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
37 * This gives us 6 bits, which is enough until someone invents 128 bit address
38 * spaces.
39 *
40 * Assume these are all power of twos.
41 * When 0 use the default page size.
42 */
43#define SHM_HUGE_SHIFT 26
44#define SHM_HUGE_MASK 0x3f
45#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
46#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
47 30
48#ifdef CONFIG_SYSVIPC 31#ifdef CONFIG_SYSVIPC
49struct sysv_shm { 32struct sysv_shm {
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index a7d6bd2a918f..b6c3540e07bc 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -137,9 +137,15 @@ extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
137 unsigned long dst_addr, 137 unsigned long dst_addr,
138 unsigned long src_addr, 138 unsigned long src_addr,
139 struct page **pagep); 139 struct page **pagep);
140extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
141 pmd_t *dst_pmd,
142 struct vm_area_struct *dst_vma,
143 unsigned long dst_addr);
140#else 144#else
141#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ 145#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
142 src_addr, pagep) ({ BUG(); 0; }) 146 src_addr, pagep) ({ BUG(); 0; })
147#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
148 dst_addr) ({ BUG(); 0; })
143#endif 149#endif
144 150
145#endif 151#endif
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 4fcacd915d45..51d189615bda 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -18,6 +18,13 @@ struct shrink_control {
18 */ 18 */
19 unsigned long nr_to_scan; 19 unsigned long nr_to_scan;
20 20
21 /*
22 * How many objects did scan_objects process?
23 * This defaults to nr_to_scan before every call, but the callee
24 * should track its actual progress.
25 */
26 unsigned long nr_scanned;
27
21 /* current node being shrunk (for NUMA aware shrinkers) */ 28 /* current node being shrunk (for NUMA aware shrinkers) */
22 int nid; 29 int nid;
23 30
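The new nr_scanned field lets a shrinker report how much work scan_objects actually did; it is pre-seeded with nr_to_scan, so only shrinkers whose progress differs need to set it. A minimal sketch of a scan callback that updates it (my_cache_empty() and my_cache_evict_one() are hypothetical helpers):

    static unsigned long my_cache_scan(struct shrinker *shrink,
                                       struct shrink_control *sc)
    {
            unsigned long freed = 0, scanned = 0;

            while (scanned < sc->nr_to_scan && !my_cache_empty()) {
                    scanned++;
                    freed += my_cache_evict_one();
            }
            sc->nr_scanned = scanned;       /* report actual progress */
            return freed;
    }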
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index cc0faf3a90be..0783b622311e 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -115,6 +115,10 @@ struct kmem_cache {
115#endif 115#endif
116#endif 116#endif
117 117
118#ifdef CONFIG_SLAB_FREELIST_HARDENED
119 unsigned long random;
120#endif
121
118#ifdef CONFIG_NUMA 122#ifdef CONFIG_NUMA
119 /* 123 /*
120 * Defragmentation by allocating from a remote node. 124 * Defragmentation by allocating from a remote node.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d83d28e53e62..8bf3487fb204 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -188,6 +188,7 @@ struct swap_cluster_info {
188}; 188};
189#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ 189#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
190#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ 190#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
191#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
191 192
192/* 193/*
193 * We assign a cluster to each CPU, so each CPU can allocate swap entry from 194 * We assign a cluster to each CPU, so each CPU can allocate swap entry from
@@ -211,7 +212,7 @@ struct swap_info_struct {
211 unsigned long flags; /* SWP_USED etc: see above */ 212 unsigned long flags; /* SWP_USED etc: see above */
212 signed short prio; /* swap priority of this type */ 213 signed short prio; /* swap priority of this type */
213 struct plist_node list; /* entry in swap_active_head */ 214 struct plist_node list; /* entry in swap_active_head */
214 struct plist_node avail_list; /* entry in swap_avail_head */ 215 struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */
215 signed char type; /* strange name for an index */ 216 signed char type; /* strange name for an index */
216 unsigned int max; /* extent of the swap_map */ 217 unsigned int max; /* extent of the swap_map */
217 unsigned char *swap_map; /* vmalloc'ed array of usage counts */ 218 unsigned char *swap_map; /* vmalloc'ed array of usage counts */
@@ -250,6 +251,25 @@ struct swap_info_struct {
250 struct swap_cluster_list discard_clusters; /* discard clusters list */ 251 struct swap_cluster_list discard_clusters; /* discard clusters list */
251}; 252};
252 253
254#ifdef CONFIG_64BIT
255#define SWAP_RA_ORDER_CEILING 5
256#else
257/* Avoid stack overflow, because we need to save part of page table */
258#define SWAP_RA_ORDER_CEILING 3
259#define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING)
260#endif
261
262struct vma_swap_readahead {
263 unsigned short win;
264 unsigned short offset;
265 unsigned short nr_pte;
266#ifdef CONFIG_64BIT
267 pte_t *ptes;
268#else
269 pte_t ptes[SWAP_RA_PTE_CACHE_SIZE];
270#endif
271};
272
253/* linux/mm/workingset.c */ 273/* linux/mm/workingset.c */
254void *workingset_eviction(struct address_space *mapping, struct page *page); 274void *workingset_eviction(struct address_space *mapping, struct page *page);
255bool workingset_refault(void *shadow); 275bool workingset_refault(void *shadow);
@@ -262,8 +282,8 @@ extern unsigned long totalreserve_pages;
262extern unsigned long nr_free_buffer_pages(void); 282extern unsigned long nr_free_buffer_pages(void);
263extern unsigned long nr_free_pagecache_pages(void); 283extern unsigned long nr_free_pagecache_pages(void);
264 284
265/* Definition of global_page_state not available yet */ 285/* Definition of global_zone_page_state not available yet */
266#define nr_free_pages() global_page_state(NR_FREE_PAGES) 286#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
267 287
268 288
269/* linux/mm/swap.c */ 289/* linux/mm/swap.c */
@@ -349,6 +369,7 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
349#define SWAP_ADDRESS_SPACE_SHIFT 14 369#define SWAP_ADDRESS_SPACE_SHIFT 14
350#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) 370#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
351extern struct address_space *swapper_spaces[]; 371extern struct address_space *swapper_spaces[];
372extern bool swap_vma_readahead;
352#define swap_address_space(entry) \ 373#define swap_address_space(entry) \
353 (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ 374 (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
354 >> SWAP_ADDRESS_SPACE_SHIFT]) 375 >> SWAP_ADDRESS_SPACE_SHIFT])
@@ -361,7 +382,9 @@ extern void __delete_from_swap_cache(struct page *);
361extern void delete_from_swap_cache(struct page *); 382extern void delete_from_swap_cache(struct page *);
362extern void free_page_and_swap_cache(struct page *); 383extern void free_page_and_swap_cache(struct page *);
363extern void free_pages_and_swap_cache(struct page **, int); 384extern void free_pages_and_swap_cache(struct page **, int);
364extern struct page *lookup_swap_cache(swp_entry_t); 385extern struct page *lookup_swap_cache(swp_entry_t entry,
386 struct vm_area_struct *vma,
387 unsigned long addr);
365extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, 388extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
366 struct vm_area_struct *vma, unsigned long addr, 389 struct vm_area_struct *vma, unsigned long addr,
367 bool do_poll); 390 bool do_poll);
@@ -371,11 +394,23 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
371extern struct page *swapin_readahead(swp_entry_t, gfp_t, 394extern struct page *swapin_readahead(swp_entry_t, gfp_t,
372 struct vm_area_struct *vma, unsigned long addr); 395 struct vm_area_struct *vma, unsigned long addr);
373 396
397extern struct page *swap_readahead_detect(struct vm_fault *vmf,
398 struct vma_swap_readahead *swap_ra);
399extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
400 struct vm_fault *vmf,
401 struct vma_swap_readahead *swap_ra);
402
374/* linux/mm/swapfile.c */ 403/* linux/mm/swapfile.c */
375extern atomic_long_t nr_swap_pages; 404extern atomic_long_t nr_swap_pages;
376extern long total_swap_pages; 405extern long total_swap_pages;
406extern atomic_t nr_rotate_swap;
377extern bool has_usable_swap(void); 407extern bool has_usable_swap(void);
378 408
409static inline bool swap_use_vma_readahead(void)
410{
411 return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap);
412}
413
379/* Swap 50% full? Release swapcache more aggressively.. */ 414/* Swap 50% full? Release swapcache more aggressively.. */
380static inline bool vm_swap_full(void) 415static inline bool vm_swap_full(void)
381{ 416{
@@ -465,12 +500,32 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
465 return NULL; 500 return NULL;
466} 501}
467 502
503static inline bool swap_use_vma_readahead(void)
504{
505 return false;
506}
507
508static inline struct page *swap_readahead_detect(
509 struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
510{
511 return NULL;
512}
513
514static inline struct page *do_swap_page_readahead(
515 swp_entry_t fentry, gfp_t gfp_mask,
516 struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
517{
518 return NULL;
519}
520
468static inline int swap_writepage(struct page *p, struct writeback_control *wbc) 521static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
469{ 522{
470 return 0; 523 return 0;
471} 524}
472 525
473static inline struct page *lookup_swap_cache(swp_entry_t swp) 526static inline struct page *lookup_swap_cache(swp_entry_t swp,
527 struct vm_area_struct *vma,
528 unsigned long addr)
474{ 529{
475 return NULL; 530 return NULL;
476} 531}
@@ -509,8 +564,8 @@ static inline int swp_swapcount(swp_entry_t entry)
509 return 0; 564 return 0;
510} 565}
511 566
512#define reuse_swap_page(page, total_mapcount) \ 567#define reuse_swap_page(page, total_map_swapcount) \
513 (page_trans_huge_mapcount(page, total_mapcount) == 1) 568 (page_trans_huge_mapcount(page, total_map_swapcount) == 1)
514 569
515static inline int try_to_free_swap(struct page *page) 570static inline int try_to_free_swap(struct page *page)
516{ 571{
@@ -526,6 +581,15 @@ static inline swp_entry_t get_swap_page(struct page *page)
526 581
527#endif /* CONFIG_SWAP */ 582#endif /* CONFIG_SWAP */
528 583
584#ifdef CONFIG_THP_SWAP
585extern int split_swap_cluster(swp_entry_t entry);
586#else
587static inline int split_swap_cluster(swp_entry_t entry)
588{
589 return 0;
590}
591#endif
592
529#ifdef CONFIG_MEMCG 593#ifdef CONFIG_MEMCG
530static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) 594static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
531{ 595{
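swap_use_vma_readahead() gates the new VMA-based readahead on the global swap_vma_readahead knob and on the absence of rotational swap devices (nr_rotate_swap). Roughly, a swap-in path can choose a strategy as sketched below; this is illustrative only and not a copy of do_swap_page() ('entry' is the faulting swap entry):

    struct vma_swap_readahead swap_ra;
    struct page *page;

    if (swap_use_vma_readahead())
            page = do_swap_page_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                          vmf, &swap_ra);
    else
            page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                    vmf->vma, vmf->address);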
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31a4632..d77bc35278b0 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -85,6 +85,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
85#endif 85#endif
86 THP_ZERO_PAGE_ALLOC, 86 THP_ZERO_PAGE_ALLOC,
87 THP_ZERO_PAGE_ALLOC_FAILED, 87 THP_ZERO_PAGE_ALLOC_FAILED,
88 THP_SWPOUT,
89 THP_SWPOUT_FALLBACK,
88#endif 90#endif
89#ifdef CONFIG_MEMORY_BALLOON 91#ifdef CONFIG_MEMORY_BALLOON
90 BALLOON_INFLATE, 92 BALLOON_INFLATE,
@@ -104,6 +106,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
104 VMACACHE_FIND_HITS, 106 VMACACHE_FIND_HITS,
105 VMACACHE_FULL_FLUSHES, 107 VMACACHE_FULL_FLUSHES,
106#endif 108#endif
109#ifdef CONFIG_SWAP
110 SWAP_RA,
111 SWAP_RA_HIT,
112#endif
107 NR_VM_EVENT_ITEMS 113 NR_VM_EVENT_ITEMS
108}; 114};
109 115
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f30d424..97e11ab573f0 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -123,7 +123,7 @@ static inline void node_page_state_add(long x, struct pglist_data *pgdat,
123 atomic_long_add(x, &vm_node_stat[item]); 123 atomic_long_add(x, &vm_node_stat[item]);
124} 124}
125 125
126static inline unsigned long global_page_state(enum zone_stat_item item) 126static inline unsigned long global_zone_page_state(enum zone_stat_item item)
127{ 127{
128 long x = atomic_long_read(&vm_zone_stat[item]); 128 long x = atomic_long_read(&vm_zone_stat[item]);
129#ifdef CONFIG_SMP 129#ifdef CONFIG_SMP
@@ -199,7 +199,7 @@ extern unsigned long sum_zone_node_page_state(int node,
199extern unsigned long node_page_state(struct pglist_data *pgdat, 199extern unsigned long node_page_state(struct pglist_data *pgdat,
200 enum node_stat_item item); 200 enum node_stat_item item);
201#else 201#else
202#define sum_zone_node_page_state(node, item) global_page_state(item) 202#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
203#define node_page_state(node, item) global_node_page_state(item) 203#define node_page_state(node, item) global_node_page_state(item)
204#endif /* CONFIG_NUMA */ 204#endif /* CONFIG_NUMA */
205 205
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 08bb3ed18dcc..fbc4a06f7310 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -190,8 +190,6 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
190 190
191DEFINE_PTE_FAULT_EVENT(dax_pte_fault); 191DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
192DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); 192DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
193DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);
194DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
195DEFINE_PTE_FAULT_EVENT(dax_load_hole); 193DEFINE_PTE_FAULT_EVENT(dax_load_hole);
196 194
197TRACE_EVENT(dax_insert_mapping, 195TRACE_EVENT(dax_insert_mapping,
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 8e50d01c645f..4c2e4737d7bc 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
125#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } 125#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" }
126#endif 126#endif
127 127
128#if defined(CONFIG_X86)
129#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" }
130#else
131#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" }
132#endif
133
134#ifdef CONFIG_MEM_SOFT_DIRTY 128#ifdef CONFIG_MEM_SOFT_DIRTY
135#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, 129#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
136#else 130#else
@@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
162 {VM_NORESERVE, "noreserve" }, \ 156 {VM_NORESERVE, "noreserve" }, \
163 {VM_HUGETLB, "hugetlb" }, \ 157 {VM_HUGETLB, "hugetlb" }, \
164 __VM_ARCH_SPECIFIC_1 , \ 158 __VM_ARCH_SPECIFIC_1 , \
165 __VM_ARCH_SPECIFIC_2 , \ 159 {VM_WIPEONFORK, "wipeonfork" }, \
166 {VM_DONTDUMP, "dontdump" }, \ 160 {VM_DONTDUMP, "dontdump" }, \
167IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ 161IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
168 {VM_MIXEDMAP, "mixedmap" }, \ 162 {VM_MIXEDMAP, "mixedmap" }, \
diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h
new file mode 100644
index 000000000000..e4732d3c2998
--- /dev/null
+++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_
2#define _ASM_GENERIC_HUGETLB_ENCODE_H_
3
4/*
5 * Several system calls take a flag to request "hugetlb" huge pages.
6 * Without further specification, these system calls will use the
7 * system's default huge page size. If a system supports multiple
8 * huge page sizes, the desired huge page size can be specified in
9 * bits [26:31] of the flag arguments. The value in these 6 bits
10 * will encode the log2 of the huge page size.
11 *
12 * The following definitions are associated with this huge page size
13 * encoding in flag arguments. System call specific header files
14 * that use this encoding should include this file. They can then
 15 * provide definitions based on these with their own specific prefix,
 16 * for example:
17 * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
18 */
19
20#define HUGETLB_FLAG_ENCODE_SHIFT 26
21#define HUGETLB_FLAG_ENCODE_MASK 0x3f
22
23#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT)
24#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT)
25#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT)
26#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT)
27#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT)
28#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT)
29#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT)
30#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT)
31#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT)
32#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT)
33
34#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */
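The encodings are simply log2(size) shifted into bits [26:31]; for example 2MB = 2^21, so HUGETLB_FLAG_ENCODE_2MB is 21 << HUGETLB_FLAG_ENCODE_SHIFT. Decoding goes the other way (a minimal sketch; 'flags' is any flag word built with these definitions):

    unsigned int shift = (flags >> HUGETLB_FLAG_ENCODE_SHIFT) &
                         HUGETLB_FLAG_ENCODE_MASK;
    unsigned long size = shift ? 1UL << shift : 0;  /* 0: default huge page size */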
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 8c27db0c5c08..203268f9231e 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -58,20 +58,12 @@
58 overrides the coredump filter bits */ 58 overrides the coredump filter bits */
59#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ 59#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */
60 60
61#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
62#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
63
61/* compatibility flags */ 64/* compatibility flags */
62#define MAP_FILE 0 65#define MAP_FILE 0
63 66
64/*
65 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
66 * This gives us 6 bits, which is enough until someone invents 128 bit address
67 * spaces.
68 *
69 * Assume these are all power of twos.
70 * When 0 use the default page size.
71 */
72#define MAP_HUGE_SHIFT 26
73#define MAP_HUGE_MASK 0x3f
74
75#define PKEY_DISABLE_ACCESS 0x1 67#define PKEY_DISABLE_ACCESS 0x1
76#define PKEY_DISABLE_WRITE 0x2 68#define PKEY_DISABLE_WRITE 0x2
77#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 69#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
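MADV_WIPEONFORK marks a private anonymous mapping so that, after fork(), the child reads zeroes from it while the parent keeps its contents; MADV_KEEPONFORK clears the mark again. A minimal userspace sketch (error handling omitted; assumes headers that already define MADV_WIPEONFORK):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 1 << 20;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            p[0] = 42;
            madvise(p, len, MADV_WIPEONFORK);
            if (fork() == 0)
                    _exit(p[0]);    /* exits with 0: the child sees zeroes */
            return 0;               /* the parent still sees 42 in p[0] */
    }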
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 534e364bda92..7f3a722dbd72 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -1,8 +1,32 @@
1#ifndef _UAPI_LINUX_MEMFD_H 1#ifndef _UAPI_LINUX_MEMFD_H
2#define _UAPI_LINUX_MEMFD_H 2#define _UAPI_LINUX_MEMFD_H
3 3
4#include <asm-generic/hugetlb_encode.h>
5
4/* flags for memfd_create(2) (unsigned int) */ 6/* flags for memfd_create(2) (unsigned int) */
5#define MFD_CLOEXEC 0x0001U 7#define MFD_CLOEXEC 0x0001U
6#define MFD_ALLOW_SEALING 0x0002U 8#define MFD_ALLOW_SEALING 0x0002U
9#define MFD_HUGETLB 0x0004U
10
11/*
12 * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
13 * size other than the default is desired. See hugetlb_encode.h.
14 * All known huge page size encodings are provided here. It is the
15 * responsibility of the application to know which sizes are supported on
16 * the running system. See mmap(2) man page for details.
17 */
18#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
19#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
20
21#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
22#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
23#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
24#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
25#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
26#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
27#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
28#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
29#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
30#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
7 31
8#endif /* _UAPI_LINUX_MEMFD_H */ 32#endif /* _UAPI_LINUX_MEMFD_H */
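MFD_HUGETLB makes the memfd hugetlbfs-backed, and one of the MFD_HUGE_* values can be OR-ed in to select a non-default huge page size. A hedged sketch using the raw syscall (glibc of this era does not yet wrap memfd_create(); the name string is arbitrary):

    #include <linux/memfd.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int fd = syscall(__NR_memfd_create, "huge-buffer",
                     MFD_CLOEXEC | MFD_HUGETLB | MFD_HUGE_2MB);
    /* subsequent ftruncate()/mmap() sizes must be multiples of 2MB */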
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd3a90c..a937480d7cd3 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -2,6 +2,7 @@
2#define _UAPI_LINUX_MMAN_H 2#define _UAPI_LINUX_MMAN_H
3 3
4#include <asm/mman.h> 4#include <asm/mman.h>
5#include <asm-generic/hugetlb_encode.h>
5 6
6#define MREMAP_MAYMOVE 1 7#define MREMAP_MAYMOVE 1
7#define MREMAP_FIXED 2 8#define MREMAP_FIXED 2
@@ -10,4 +11,25 @@
10#define OVERCOMMIT_ALWAYS 1 11#define OVERCOMMIT_ALWAYS 1
11#define OVERCOMMIT_NEVER 2 12#define OVERCOMMIT_NEVER 2
12 13
14/*
15 * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
16 * size other than the default is desired. See hugetlb_encode.h.
17 * All known huge page size encodings are provided here. It is the
18 * responsibility of the application to know which sizes are supported on
19 * the running system. See mmap(2) man page for details.
20 */
21#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
22#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
23
24#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
25#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
26#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
27#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
28#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
29#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
30#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
31#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
32#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
33#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
34
13#endif /* _UAPI_LINUX_MMAN_H */ 35#endif /* _UAPI_LINUX_MMAN_H */
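The same encodings apply to mmap(MAP_HUGETLB); for example, requesting 1GB pages for an anonymous mapping (a sketch; it fails unless 1GB huge pages are configured and available):

    void *p = mmap(NULL, 1UL << 30, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB,
                   -1, 0);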
diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h
index 1fbf24ea37fd..cf23c873719d 100644
--- a/include/uapi/linux/shm.h
+++ b/include/uapi/linux/shm.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/ipc.h> 4#include <linux/ipc.h>
5#include <linux/errno.h> 5#include <linux/errno.h>
6#include <asm-generic/hugetlb_encode.h>
6#ifndef __KERNEL__ 7#ifndef __KERNEL__
7#include <unistd.h> 8#include <unistd.h>
8#endif 9#endif
@@ -40,11 +41,37 @@ struct shmid_ds {
40/* Include the definition of shmid64_ds and shminfo64 */ 41/* Include the definition of shmid64_ds and shminfo64 */
41#include <asm/shmbuf.h> 42#include <asm/shmbuf.h>
42 43
43/* permission flag for shmget */ 44/*
45 * shmget() shmflg values.
46 */
47/* The bottom nine bits are the same as open(2) mode flags */
44#define SHM_R 0400 /* or S_IRUGO from <linux/stat.h> */ 48#define SHM_R 0400 /* or S_IRUGO from <linux/stat.h> */
45#define SHM_W 0200 /* or S_IWUGO from <linux/stat.h> */ 49#define SHM_W 0200 /* or S_IWUGO from <linux/stat.h> */
50/* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */
51#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
52#define SHM_NORESERVE 010000 /* don't check for reservations */
53
54/*
55 * Huge page size encoding when SHM_HUGETLB is specified, and a huge page
56 * size other than the default is desired. See hugetlb_encode.h
57 */
58#define SHM_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
59#define SHM_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
60
61#define SHM_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
62#define SHM_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
63#define SHM_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
64#define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
65#define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
66#define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
67#define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
68#define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
69#define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
70#define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
46 71
47/* mode for attach */ 72/*
73 * shmat() shmflg values
74 */
48#define SHM_RDONLY 010000 /* read-only access */ 75#define SHM_RDONLY 010000 /* read-only access */
49#define SHM_RND 020000 /* round attach address to SHMLBA boundary */ 76#define SHM_RND 020000 /* round attach address to SHMLBA boundary */
50#define SHM_REMAP 040000 /* take-over region on attach */ 77#define SHM_REMAP 040000 /* take-over region on attach */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 3b059530dac9..d6d1f65cb3c3 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -23,7 +23,9 @@
23 UFFD_FEATURE_EVENT_REMOVE | \ 23 UFFD_FEATURE_EVENT_REMOVE | \
24 UFFD_FEATURE_EVENT_UNMAP | \ 24 UFFD_FEATURE_EVENT_UNMAP | \
25 UFFD_FEATURE_MISSING_HUGETLBFS | \ 25 UFFD_FEATURE_MISSING_HUGETLBFS | \
26 UFFD_FEATURE_MISSING_SHMEM) 26 UFFD_FEATURE_MISSING_SHMEM | \
27 UFFD_FEATURE_SIGBUS | \
28 UFFD_FEATURE_THREAD_ID)
27#define UFFD_API_IOCTLS \ 29#define UFFD_API_IOCTLS \
28 ((__u64)1 << _UFFDIO_REGISTER | \ 30 ((__u64)1 << _UFFDIO_REGISTER | \
29 (__u64)1 << _UFFDIO_UNREGISTER | \ 31 (__u64)1 << _UFFDIO_UNREGISTER | \
@@ -78,6 +80,9 @@ struct uffd_msg {
78 struct { 80 struct {
79 __u64 flags; 81 __u64 flags;
80 __u64 address; 82 __u64 address;
83 union {
84 __u32 ptid;
85 } feat;
81 } pagefault; 86 } pagefault;
82 87
83 struct { 88 struct {
@@ -153,6 +158,13 @@ struct uffdio_api {
153 * UFFD_FEATURE_MISSING_SHMEM works the same as 158 * UFFD_FEATURE_MISSING_SHMEM works the same as
154 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem 159 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
155 * (i.e. tmpfs and other shmem based APIs). 160 * (i.e. tmpfs and other shmem based APIs).
161 *
 162 * UFFD_FEATURE_SIGBUS means that no page-fault
 163 * (UFFD_EVENT_PAGEFAULT) event will be delivered; instead,
 164 * a SIGBUS signal will be sent to the faulting process.
 165 *
 166 * UFFD_FEATURE_THREAD_ID means the pid of the faulting task_struct
 167 * will be returned; if the feature is not requested, 0 is returned.
156 */ 168 */
157#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) 169#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
158#define UFFD_FEATURE_EVENT_FORK (1<<1) 170#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -161,6 +173,8 @@ struct uffdio_api {
161#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) 173#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
162#define UFFD_FEATURE_MISSING_SHMEM (1<<5) 174#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
163#define UFFD_FEATURE_EVENT_UNMAP (1<<6) 175#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
176#define UFFD_FEATURE_SIGBUS (1<<7)
177#define UFFD_FEATURE_THREAD_ID (1<<8)
164 __u64 features; 178 __u64 features;
165 179
166 __u64 ioctls; 180 __u64 ioctls;
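With UFFD_FEATURE_SIGBUS, a missing-page fault in a registered range raises SIGBUS in the faulting task instead of queueing a UFFD_EVENT_PAGEFAULT for the monitor. A hedged sketch of requesting the new features during the UFFDIO_API handshake (error handling omitted):

    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api = {
            .api = UFFD_API,
            .features = UFFD_FEATURE_SIGBUS | UFFD_FEATURE_THREAD_ID,
    };
    ioctl(uffd, UFFDIO_API, &api);  /* api.features echoes what was granted */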
diff --git a/init/Kconfig b/init/Kconfig
index 5f0ef850e808..78cb2461012e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1576,6 +1576,15 @@ config SLAB_FREELIST_RANDOM
1576 security feature reduces the predictability of the kernel slab 1576 security feature reduces the predictability of the kernel slab
1577 allocator against heap overflows. 1577 allocator against heap overflows.
1578 1578
1579config SLAB_FREELIST_HARDENED
1580 bool "Harden slab freelist metadata"
1581 depends on SLUB
1582 help
1583 Many kernel heap attacks try to target slab cache metadata and
1584 other infrastructure. This option makes minor performance
1585 sacrifices to harden the kernel slab allocator against common
1586 freelist exploit methods.
1587
1579config SLUB_CPU_PARTIAL 1588config SLUB_CPU_PARTIAL
1580 default y 1589 default y
1581 depends on SLUB && SMP 1590 depends on SLUB && SMP
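The SLAB_FREELIST_HARDENED option above pairs with the new per-cache 'unsigned long random' field in struct kmem_cache: SLUB mixes that secret (and the pointer's storage location) into every stored freelist pointer, so a leaked or attacker-overwritten freelist entry no longer decodes to a usable kernel address. A sketch of the idea, not necessarily the exact kernel code:

    static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
                                     unsigned long ptr_addr)
    {
    #ifdef CONFIG_SLAB_FREELIST_HARDENED
            return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
    #else
            return ptr;
    #endif
    }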
diff --git a/init/main.c b/init/main.c
index 8828fc148670..a21a1a8708a8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -542,7 +542,7 @@ asmlinkage __visible void __init start_kernel(void)
542 boot_cpu_state_init(); 542 boot_cpu_state_init();
543 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 543 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
544 544
545 build_all_zonelists(NULL, NULL); 545 build_all_zonelists(NULL);
546 page_alloc_init(); 546 page_alloc_init();
547 547
548 pr_notice("Kernel command line: %s\n", boot_command_line); 548 pr_notice("Kernel command line: %s\n", boot_command_line);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df2e0f14a95d..f64fc967a9ef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4100,9 +4100,6 @@ static void offline_css(struct cgroup_subsys_state *css)
4100 if (!(css->flags & CSS_ONLINE)) 4100 if (!(css->flags & CSS_ONLINE))
4101 return; 4101 return;
4102 4102
4103 if (ss->css_reset)
4104 ss->css_reset(css);
4105
4106 if (ss->css_offline) 4103 if (ss->css_offline)
4107 ss->css_offline(css); 4104 ss->css_offline(css);
4108 4105
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039bafebb..e7485786db9b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,6 +56,7 @@
56#include <linux/time64.h> 56#include <linux/time64.h>
57#include <linux/backing-dev.h> 57#include <linux/backing-dev.h>
58#include <linux/sort.h> 58#include <linux/sort.h>
59#include <linux/oom.h>
59 60
60#include <linux/uaccess.h> 61#include <linux/uaccess.h>
61#include <linux/atomic.h> 62#include <linux/atomic.h>
@@ -2500,12 +2501,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2500 * If we're in interrupt, yes, we can always allocate. If @node is set in 2501 * If we're in interrupt, yes, we can always allocate. If @node is set in
2501 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this 2502 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
2502 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, 2503 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
2503 * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. 2504 * yes. If current has access to memory reserves as an oom victim, yes.
2504 * Otherwise, no. 2505 * Otherwise, no.
2505 * 2506 *
2506 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2507 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2507 * and do not allow allocations outside the current tasks cpuset 2508 * and do not allow allocations outside the current tasks cpuset
2508 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2509 * unless the task has been OOM killed.
2509 * GFP_KERNEL allocations are not so marked, so can escape to the 2510 * GFP_KERNEL allocations are not so marked, so can escape to the
2510 * nearest enclosing hardwalled ancestor cpuset. 2511 * nearest enclosing hardwalled ancestor cpuset.
2511 * 2512 *
@@ -2528,7 +2529,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2528 * affect that: 2529 * affect that:
2529 * in_interrupt - any node ok (current task context irrelevant) 2530 * in_interrupt - any node ok (current task context irrelevant)
2530 * GFP_ATOMIC - any node ok 2531 * GFP_ATOMIC - any node ok
2531 * TIF_MEMDIE - any node ok 2532 * tsk_is_oom_victim - any node ok
2532 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2533 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2533 * GFP_USER - only nodes in current tasks mems allowed ok. 2534 * GFP_USER - only nodes in current tasks mems allowed ok.
2534 */ 2535 */
@@ -2546,7 +2547,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
2546 * Allow tasks that have access to memory reserves because they have 2547 * Allow tasks that have access to memory reserves because they have
2547 * been OOM killed to get memory anywhere. 2548 * been OOM killed to get memory anywhere.
2548 */ 2549 */
2549 if (unlikely(test_thread_flag(TIF_MEMDIE))) 2550 if (unlikely(tsk_is_oom_victim(current)))
2550 return true; 2551 return true;
2551 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2552 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
2552 return false; 2553 return false;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e5345c07344..24a4c0be80d5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
657 retval = dup_userfaultfd(tmp, &uf); 657 retval = dup_userfaultfd(tmp, &uf);
658 if (retval) 658 if (retval)
659 goto fail_nomem_anon_vma_fork; 659 goto fail_nomem_anon_vma_fork;
660 if (anon_vma_fork(tmp, mpnt)) 660 if (tmp->vm_flags & VM_WIPEONFORK) {
661 /* VM_WIPEONFORK gets a clean slate in the child. */
662 tmp->anon_vma = NULL;
663 if (anon_vma_prepare(tmp))
664 goto fail_nomem_anon_vma_fork;
665 } else if (anon_vma_fork(tmp, mpnt))
661 goto fail_nomem_anon_vma_fork; 666 goto fail_nomem_anon_vma_fork;
662 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); 667 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
663 tmp->vm_next = tmp->vm_prev = NULL; 668 tmp->vm_next = tmp->vm_prev = NULL;
@@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
701 rb_parent = &tmp->vm_rb; 706 rb_parent = &tmp->vm_rb;
702 707
703 mm->map_count++; 708 mm->map_count++;
704 retval = copy_page_range(mm, oldmm, mpnt); 709 if (!(tmp->vm_flags & VM_WIPEONFORK))
710 retval = copy_page_range(mm, oldmm, mpnt);
705 711
706 if (tmp->vm_ops && tmp->vm_ops->open) 712 if (tmp->vm_ops && tmp->vm_ops->open)
707 tmp->vm_ops->open(tmp); 713 tmp->vm_ops->open(tmp);
@@ -922,7 +928,6 @@ static inline void __mmput(struct mm_struct *mm)
922 } 928 }
923 if (mm->binfmt) 929 if (mm->binfmt)
924 module_put(mm->binfmt->module); 930 module_put(mm->binfmt->module);
925 set_bit(MMF_OOM_SKIP, &mm->flags);
926 mmdrop(mm); 931 mmdrop(mm);
927} 932}
928 933
@@ -938,22 +943,6 @@ void mmput(struct mm_struct *mm)
938} 943}
939EXPORT_SYMBOL_GPL(mmput); 944EXPORT_SYMBOL_GPL(mmput);
940 945
941#ifdef CONFIG_MMU
942static void mmput_async_fn(struct work_struct *work)
943{
944 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
945 __mmput(mm);
946}
947
948void mmput_async(struct mm_struct *mm)
949{
950 if (atomic_dec_and_test(&mm->mm_users)) {
951 INIT_WORK(&mm->async_put_work, mmput_async_fn);
952 schedule_work(&mm->async_put_work);
953 }
954}
955#endif
956
957/** 946/**
958 * set_mm_exe_file - change a reference to the mm's executable file 947 * set_mm_exe_file - change a reference to the mm's executable file
959 * 948 *
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9afdc434fb49..066e73c2fcc9 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -194,18 +194,41 @@ struct page_map {
194 struct vmem_altmap altmap; 194 struct vmem_altmap altmap;
195}; 195};
196 196
197static void pgmap_radix_release(struct resource *res) 197static unsigned long order_at(struct resource *res, unsigned long pgoff)
198{ 198{
199 resource_size_t key, align_start, align_size, align_end; 199 unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
200 unsigned long nr_pages, mask;
200 201
201 align_start = res->start & ~(SECTION_SIZE - 1); 202 nr_pages = PHYS_PFN(resource_size(res));
202 align_size = ALIGN(resource_size(res), SECTION_SIZE); 203 if (nr_pages == pgoff)
203 align_end = align_start + align_size - 1; 204 return ULONG_MAX;
205
206 /*
207 * What is the largest aligned power-of-2 range available from
208 * this resource pgoff to the end of the resource range,
209 * considering the alignment of the current pgoff?
210 */
211 mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
212 if (!mask)
213 return ULONG_MAX;
214
215 return find_first_bit(&mask, BITS_PER_LONG);
216}
217
218#define foreach_order_pgoff(res, order, pgoff) \
219 for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
220 pgoff += 1UL << order, order = order_at((res), pgoff))
221
222static void pgmap_radix_release(struct resource *res)
223{
224 unsigned long pgoff, order;
204 225
205 mutex_lock(&pgmap_lock); 226 mutex_lock(&pgmap_lock);
206 for (key = res->start; key <= res->end; key += SECTION_SIZE) 227 foreach_order_pgoff(res, order, pgoff)
207 radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); 228 radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
208 mutex_unlock(&pgmap_lock); 229 mutex_unlock(&pgmap_lock);
230
231 synchronize_rcu();
209} 232}
210 233
211static unsigned long pfn_first(struct page_map *page_map) 234static unsigned long pfn_first(struct page_map *page_map)
@@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
268 291
269 WARN_ON_ONCE(!rcu_read_lock_held()); 292 WARN_ON_ONCE(!rcu_read_lock_held());
270 293
271 page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); 294 page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
272 return page_map ? &page_map->pgmap : NULL; 295 return page_map ? &page_map->pgmap : NULL;
273} 296}
274 297
@@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
293void *devm_memremap_pages(struct device *dev, struct resource *res, 316void *devm_memremap_pages(struct device *dev, struct resource *res,
294 struct percpu_ref *ref, struct vmem_altmap *altmap) 317 struct percpu_ref *ref, struct vmem_altmap *altmap)
295{ 318{
296 resource_size_t key, align_start, align_size, align_end; 319 resource_size_t align_start, align_size, align_end;
320 unsigned long pfn, pgoff, order;
297 pgprot_t pgprot = PAGE_KERNEL; 321 pgprot_t pgprot = PAGE_KERNEL;
298 struct dev_pagemap *pgmap; 322 struct dev_pagemap *pgmap;
299 struct page_map *page_map; 323 struct page_map *page_map;
300 int error, nid, is_ram; 324 int error, nid, is_ram;
301 unsigned long pfn;
302 325
303 align_start = res->start & ~(SECTION_SIZE - 1); 326 align_start = res->start & ~(SECTION_SIZE - 1);
304 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 327 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
337 mutex_lock(&pgmap_lock); 360 mutex_lock(&pgmap_lock);
338 error = 0; 361 error = 0;
339 align_end = align_start + align_size - 1; 362 align_end = align_start + align_size - 1;
340 for (key = align_start; key <= align_end; key += SECTION_SIZE) { 363
364 foreach_order_pgoff(res, order, pgoff) {
341 struct dev_pagemap *dup; 365 struct dev_pagemap *dup;
342 366
343 rcu_read_lock(); 367 rcu_read_lock();
344 dup = find_dev_pagemap(key); 368 dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
345 rcu_read_unlock(); 369 rcu_read_unlock();
346 if (dup) { 370 if (dup) {
347 dev_err(dev, "%s: %pr collides with mapping for %s\n", 371 dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
349 error = -EBUSY; 373 error = -EBUSY;
350 break; 374 break;
351 } 375 }
352 error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, 376 error = __radix_tree_insert(&pgmap_radix,
353 page_map); 377 PHYS_PFN(res->start) + pgoff, order, page_map);
354 if (error) { 378 if (error) {
355 dev_err(dev, "%s: failed: %d\n", __func__, error); 379 dev_err(dev, "%s: failed: %d\n", __func__, error);
356 break; 380 break;
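
The order_at()/foreach_order_pgoff() helpers introduced above carve a resource into the largest naturally aligned power-of-two chunks, so devm_memremap_pages() can store one multi-order radix entry per chunk instead of one entry per memory section. A stand-alone sketch of the same arithmetic, with PHYS_PFN() and find_first_bit() replaced by plain integer math and GCC builtins (the sample resource values are made up for illustration):

/* Hypothetical user-space restatement of order_at()/foreach_order_pgoff(). */
#include <limits.h>
#include <stdio.h>

static unsigned long rounddown_pow_of_two(unsigned long n)
{
	return 1UL << (sizeof(n) * 8 - 1 - __builtin_clzl(n));
}

static unsigned long order_at(unsigned long start_pfn, unsigned long nr_pages,
			      unsigned long pgoff)
{
	unsigned long phys_pgoff = start_pfn + pgoff;
	unsigned long mask;

	if (pgoff == nr_pages)
		return ULONG_MAX;	/* past the end of the resource */

	/* largest order allowed by both the pfn alignment and the remaining size */
	mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
	if (!mask)
		return ULONG_MAX;

	return __builtin_ctzl(mask);	/* index of the lowest set bit == order */
}

int main(void)
{
	unsigned long start_pfn = 0x80200, nr_pages = 0x1c00, pgoff, order;

	/* same shape as foreach_order_pgoff(): maximal, non-overlapping chunks */
	for (pgoff = 0, order = order_at(start_pfn, nr_pages, pgoff);
	     order != ULONG_MAX;
	     pgoff += 1UL << order, order = order_at(start_pfn, nr_pages, pgoff))
		printf("pgoff %#lx: order %lu\n", pgoff, order);
	return 0;
}
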
diff --git a/mm/Kconfig b/mm/Kconfig
index 48b1af447fa7..0ded10a22639 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -678,6 +678,7 @@ config ZONE_DEVICE
678 depends on MEMORY_HOTREMOVE 678 depends on MEMORY_HOTREMOVE
679 depends on SPARSEMEM_VMEMMAP 679 depends on SPARSEMEM_VMEMMAP
680 depends on ARCH_HAS_ZONE_DEVICE 680 depends on ARCH_HAS_ZONE_DEVICE
681 select RADIX_TREE_MULTIORDER
681 682
682 help 683 help
683 Device memory hotplug support allows for establishing pmem, 684 Device memory hotplug support allows for establishing pmem,
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e01cb6e5173..9d21afd692b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
130 return -EEXIST; 130 return -EEXIST;
131 131
132 mapping->nrexceptional--; 132 mapping->nrexceptional--;
133 if (!dax_mapping(mapping)) { 133 if (shadowp)
134 if (shadowp) 134 *shadowp = p;
135 *shadowp = p;
136 } else {
137 /* DAX can replace empty locked entry with a hole */
138 WARN_ON_ONCE(p !=
139 dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
140 /* Wakeup waiters for exceptional entry lock */
141 dax_wake_mapping_entry_waiter(mapping, page->index, p,
142 true);
143 }
144 } 135 }
145 __radix_tree_replace(&mapping->page_tree, node, slot, page, 136 __radix_tree_replace(&mapping->page_tree, node, slot, page,
146 workingset_update_node, mapping); 137 workingset_update_node, mapping);
@@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping,
402{ 393{
403 pgoff_t index = start_byte >> PAGE_SHIFT; 394 pgoff_t index = start_byte >> PAGE_SHIFT;
404 pgoff_t end = end_byte >> PAGE_SHIFT; 395 pgoff_t end = end_byte >> PAGE_SHIFT;
405 struct pagevec pvec; 396 struct page *page;
406 bool ret;
407 397
408 if (end_byte < start_byte) 398 if (end_byte < start_byte)
409 return false; 399 return false;
@@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping,
411 if (mapping->nrpages == 0) 401 if (mapping->nrpages == 0)
412 return false; 402 return false;
413 403
414 pagevec_init(&pvec, 0); 404 if (!find_get_pages_range(mapping, &index, end, 1, &page))
415 if (!pagevec_lookup(&pvec, mapping, index, 1))
416 return false; 405 return false;
417 ret = (pvec.pages[0]->index <= end); 406 put_page(page);
418 pagevec_release(&pvec); 407 return true;
419 return ret;
420} 408}
421EXPORT_SYMBOL(filemap_range_has_page); 409EXPORT_SYMBOL(filemap_range_has_page);
422 410
@@ -1564,23 +1552,29 @@ export:
1564} 1552}
1565 1553
1566/** 1554/**
1567 * find_get_pages - gang pagecache lookup 1555 * find_get_pages_range - gang pagecache lookup
1568 * @mapping: The address_space to search 1556 * @mapping: The address_space to search
1569 * @start: The starting page index 1557 * @start: The starting page index
1558 * @end: The final page index (inclusive)
1570 * @nr_pages: The maximum number of pages 1559 * @nr_pages: The maximum number of pages
1571 * @pages: Where the resulting pages are placed 1560 * @pages: Where the resulting pages are placed
1572 * 1561 *
1573 * find_get_pages() will search for and return a group of up to 1562 * find_get_pages_range() will search for and return a group of up to @nr_pages
1574 * @nr_pages pages in the mapping. The pages are placed at @pages. 1563 * pages in the mapping starting at index @start and up to index @end
1575 * find_get_pages() takes a reference against the returned pages. 1564 * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
1565 * a reference against the returned pages.
1576 * 1566 *
1577 * The search returns a group of mapping-contiguous pages with ascending 1567 * The search returns a group of mapping-contiguous pages with ascending
1578 * indexes. There may be holes in the indices due to not-present pages. 1568 * indexes. There may be holes in the indices due to not-present pages.
1569 * We also update @start to index the next page for the traversal.
1579 * 1570 *
1580 * find_get_pages() returns the number of pages which were found. 1571 * find_get_pages_range() returns the number of pages which were found. If this
1572 * number is smaller than @nr_pages, the end of specified range has been
1573 * reached.
1581 */ 1574 */
1582unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 1575unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
1583 unsigned int nr_pages, struct page **pages) 1576 pgoff_t end, unsigned int nr_pages,
1577 struct page **pages)
1584{ 1578{
1585 struct radix_tree_iter iter; 1579 struct radix_tree_iter iter;
1586 void **slot; 1580 void **slot;
@@ -1590,8 +1584,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
1590 return 0; 1584 return 0;
1591 1585
1592 rcu_read_lock(); 1586 rcu_read_lock();
1593 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1587 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
1594 struct page *head, *page; 1588 struct page *head, *page;
1589
1590 if (iter.index > end)
1591 break;
1595repeat: 1592repeat:
1596 page = radix_tree_deref_slot(slot); 1593 page = radix_tree_deref_slot(slot);
1597 if (unlikely(!page)) 1594 if (unlikely(!page))
@@ -1627,11 +1624,25 @@ repeat:
1627 } 1624 }
1628 1625
1629 pages[ret] = page; 1626 pages[ret] = page;
1630 if (++ret == nr_pages) 1627 if (++ret == nr_pages) {
1631 break; 1628 *start = pages[ret - 1]->index + 1;
1629 goto out;
1630 }
1632 } 1631 }
1633 1632
1633 /*
1634 * We come here when there is no page beyond @end. We take care to not
1635 * overflow the index @start as it confuses some of the callers. This
 1636 * breaks the iteration when there is a page at index -1 but that is
1637 * already broken anyway.
1638 */
1639 if (end == (pgoff_t)-1)
1640 *start = (pgoff_t)-1;
1641 else
1642 *start = end + 1;
1643out:
1634 rcu_read_unlock(); 1644 rcu_read_unlock();
1645
1635 return ret; 1646 return ret;
1636} 1647}
1637 1648
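
Because find_get_pages_range() now advances @start past the last page it returned, batched walkers no longer need to recompute the next index from the returned pages. A hypothetical in-kernel caller sketch (walk_range() and the batch size are invented here; the real callers in this series go through the pagevec helpers):

/*
 * Sketch only: walk the page cache over [index, end] in batches;
 * the helper advances 'index' itself.
 */
static void walk_range(struct address_space *mapping, pgoff_t index, pgoff_t end)
{
	struct page *pages[16];
	unsigned int i, nr;

	while ((nr = find_get_pages_range(mapping, &index, end,
					  ARRAY_SIZE(pages), pages))) {
		for (i = 0; i < nr; i++) {
			/* ... inspect pages[i] ... */
			put_page(pages[i]);
		}
		/* a short batch means the end of the range was reached */
		if (nr < ARRAY_SIZE(pages))
			break;
	}
}
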
diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..33d651deeae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1352} 1352}
1353#endif /* __HAVE_ARCH_PTE_SPECIAL */ 1353#endif /* __HAVE_ARCH_PTE_SPECIAL */
1354 1354
1355#ifdef __HAVE_ARCH_PTE_DEVMAP 1355#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1356static int __gup_device_huge(unsigned long pfn, unsigned long addr, 1356static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1357 unsigned long end, struct page **pages, int *nr) 1357 unsigned long end, struct page **pages, int *nr)
1358{ 1358{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3644ff918434..0b51e70e0a8b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = {
328 NULL, 328 NULL,
329}; 329};
330 330
331static struct attribute_group hugepage_attr_group = { 331static const struct attribute_group hugepage_attr_group = {
332 .attrs = hugepage_attr, 332 .attrs = hugepage_attr,
333}; 333};
334 334
@@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
567 goto release; 567 goto release;
568 } 568 }
569 569
570 clear_huge_page(page, haddr, HPAGE_PMD_NR); 570 clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
571 /* 571 /*
572 * The memory barrier inside __SetPageUptodate makes sure that 572 * The memory barrier inside __SetPageUptodate makes sure that
573 * clear_huge_page writes become visible before the set_pmd_at() 573 * clear_huge_page writes become visible before the set_pmd_at()
@@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1240 * We can only reuse the page if nobody else maps the huge page or it's 1240 * We can only reuse the page if nobody else maps the huge page or it's
1241 * part. 1241 * part.
1242 */ 1242 */
1243 if (page_trans_huge_mapcount(page, NULL) == 1) { 1243 if (!trylock_page(page)) {
1244 get_page(page);
1245 spin_unlock(vmf->ptl);
1246 lock_page(page);
1247 spin_lock(vmf->ptl);
1248 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1249 unlock_page(page);
1250 put_page(page);
1251 goto out_unlock;
1252 }
1253 put_page(page);
1254 }
1255 if (reuse_swap_page(page, NULL)) {
1244 pmd_t entry; 1256 pmd_t entry;
1245 entry = pmd_mkyoung(orig_pmd); 1257 entry = pmd_mkyoung(orig_pmd);
1246 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1258 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1247 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) 1259 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1248 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1260 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1249 ret |= VM_FAULT_WRITE; 1261 ret |= VM_FAULT_WRITE;
1262 unlock_page(page);
1250 goto out_unlock; 1263 goto out_unlock;
1251 } 1264 }
1265 unlock_page(page);
1252 get_page(page); 1266 get_page(page);
1253 spin_unlock(vmf->ptl); 1267 spin_unlock(vmf->ptl);
1254alloc: 1268alloc:
@@ -1291,7 +1305,7 @@ alloc:
1291 count_vm_event(THP_FAULT_ALLOC); 1305 count_vm_event(THP_FAULT_ALLOC);
1292 1306
1293 if (!page) 1307 if (!page)
1294 clear_huge_page(new_page, haddr, HPAGE_PMD_NR); 1308 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
1295 else 1309 else
1296 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1310 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
1297 __SetPageUptodate(new_page); 1311 __SetPageUptodate(new_page);
@@ -2467,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2467 VM_BUG_ON_PAGE(!PageLocked(page), page); 2481 VM_BUG_ON_PAGE(!PageLocked(page), page);
2468 VM_BUG_ON_PAGE(!PageCompound(page), page); 2482 VM_BUG_ON_PAGE(!PageCompound(page), page);
2469 2483
2484 if (PageWriteback(page))
2485 return -EBUSY;
2486
2470 if (PageAnon(head)) { 2487 if (PageAnon(head)) {
2471 /* 2488 /*
2472 * The caller does not necessarily hold an mmap_sem that would 2489 * The caller does not necessarily hold an mmap_sem that would
@@ -2544,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2544 __dec_node_page_state(page, NR_SHMEM_THPS); 2561 __dec_node_page_state(page, NR_SHMEM_THPS);
2545 spin_unlock(&pgdata->split_queue_lock); 2562 spin_unlock(&pgdata->split_queue_lock);
2546 __split_huge_page(page, list, flags); 2563 __split_huge_page(page, list, flags);
2547 ret = 0; 2564 if (PageSwapCache(head)) {
2565 swp_entry_t entry = { .val = page_private(head) };
2566
2567 ret = split_swap_cluster(entry);
2568 } else
2569 ret = 0;
2548 } else { 2570 } else {
2549 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 2571 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
2550 pr_alert("total_mapcount: %u, page_count(): %u\n", 2572 pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31e207cb399b..34625b257128 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order)
1066} 1066}
1067 1067
1068static int __alloc_gigantic_page(unsigned long start_pfn, 1068static int __alloc_gigantic_page(unsigned long start_pfn,
1069 unsigned long nr_pages) 1069 unsigned long nr_pages, gfp_t gfp_mask)
1070{ 1070{
1071 unsigned long end_pfn = start_pfn + nr_pages; 1071 unsigned long end_pfn = start_pfn + nr_pages;
1072 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 1072 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
1073 GFP_KERNEL); 1073 gfp_mask);
1074} 1074}
1075 1075
1076static bool pfn_range_valid_gigantic(struct zone *z, 1076static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone,
1108 return zone_spans_pfn(zone, last_pfn); 1108 return zone_spans_pfn(zone, last_pfn);
1109} 1109}
1110 1110
1111static struct page *alloc_gigantic_page(int nid, unsigned int order) 1111static struct page *alloc_gigantic_page(int nid, struct hstate *h)
1112{ 1112{
1113 unsigned int order = huge_page_order(h);
1113 unsigned long nr_pages = 1 << order; 1114 unsigned long nr_pages = 1 << order;
1114 unsigned long ret, pfn, flags; 1115 unsigned long ret, pfn, flags;
1115 struct zone *z; 1116 struct zonelist *zonelist;
1117 struct zone *zone;
1118 struct zoneref *z;
1119 gfp_t gfp_mask;
1116 1120
1117 z = NODE_DATA(nid)->node_zones; 1121 gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
1118 for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { 1122 zonelist = node_zonelist(nid, gfp_mask);
1119 spin_lock_irqsave(&z->lock, flags); 1123 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
1124 spin_lock_irqsave(&zone->lock, flags);
1120 1125
1121 pfn = ALIGN(z->zone_start_pfn, nr_pages); 1126 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
1122 while (zone_spans_last_pfn(z, pfn, nr_pages)) { 1127 while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
1123 if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { 1128 if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
1124 /* 1129 /*
1125 * We release the zone lock here because 1130 * We release the zone lock here because
1126 * alloc_contig_range() will also lock the zone 1131 * alloc_contig_range() will also lock the zone
@@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
1128 * spinning on this lock, it may win the race 1133 * spinning on this lock, it may win the race
1129 * and cause alloc_contig_range() to fail... 1134 * and cause alloc_contig_range() to fail...
1130 */ 1135 */
1131 spin_unlock_irqrestore(&z->lock, flags); 1136 spin_unlock_irqrestore(&zone->lock, flags);
1132 ret = __alloc_gigantic_page(pfn, nr_pages); 1137 ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
1133 if (!ret) 1138 if (!ret)
1134 return pfn_to_page(pfn); 1139 return pfn_to_page(pfn);
1135 spin_lock_irqsave(&z->lock, flags); 1140 spin_lock_irqsave(&zone->lock, flags);
1136 } 1141 }
1137 pfn += nr_pages; 1142 pfn += nr_pages;
1138 } 1143 }
1139 1144
1140 spin_unlock_irqrestore(&z->lock, flags); 1145 spin_unlock_irqrestore(&zone->lock, flags);
1141 } 1146 }
1142 1147
1143 return NULL; 1148 return NULL;
@@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
1150{ 1155{
1151 struct page *page; 1156 struct page *page;
1152 1157
1153 page = alloc_gigantic_page(nid, huge_page_order(h)); 1158 page = alloc_gigantic_page(nid, h);
1154 if (page) { 1159 if (page) {
1155 prep_compound_gigantic_page(page, huge_page_order(h)); 1160 prep_compound_gigantic_page(page, huge_page_order(h));
1156 prep_new_huge_page(h, page, nid); 1161 prep_new_huge_page(h, page, nid);
@@ -2569,13 +2574,13 @@ static struct attribute *hstate_attrs[] = {
2569 NULL, 2574 NULL,
2570}; 2575};
2571 2576
2572static struct attribute_group hstate_attr_group = { 2577static const struct attribute_group hstate_attr_group = {
2573 .attrs = hstate_attrs, 2578 .attrs = hstate_attrs,
2574}; 2579};
2575 2580
2576static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 2581static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
2577 struct kobject **hstate_kobjs, 2582 struct kobject **hstate_kobjs,
2578 struct attribute_group *hstate_attr_group) 2583 const struct attribute_group *hstate_attr_group)
2579{ 2584{
2580 int retval; 2585 int retval;
2581 int hi = hstate_index(h); 2586 int hi = hstate_index(h);
@@ -2633,7 +2638,7 @@ static struct attribute *per_node_hstate_attrs[] = {
2633 NULL, 2638 NULL,
2634}; 2639};
2635 2640
2636static struct attribute_group per_node_hstate_attr_group = { 2641static const struct attribute_group per_node_hstate_attr_group = {
2637 .attrs = per_node_hstate_attrs, 2642 .attrs = per_node_hstate_attrs,
2638}; 2643};
2639 2644
@@ -4600,6 +4605,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
4600 return pte; 4605 return pte;
4601} 4606}
4602 4607
4608/*
4609 * huge_pte_offset() - Walk the page table to resolve the hugepage
4610 * entry at address @addr
4611 *
4612 * Return: Pointer to page table or swap entry (PUD or PMD) for
4613 * address @addr, or NULL if a p*d_none() entry is encountered and the
4614 * size @sz doesn't match the hugepage size at this level of the page
4615 * table.
4616 */
4603pte_t *huge_pte_offset(struct mm_struct *mm, 4617pte_t *huge_pte_offset(struct mm_struct *mm,
4604 unsigned long addr, unsigned long sz) 4618 unsigned long addr, unsigned long sz)
4605{ 4619{
@@ -4614,13 +4628,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
4614 p4d = p4d_offset(pgd, addr); 4628 p4d = p4d_offset(pgd, addr);
4615 if (!p4d_present(*p4d)) 4629 if (!p4d_present(*p4d))
4616 return NULL; 4630 return NULL;
4631
4617 pud = pud_offset(p4d, addr); 4632 pud = pud_offset(p4d, addr);
4618 if (!pud_present(*pud)) 4633 if (sz != PUD_SIZE && pud_none(*pud))
4619 return NULL; 4634 return NULL;
4620 if (pud_huge(*pud)) 4635 /* hugepage or swap? */
4636 if (pud_huge(*pud) || !pud_present(*pud))
4621 return (pte_t *)pud; 4637 return (pte_t *)pud;
4638
4622 pmd = pmd_offset(pud, addr); 4639 pmd = pmd_offset(pud, addr);
4623 return (pte_t *) pmd; 4640 if (sz != PMD_SIZE && pmd_none(*pmd))
4641 return NULL;
4642 /* hugepage or swap? */
4643 if (pmd_huge(*pmd) || !pmd_present(*pmd))
4644 return (pte_t *)pmd;
4645
4646 return NULL;
4624} 4647}
4625 4648
4626#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 4649#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..1df011f62480 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
480/* Mask to get the watermark bits */ 480/* Mask to get the watermark bits */
481#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) 481#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
482 482
483/*
484 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
485 * cannot assume a reduced access to memory reserves is sufficient for
486 * !MMU
487 */
488#ifdef CONFIG_MMU
489#define ALLOC_OOM 0x08
490#else
491#define ALLOC_OOM ALLOC_NO_WATERMARKS
492#endif
493
483#define ALLOC_HARDER 0x10 /* try to alloc harder */ 494#define ALLOC_HARDER 0x10 /* try to alloc harder */
484#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 495#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
485#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 496#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -525,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
525 return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; 536 return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
526} 537}
527 538
539void setup_zone_pageset(struct zone *zone);
528#endif /* __MM_INTERNAL_H */ 540#endif /* __MM_INTERNAL_H */
diff --git a/mm/ksm.c b/mm/ksm.c
index db20f8436bc3..15dd7415f7b3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = {
3043 NULL, 3043 NULL,
3044}; 3044};
3045 3045
3046static struct attribute_group ksm_attr_group = { 3046static const struct attribute_group ksm_attr_group = {
3047 .attrs = ksm_attrs, 3047 .attrs = ksm_attrs,
3048 .name = "ksm", 3048 .name = "ksm",
3049}; 3049};
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d7d1e5ddba9..eea1c733286f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
80 } 80 }
81 new_flags &= ~VM_DONTCOPY; 81 new_flags &= ~VM_DONTCOPY;
82 break; 82 break;
83 case MADV_WIPEONFORK:
84 /* MADV_WIPEONFORK is only supported on anonymous memory. */
85 if (vma->vm_file || vma->vm_flags & VM_SHARED) {
86 error = -EINVAL;
87 goto out;
88 }
89 new_flags |= VM_WIPEONFORK;
90 break;
91 case MADV_KEEPONFORK:
92 new_flags &= ~VM_WIPEONFORK;
93 break;
83 case MADV_DONTDUMP: 94 case MADV_DONTDUMP:
84 new_flags |= VM_DONTDUMP; 95 new_flags |= VM_DONTDUMP;
85 break; 96 break;
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior)
696#endif 707#endif
697 case MADV_DONTDUMP: 708 case MADV_DONTDUMP:
698 case MADV_DODUMP: 709 case MADV_DODUMP:
710 case MADV_WIPEONFORK:
711 case MADV_KEEPONFORK:
699#ifdef CONFIG_MEMORY_FAILURE 712#ifdef CONFIG_MEMORY_FAILURE
700 case MADV_SOFT_OFFLINE: 713 case MADV_SOFT_OFFLINE:
701 case MADV_HWPOISON: 714 case MADV_HWPOISON:
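
A minimal user-space demonstration of the new advice: after fork(), the child sees zero-filled memory in a MADV_WIPEONFORK'd private anonymous mapping while the parent keeps its contents. The fallback value 18 mirrors the generic uapi header added in this series; prefer the system header when it already defines the constant.

/* Hypothetical demo; libc headers may not yet know the flag. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_WIPEONFORK
#define MADV_WIPEONFORK 18	/* value from the generic uapi mman header */
#endif

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	strcpy(p, "secret");
	if (madvise(p, len, MADV_WIPEONFORK))
		perror("madvise");	/* older kernels reject the advice */

	if (fork() == 0) {
		printf("child : '%s'\n", p);	/* prints '' - wiped on fork */
		_exit(0);
	}
	wait(NULL);
	printf("parent: '%s'\n", p);		/* still prints 'secret' */
	return 0;
}
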
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e09741af816f..ad15850ee157 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
550 * value, and reading all cpu value can be performance bottleneck in some 550 * value, and reading all cpu value can be performance bottleneck in some
551 * common workload, threshold and synchronization as vmstat[] should be 551 * common workload, threshold and synchronization as vmstat[] should be
552 * implemented. 552 * implemented.
553 *
554 * The parameter idx can be of type enum memcg_event_item or vm_event_item.
553 */ 555 */
554 556
555static unsigned long memcg_sum_events(struct mem_cgroup *memcg, 557static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
556 enum memcg_event_item event) 558 int event)
557{ 559{
558 unsigned long val = 0; 560 unsigned long val = 0;
559 int cpu; 561 int cpu;
@@ -1915,7 +1917,7 @@ retry:
1915 * bypass the last charges so that they can exit quickly and 1917 * bypass the last charges so that they can exit quickly and
1916 * free their memory. 1918 * free their memory.
1917 */ 1919 */
1918 if (unlikely(test_thread_flag(TIF_MEMDIE) || 1920 if (unlikely(tsk_is_oom_victim(current) ||
1919 fatal_signal_pending(current) || 1921 fatal_signal_pending(current) ||
1920 current->flags & PF_EXITING)) 1922 current->flags & PF_EXITING))
1921 goto force; 1923 goto force;
@@ -4319,6 +4321,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4319 } 4321 }
4320 spin_unlock(&memcg->event_list_lock); 4322 spin_unlock(&memcg->event_list_lock);
4321 4323
4324 memcg->low = 0;
4325
4322 memcg_offline_kmem(memcg); 4326 memcg_offline_kmem(memcg);
4323 wb_memcg_offline(memcg); 4327 wb_memcg_offline(memcg);
4324 4328
@@ -4635,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4635 if (!ret || !target) 4639 if (!ret || !target)
4636 put_page(page); 4640 put_page(page);
4637 } 4641 }
4638 /* There is a swap entry and a page doesn't exist or isn't charged */ 4642 /*
4639 if (ent.val && !ret && 4643 * There is a swap entry and a page doesn't exist or isn't charged.
4644 * But we cannot move a tail-page in a THP.
4645 */
4646 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
4640 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4647 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4641 ret = MC_TARGET_SWAP; 4648 ret = MC_TARGET_SWAP;
4642 if (target) 4649 if (target)
@@ -4647,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4647 4654
4648#ifdef CONFIG_TRANSPARENT_HUGEPAGE 4655#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4649/* 4656/*
4650 * We don't consider swapping or file mapped pages because THP does not 4657 * We don't consider PMD mapped swapping or file mapped pages because THP does
4651 * support them for now. 4658 * not support them for now.
4652 * Caller should make sure that pmd_trans_huge(pmd) is true. 4659 * Caller should make sure that pmd_trans_huge(pmd) is true.
4653 */ 4660 */
4654static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4661static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -5423,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5423 * in turn serializes uncharging. 5430 * in turn serializes uncharging.
5424 */ 5431 */
5425 VM_BUG_ON_PAGE(!PageLocked(page), page); 5432 VM_BUG_ON_PAGE(!PageLocked(page), page);
5426 if (page->mem_cgroup) 5433 if (compound_head(page)->mem_cgroup)
5427 goto out; 5434 goto out;
5428 5435
5429 if (do_swap_account) { 5436 if (do_swap_account) {
@@ -5906,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
5906void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5913void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5907{ 5914{
5908 struct mem_cgroup *memcg, *swap_memcg; 5915 struct mem_cgroup *memcg, *swap_memcg;
5916 unsigned int nr_entries;
5909 unsigned short oldid; 5917 unsigned short oldid;
5910 5918
5911 VM_BUG_ON_PAGE(PageLRU(page), page); 5919 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5926,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5926 * ancestor for the swap instead and transfer the memory+swap charge. 5934 * ancestor for the swap instead and transfer the memory+swap charge.
5927 */ 5935 */
5928 swap_memcg = mem_cgroup_id_get_online(memcg); 5936 swap_memcg = mem_cgroup_id_get_online(memcg);
5929 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); 5937 nr_entries = hpage_nr_pages(page);
5938 /* Get references for the tail pages, too */
5939 if (nr_entries > 1)
5940 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
5941 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
5942 nr_entries);
5930 VM_BUG_ON_PAGE(oldid, page); 5943 VM_BUG_ON_PAGE(oldid, page);
5931 mem_cgroup_swap_statistics(swap_memcg, 1); 5944 mem_cgroup_swap_statistics(swap_memcg, nr_entries);
5932 5945
5933 page->mem_cgroup = NULL; 5946 page->mem_cgroup = NULL;
5934 5947
5935 if (!mem_cgroup_is_root(memcg)) 5948 if (!mem_cgroup_is_root(memcg))
5936 page_counter_uncharge(&memcg->memory, 1); 5949 page_counter_uncharge(&memcg->memory, nr_entries);
5937 5950
5938 if (memcg != swap_memcg) { 5951 if (memcg != swap_memcg) {
5939 if (!mem_cgroup_is_root(swap_memcg)) 5952 if (!mem_cgroup_is_root(swap_memcg))
5940 page_counter_charge(&swap_memcg->memsw, 1); 5953 page_counter_charge(&swap_memcg->memsw, nr_entries);
5941 page_counter_uncharge(&memcg->memsw, 1); 5954 page_counter_uncharge(&memcg->memsw, nr_entries);
5942 } 5955 }
5943 5956
5944 /* 5957 /*
@@ -5948,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 5948 * only synchronisation we have for updating the per-CPU variables. 5961
5949 */ 5962 */
5950 VM_BUG_ON(!irqs_disabled()); 5963 VM_BUG_ON(!irqs_disabled());
5951 mem_cgroup_charge_statistics(memcg, page, false, -1); 5964 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
5965 -nr_entries);
5952 memcg_check_events(memcg, page); 5966 memcg_check_events(memcg, page);
5953 5967
5954 if (!mem_cgroup_is_root(memcg)) 5968 if (!mem_cgroup_is_root(memcg))
diff --git a/mm/memory.c b/mm/memory.c
index 56e48e4593cb..13ee83b43878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1513,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1513 tlb_gather_mmu(&tlb, mm, start, end); 1513 tlb_gather_mmu(&tlb, mm, start, end);
1514 update_hiwater_rss(mm); 1514 update_hiwater_rss(mm);
1515 mmu_notifier_invalidate_range_start(mm, start, end); 1515 mmu_notifier_invalidate_range_start(mm, start, end);
1516 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1516 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
1517 unmap_single_vma(&tlb, vma, start, end, NULL); 1517 unmap_single_vma(&tlb, vma, start, end, NULL);
1518
1519 /*
1520 * zap_page_range does not specify whether mmap_sem should be
1521 * held for read or write. That allows parallel zap_page_range
1522 * operations to unmap a PTE and defer a flush meaning that
1523 * this call observes pte_none and fails to flush the TLB.
1524 * Rather than adding a complex API, ensure that no stale
1525 * TLB entries exist when this call returns.
1526 */
1527 flush_tlb_range(vma, start, end);
1528 }
1529
1518 mmu_notifier_invalidate_range_end(mm, start, end); 1530 mmu_notifier_invalidate_range_end(mm, start, end);
1519 tlb_finish_mmu(&tlb, start, end); 1531 tlb_finish_mmu(&tlb, start, end);
1520} 1532}
@@ -1676,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1676EXPORT_SYMBOL(vm_insert_page); 1688EXPORT_SYMBOL(vm_insert_page);
1677 1689
1678static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1690static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1679 pfn_t pfn, pgprot_t prot) 1691 pfn_t pfn, pgprot_t prot, bool mkwrite)
1680{ 1692{
1681 struct mm_struct *mm = vma->vm_mm; 1693 struct mm_struct *mm = vma->vm_mm;
1682 int retval; 1694 int retval;
@@ -1688,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1688 if (!pte) 1700 if (!pte)
1689 goto out; 1701 goto out;
1690 retval = -EBUSY; 1702 retval = -EBUSY;
1691 if (!pte_none(*pte)) 1703 if (!pte_none(*pte)) {
1692 goto out_unlock; 1704 if (mkwrite) {
1705 /*
1706 * For read faults on private mappings the PFN passed
1707 * in may not match the PFN we have mapped if the
1708 * mapped PFN is a writeable COW page. In the mkwrite
1709 * case we are creating a writable PTE for a shared
1710 * mapping and we expect the PFNs to match.
1711 */
1712 if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
1713 goto out_unlock;
1714 entry = *pte;
1715 goto out_mkwrite;
1716 } else
1717 goto out_unlock;
1718 }
1693 1719
1694 /* Ok, finally just insert the thing.. */ 1720 /* Ok, finally just insert the thing.. */
1695 if (pfn_t_devmap(pfn)) 1721 if (pfn_t_devmap(pfn))
1696 entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); 1722 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1697 else 1723 else
1698 entry = pte_mkspecial(pfn_t_pte(pfn, prot)); 1724 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1725
1726out_mkwrite:
1727 if (mkwrite) {
1728 entry = pte_mkyoung(entry);
1729 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1730 }
1731
1699 set_pte_at(mm, addr, pte, entry); 1732 set_pte_at(mm, addr, pte, entry);
1700 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1733 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1701 1734
@@ -1766,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1766 1799
1767 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); 1800 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1768 1801
1769 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); 1802 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1803 false);
1770 1804
1771 return ret; 1805 return ret;
1772} 1806}
1773EXPORT_SYMBOL(vm_insert_pfn_prot); 1807EXPORT_SYMBOL(vm_insert_pfn_prot);
1774 1808
1775int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1809static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1776 pfn_t pfn) 1810 pfn_t pfn, bool mkwrite)
1777{ 1811{
1778 pgprot_t pgprot = vma->vm_page_prot; 1812 pgprot_t pgprot = vma->vm_page_prot;
1779 1813
@@ -1802,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1802 page = pfn_to_page(pfn_t_to_pfn(pfn)); 1836 page = pfn_to_page(pfn_t_to_pfn(pfn));
1803 return insert_page(vma, addr, page, pgprot); 1837 return insert_page(vma, addr, page, pgprot);
1804 } 1838 }
1805 return insert_pfn(vma, addr, pfn, pgprot); 1839 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1840}
1841
1842int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1843 pfn_t pfn)
1844{
1845 return __vm_insert_mixed(vma, addr, pfn, false);
1846
1806} 1847}
1807EXPORT_SYMBOL(vm_insert_mixed); 1848EXPORT_SYMBOL(vm_insert_mixed);
1808 1849
1850int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
1851 pfn_t pfn)
1852{
1853 return __vm_insert_mixed(vma, addr, pfn, true);
1854}
1855EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
1856
1809/* 1857/*
1810 * maps a range of physical memory into the requested pages. the old 1858 * maps a range of physical memory into the requested pages. the old
1811 * mappings are removed. any references to nonexistent pages results 1859 * mappings are removed. any references to nonexistent pages results
@@ -2571,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf)
2571 * not dirty accountable. 2619 * not dirty accountable.
2572 */ 2620 */
2573 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { 2621 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2574 int total_mapcount; 2622 int total_map_swapcount;
2575 if (!trylock_page(vmf->page)) { 2623 if (!trylock_page(vmf->page)) {
2576 get_page(vmf->page); 2624 get_page(vmf->page);
2577 pte_unmap_unlock(vmf->pte, vmf->ptl); 2625 pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2586,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf)
2586 } 2634 }
2587 put_page(vmf->page); 2635 put_page(vmf->page);
2588 } 2636 }
2589 if (reuse_swap_page(vmf->page, &total_mapcount)) { 2637 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2590 if (total_mapcount == 1) { 2638 if (total_map_swapcount == 1) {
2591 /* 2639 /*
2592 * The page is all ours. Move it to 2640 * The page is all ours. Move it to
2593 * our anon_vma so the rmap code will 2641 * our anon_vma so the rmap code will
@@ -2704,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range);
2704int do_swap_page(struct vm_fault *vmf) 2752int do_swap_page(struct vm_fault *vmf)
2705{ 2753{
2706 struct vm_area_struct *vma = vmf->vma; 2754 struct vm_area_struct *vma = vmf->vma;
2707 struct page *page, *swapcache; 2755 struct page *page = NULL, *swapcache;
2708 struct mem_cgroup *memcg; 2756 struct mem_cgroup *memcg;
2757 struct vma_swap_readahead swap_ra;
2709 swp_entry_t entry; 2758 swp_entry_t entry;
2710 pte_t pte; 2759 pte_t pte;
2711 int locked; 2760 int locked;
2712 int exclusive = 0; 2761 int exclusive = 0;
2713 int ret = 0; 2762 int ret = 0;
2763 bool vma_readahead = swap_use_vma_readahead();
2714 2764
2715 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) 2765 if (vma_readahead)
2766 page = swap_readahead_detect(vmf, &swap_ra);
2767 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
2768 if (page)
2769 put_page(page);
2716 goto out; 2770 goto out;
2771 }
2717 2772
2718 entry = pte_to_swp_entry(vmf->orig_pte); 2773 entry = pte_to_swp_entry(vmf->orig_pte);
2719 if (unlikely(non_swap_entry(entry))) { 2774 if (unlikely(non_swap_entry(entry))) {
@@ -2729,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf)
2729 goto out; 2784 goto out;
2730 } 2785 }
2731 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2786 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2732 page = lookup_swap_cache(entry); 2787 if (!page)
2788 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
2789 vmf->address);
2733 if (!page) { 2790 if (!page) {
2734 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, 2791 if (vma_readahead)
2735 vmf->address); 2792 page = do_swap_page_readahead(entry,
2793 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
2794 else
2795 page = swapin_readahead(entry,
2796 GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2736 if (!page) { 2797 if (!page) {
2737 /* 2798 /*
2738 * Back out if somebody else faulted in this pte 2799 * Back out if somebody else faulted in this pte
@@ -4356,19 +4417,53 @@ static void clear_gigantic_page(struct page *page,
4356 } 4417 }
4357} 4418}
4358void clear_huge_page(struct page *page, 4419void clear_huge_page(struct page *page,
4359 unsigned long addr, unsigned int pages_per_huge_page) 4420 unsigned long addr_hint, unsigned int pages_per_huge_page)
4360{ 4421{
4361 int i; 4422 int i, n, base, l;
4423 unsigned long addr = addr_hint &
4424 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4362 4425
4363 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 4426 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4364 clear_gigantic_page(page, addr, pages_per_huge_page); 4427 clear_gigantic_page(page, addr, pages_per_huge_page);
4365 return; 4428 return;
4366 } 4429 }
4367 4430
 4431 /* Clear the target sub-page last to keep its cache lines hot */
4368 might_sleep(); 4432 might_sleep();
4369 for (i = 0; i < pages_per_huge_page; i++) { 4433 n = (addr_hint - addr) / PAGE_SIZE;
4434 if (2 * n <= pages_per_huge_page) {
4435 /* If sub-page to access in first half of huge page */
4436 base = 0;
4437 l = n;
4438 /* Clear sub-pages at the end of huge page */
4439 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4440 cond_resched();
4441 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4442 }
4443 } else {
4444 /* If sub-page to access in second half of huge page */
4445 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4446 l = pages_per_huge_page - n;
 4447 /* Clear sub-pages at the beginning of the huge page */
4448 for (i = 0; i < base; i++) {
4449 cond_resched();
4450 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4451 }
4452 }
4453 /*
4454 * Clear remaining sub-pages in left-right-left-right pattern
4455 * towards the sub-page to access
4456 */
4457 for (i = 0; i < l; i++) {
4458 int left_idx = base + i;
4459 int right_idx = base + 2 * l - 1 - i;
4460
4461 cond_resched();
4462 clear_user_highpage(page + left_idx,
4463 addr + left_idx * PAGE_SIZE);
4370 cond_resched(); 4464 cond_resched();
4371 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 4465 clear_user_highpage(page + right_idx,
4466 addr + right_idx * PAGE_SIZE);
4372 } 4467 }
4373} 4468}
4374 4469
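
The rewritten clear_huge_page() keeps the faulting sub-page's cache lines hot by clearing it last and converging on it from both sides. A small stand-alone sketch that only prints the clearing order (clear_order() is invented for illustration; the kernel clears real pages with clear_user_highpage()):

#include <stdio.h>

/* Print the order in which sub-pages 0..pages-1 are cleared for target n. */
static void clear_order(int pages_per_huge_page, int n)
{
	int i, base, l;

	if (2 * n <= pages_per_huge_page) {
		/* target in the first half: clear the tail of the huge page first */
		base = 0;
		l = n;
		for (i = pages_per_huge_page - 1; i >= 2 * n; i--)
			printf("%d ", i);
	} else {
		/* target in the second half: clear the head of the huge page first */
		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
		l = pages_per_huge_page - n;
		for (i = 0; i < base; i++)
			printf("%d ", i);
	}
	/* then alternate left/right, ending on the target sub-page n */
	for (i = 0; i < l; i++)
		printf("%d %d ", base + i, base + 2 * l - 1 - i);
	printf("\n");
}

int main(void)
{
	clear_order(8, 2);	/* 7 6 5 4 0 3 1 2 - sub-page 2 cleared last */
	clear_order(8, 6);	/* 0 1 2 3 4 7 5 6 - sub-page 6 cleared last */
	return 0;
}
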
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8dccc317aac2..73bf17df6899 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
773 node_set_state(node, N_MEMORY); 773 node_set_state(node, N_MEMORY);
774} 774}
775 775
776bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
777{
778 struct pglist_data *pgdat = NODE_DATA(nid);
779 struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
780 struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
781
782 /*
783 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
784 * physically before ZONE_MOVABLE. All we need is they do not
785 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
786 * though so let's stick with it for simplicity for now.
787 * TODO make sure we do not overlap with ZONE_DEVICE
788 */
789 if (online_type == MMOP_ONLINE_KERNEL) {
790 if (zone_is_empty(movable_zone))
791 return true;
792 return movable_zone->zone_start_pfn >= pfn + nr_pages;
793 } else if (online_type == MMOP_ONLINE_MOVABLE) {
794 return zone_end_pfn(default_zone) <= pfn;
795 }
796
797 /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
798 return online_type == MMOP_ONLINE_KEEP;
799}
800
801static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, 776static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
802 unsigned long nr_pages) 777 unsigned long nr_pages)
803{ 778{
@@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone,
856 * If no kernel zone covers this pfn range it will automatically go 831 * If no kernel zone covers this pfn range it will automatically go
857 * to the ZONE_NORMAL. 832 * to the ZONE_NORMAL.
858 */ 833 */
859struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, 834static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
860 unsigned long nr_pages) 835 unsigned long nr_pages)
861{ 836{
862 struct pglist_data *pgdat = NODE_DATA(nid); 837 struct pglist_data *pgdat = NODE_DATA(nid);
@@ -872,17 +847,40 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
872 return &pgdat->node_zones[ZONE_NORMAL]; 847 return &pgdat->node_zones[ZONE_NORMAL];
873} 848}
874 849
875static inline bool movable_pfn_range(int nid, struct zone *default_zone, 850static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
876 unsigned long start_pfn, unsigned long nr_pages) 851 unsigned long nr_pages)
877{ 852{
878 if (!allow_online_pfn_range(nid, start_pfn, nr_pages, 853 struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
879 MMOP_ONLINE_KERNEL)) 854 nr_pages);
880 return true; 855 struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
856 bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
857 bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
881 858
882 if (!movable_node_is_enabled()) 859 /*
883 return false; 860 * We inherit the existing zone in a simple case where zones do not
861 * overlap in the given range
862 */
863 if (in_kernel ^ in_movable)
864 return (in_kernel) ? kernel_zone : movable_zone;
884 865
885 return !zone_intersects(default_zone, start_pfn, nr_pages); 866 /*
867 * If the range doesn't belong to any zone or two zones overlap in the
868 * given range then we use movable zone only if movable_node is
869 * enabled because we always online to a kernel zone by default.
870 */
871 return movable_node_enabled ? movable_zone : kernel_zone;
872}
873
874struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
875 unsigned long nr_pages)
876{
877 if (online_type == MMOP_ONLINE_KERNEL)
878 return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
879
880 if (online_type == MMOP_ONLINE_MOVABLE)
881 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
882
883 return default_zone_for_pfn(nid, start_pfn, nr_pages);
886} 884}
887 885
888/* 886/*
@@ -892,28 +890,14 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone,
892static struct zone * __meminit move_pfn_range(int online_type, int nid, 890static struct zone * __meminit move_pfn_range(int online_type, int nid,
893 unsigned long start_pfn, unsigned long nr_pages) 891 unsigned long start_pfn, unsigned long nr_pages)
894{ 892{
895 struct pglist_data *pgdat = NODE_DATA(nid); 893 struct zone *zone;
896 struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
897
898 if (online_type == MMOP_ONLINE_KEEP) {
899 struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
900 /*
901 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
902 * movable zone if that is not possible (e.g. we are within
903 * or past the existing movable zone). movable_node overrides
904 * this default and defaults to movable zone
905 */
906 if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
907 zone = movable_zone;
908 } else if (online_type == MMOP_ONLINE_MOVABLE) {
909 zone = &pgdat->node_zones[ZONE_MOVABLE];
910 }
911 894
895 zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
912 move_pfn_range_to_zone(zone, start_pfn, nr_pages); 896 move_pfn_range_to_zone(zone, start_pfn, nr_pages);
913 return zone; 897 return zone;
914} 898}
915 899
916/* Must be protected by mem_hotplug_begin() */ 900/* Must be protected by mem_hotplug_begin() or a device_lock */
917int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 901int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
918{ 902{
919 unsigned long flags; 903 unsigned long flags;
@@ -925,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
925 struct memory_notify arg; 909 struct memory_notify arg;
926 910
927 nid = pfn_to_nid(pfn); 911 nid = pfn_to_nid(pfn);
928 if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
929 return -EINVAL;
930
931 /* associate pfn range with the zone */ 912 /* associate pfn range with the zone */
932 zone = move_pfn_range(online_type, nid, pfn, nr_pages); 913 zone = move_pfn_range(online_type, nid, pfn, nr_pages);
933 914
@@ -945,10 +926,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
945 * This means the page allocator ignores this zone. 926 * This means the page allocator ignores this zone.
946 * So, zonelist must be updated after online. 927 * So, zonelist must be updated after online.
947 */ 928 */
948 mutex_lock(&zonelists_mutex);
949 if (!populated_zone(zone)) { 929 if (!populated_zone(zone)) {
950 need_zonelists_rebuild = 1; 930 need_zonelists_rebuild = 1;
951 build_all_zonelists(NULL, zone); 931 setup_zone_pageset(zone);
952 } 932 }
953 933
954 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 934 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
@@ -956,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
956 if (ret) { 936 if (ret) {
957 if (need_zonelists_rebuild) 937 if (need_zonelists_rebuild)
958 zone_pcp_reset(zone); 938 zone_pcp_reset(zone);
959 mutex_unlock(&zonelists_mutex);
960 goto failed_addition; 939 goto failed_addition;
961 } 940 }
962 941
@@ -969,13 +948,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
969 if (onlined_pages) { 948 if (onlined_pages) {
970 node_states_set_node(nid, &arg); 949 node_states_set_node(nid, &arg);
971 if (need_zonelists_rebuild) 950 if (need_zonelists_rebuild)
972 build_all_zonelists(NULL, NULL); 951 build_all_zonelists(NULL);
973 else 952 else
974 zone_pcp_update(zone); 953 zone_pcp_update(zone);
975 } 954 }
976 955
977 mutex_unlock(&zonelists_mutex);
978
979 init_per_zone_wmark_min(); 956 init_per_zone_wmark_min();
980 957
981 if (onlined_pages) { 958 if (onlined_pages) {
@@ -1046,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1046 * The node we allocated has no zone fallback lists. For avoiding 1023 * The node we allocated has no zone fallback lists. For avoiding
1047 * to access not-initialized zonelist, build here. 1024 * to access not-initialized zonelist, build here.
1048 */ 1025 */
1049 mutex_lock(&zonelists_mutex); 1026 build_all_zonelists(pgdat);
1050 build_all_zonelists(pgdat, NULL);
1051 mutex_unlock(&zonelists_mutex);
1052 1027
1053 /* 1028 /*
1054 * zone->managed_pages is set to an approximate value in 1029 * zone->managed_pages is set to an approximate value in
@@ -1100,13 +1075,6 @@ int try_online_node(int nid)
1100 node_set_online(nid); 1075 node_set_online(nid);
1101 ret = register_one_node(nid); 1076 ret = register_one_node(nid);
1102 BUG_ON(ret); 1077 BUG_ON(ret);
1103
1104 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
1105 mutex_lock(&zonelists_mutex);
1106 build_all_zonelists(NULL, NULL);
1107 mutex_unlock(&zonelists_mutex);
1108 }
1109
1110out: 1078out:
1111 mem_hotplug_done(); 1079 mem_hotplug_done();
1112 return ret; 1080 return ret;
@@ -1722,9 +1690,7 @@ repeat:
1722 1690
1723 if (!populated_zone(zone)) { 1691 if (!populated_zone(zone)) {
1724 zone_pcp_reset(zone); 1692 zone_pcp_reset(zone);
1725 mutex_lock(&zonelists_mutex); 1693 build_all_zonelists(NULL);
1726 build_all_zonelists(NULL, NULL);
1727 mutex_unlock(&zonelists_mutex);
1728 } else 1694 } else
1729 zone_pcp_update(zone); 1695 zone_pcp_update(zone);
1730 1696
@@ -1750,7 +1716,7 @@ failed_removal:
1750 return ret; 1716 return ret;
1751} 1717}
1752 1718
1753/* Must be protected by mem_hotplug_begin() */ 1719/* Must be protected by mem_hotplug_begin() or a device_lock */
1754int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1720int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1755{ 1721{
1756 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1722 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
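
For the default MMOP_ONLINE_KEEP case, the zone_for_pfn_range() rework above reduces the removed movable_pfn_range()/allow_online_pfn_range() checks to a simple rule: inherit whichever zone already covers the range if exactly one does, otherwise online to a kernel zone unless movable_node is enabled. A stand-alone restatement with the zones collapsed to two labels (the kernel zone can really be DMA, DMA32 or NORMAL, chosen by default_kernel_zone_for_pfn()):

#include <stdbool.h>
#include <stdio.h>

/* Decide the target zone for MMOP_ONLINE_KEEP, reduced to a string label. */
static const char *keep_zone(bool in_kernel, bool in_movable, bool movable_node)
{
	/* exactly one existing zone intersects the range: inherit it */
	if (in_kernel ^ in_movable)
		return in_kernel ? "kernel zone" : "ZONE_MOVABLE";
	/* none or both intersect: kernel zone by default, movable if requested */
	return movable_node ? "ZONE_MOVABLE" : "kernel zone";
}

int main(void)
{
	printf("%s\n", keep_zone(true,  false, false));	/* kernel zone  */
	printf("%s\n", keep_zone(false, true,  false));	/* ZONE_MOVABLE */
	printf("%s\n", keep_zone(false, false, true));	/* ZONE_MOVABLE */
	return 0;
}
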
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..4c5981651407 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -44,6 +44,7 @@
44#include <linux/userfaultfd_k.h> 44#include <linux/userfaultfd_k.h>
45#include <linux/moduleparam.h> 45#include <linux/moduleparam.h>
46#include <linux/pkeys.h> 46#include <linux/pkeys.h>
47#include <linux/oom.h>
47 48
48#include <linux/uaccess.h> 49#include <linux/uaccess.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
@@ -2639,13 +2640,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2639 if (vma->vm_start >= end) 2640 if (vma->vm_start >= end)
2640 return 0; 2641 return 0;
2641 2642
2642 if (uf) {
2643 int error = userfaultfd_unmap_prep(vma, start, end, uf);
2644
2645 if (error)
2646 return error;
2647 }
2648
2649 /* 2643 /*
2650 * If we need to split any vma, do it now to save pain later. 2644 * If we need to split any vma, do it now to save pain later.
2651 * 2645 *
@@ -2679,6 +2673,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2679 } 2673 }
2680 vma = prev ? prev->vm_next : mm->mmap; 2674 vma = prev ? prev->vm_next : mm->mmap;
2681 2675
2676 if (unlikely(uf)) {
2677 /*
2678 * If userfaultfd_unmap_prep returns an error the vmas
 2679 * will remain split, but userland will get a
2680 * highly unexpected error anyway. This is no
2681 * different than the case where the first of the two
2682 * __split_vma fails, but we don't undo the first
 2683 * split, although we could. This is an unlikely enough
 2684 * failure that it's not worth optimizing for.
2685 */
2686 int error = userfaultfd_unmap_prep(vma, start, end, uf);
2687 if (error)
2688 return error;
2689 }
2690
2682 /* 2691 /*
2683 * unlock any mlock()ed ranges before detaching vmas 2692 * unlock any mlock()ed ranges before detaching vmas
2684 */ 2693 */
@@ -2993,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm)
2993 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 3002 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2994 unmap_vmas(&tlb, vma, 0, -1); 3003 unmap_vmas(&tlb, vma, 0, -1);
2995 3004
3005 set_bit(MMF_OOM_SKIP, &mm->flags);
3006 if (unlikely(tsk_is_oom_victim(current))) {
3007 /*
3008 * Wait for oom_reap_task() to stop working on this
3009 * mm. Because MMF_OOM_SKIP is already set before
3010 * calling down_read(), oom_reap_task() will not run
3011 * on this "mm" post up_write().
3012 *
3013 * tsk_is_oom_victim() cannot be set from under us
3014 * either because current->mm is already set to NULL
3015 * under task_lock before calling mmput and oom_mm is
3016 * set not NULL by the OOM killer only if current->mm
3017 * is found not NULL while holding the task_lock.
3018 */
3019 down_write(&mm->mmap_sem);
3020 up_write(&mm->mmap_sem);
3021 }
2996 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); 3022 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2997 tlb_finish_mmu(&tlb, 0, -1); 3023 tlb_finish_mmu(&tlb, 0, -1);
2998 3024
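
The MMF_OOM_SKIP store followed by the down_write();up_write() pair above is a pure barrier: it waits out any oom_reap_task() that is already inside the read-locked section, and every later reader sees the flag and backs off (the matching reader-side check is in the mm/oom_kill.c hunk below). A rough user-space analogue of the same pattern with a pthread rwlock (thread and flag names invented for illustration; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int skip;			/* stands in for MMF_OOM_SKIP */

static void *reaper(void *arg)
{
	pthread_rwlock_rdlock(&lock);
	if (atomic_load(&skip)) {	/* flag is checked under the read lock */
		pthread_rwlock_unlock(&lock);
		return NULL;
	}
	puts("reaping");		/* safe: teardown has not started yet */
	pthread_rwlock_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reaper, NULL);

	atomic_store(&skip, 1);		/* set the flag first ...              */
	pthread_rwlock_wrlock(&lock);	/* ... then wait out any active reader */
	pthread_rwlock_unlock(&lock);
	puts("tearing down");		/* no reaper can run past this point   */

	pthread_join(t, NULL);
	return 0;
}
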
@@ -3514,7 +3540,7 @@ static int init_user_reserve(void)
3514{ 3540{
3515 unsigned long free_kbytes; 3541 unsigned long free_kbytes;
3516 3542
3517 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3543 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3518 3544
3519 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3545 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3520 return 0; 3546 return 0;
@@ -3535,7 +3561,7 @@ static int init_admin_reserve(void)
3535{ 3561{
3536 unsigned long free_kbytes; 3562 unsigned long free_kbytes;
3537 3563
3538 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3564 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3539 3565
3540 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3566 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3541 return 0; 3567 return 0;
@@ -3579,7 +3605,7 @@ static int reserve_mem_notifier(struct notifier_block *nb,
3579 3605
3580 break; 3606 break;
3581 case MEM_OFFLINE: 3607 case MEM_OFFLINE:
3582 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3608 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3583 3609
3584 if (sysctl_user_reserve_kbytes > free_kbytes) { 3610 if (sysctl_user_reserve_kbytes > free_kbytes) {
3585 init_user_reserve(); 3611 init_user_reserve();
diff --git a/mm/mremap.c b/mm/mremap.c
index 3f23715d3c69..7395564daa6c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -384,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
384 if (!vma || vma->vm_start > addr) 384 if (!vma || vma->vm_start > addr)
385 return ERR_PTR(-EFAULT); 385 return ERR_PTR(-EFAULT);
386 386
387 /*
388 * !old_len is a special case where an attempt is made to 'duplicate'
389 * a mapping. This makes no sense for private mappings as it will
390 * instead create a fresh/new mapping unrelated to the original. This
391 * is contrary to the basic idea of mremap which creates new mappings
392 * based on the original. There are no known use cases for this
393 * behavior. As a result, fail such attempts.
394 */
395 if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
396 pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
397 return ERR_PTR(-EINVAL);
398 }
399
387 if (is_vm_hugetlb_page(vma)) 400 if (is_vm_hugetlb_page(vma))
388 return ERR_PTR(-EINVAL); 401 return ERR_PTR(-EINVAL);
389 402
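
User-visible effect of the vma_to_resize() check above: mremap() with old_size == 0 still duplicates a shared mapping, but now fails with EINVAL for a private one instead of silently handing back unrelated anonymous memory. A small demonstration:

#define _GNU_SOURCE		/* for mremap() and MREMAP_MAYMOVE */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void try_dup(int flags, const char *name)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       flags | MAP_ANONYMOUS, -1, 0);
	void *dup;

	if (p == MAP_FAILED)
		return;
	/* old_size == 0 asks for a second mapping of the same pages */
	dup = mremap(p, 0, len, MREMAP_MAYMOVE);
	printf("%s: %s\n", name,
	       dup == MAP_FAILED ? strerror(errno) : "duplicated");
}

int main(void)
{
	try_dup(MAP_SHARED,  "MAP_SHARED ");
	try_dup(MAP_PRIVATE, "MAP_PRIVATE");	/* expected: Invalid argument */
	return 0;
}
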
diff --git a/mm/nommu.c b/mm/nommu.c
index fc184f597d59..53d5175a5c14 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void)
1962{ 1962{
1963 unsigned long free_kbytes; 1963 unsigned long free_kbytes;
1964 1964
1965 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 1965 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
1966 1966
1967 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 1967 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
1968 return 0; 1968 return 0;
@@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void)
1983{ 1983{
1984 unsigned long free_kbytes; 1984 unsigned long free_kbytes;
1985 1985
1986 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 1986 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
1987 1987
1988 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 1988 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
1989 return 0; 1989 return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e8b4f030c1c..99736e026712 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
495 } 495 }
496 496
497 /* 497 /*
498 * increase mm_users only after we know we will reap something so 498 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
499 * that the mmput_async is called only when we have reaped something 499 * work on the mm anymore. The check for MMF_OOM_SKIP must run
500 * and delayed __mmput doesn't matter that much 500 * under mmap_sem for reading because it serializes against the
501 * down_write();up_write() cycle in exit_mmap().
501 */ 502 */
502 if (!mmget_not_zero(mm)) { 503 if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
503 up_read(&mm->mmap_sem); 504 up_read(&mm->mmap_sem);
504 trace_skip_task_reaping(tsk->pid); 505 trace_skip_task_reaping(tsk->pid);
505 goto unlock_oom; 506 goto unlock_oom;
@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
542 K(get_mm_counter(mm, MM_SHMEMPAGES))); 543 K(get_mm_counter(mm, MM_SHMEMPAGES)));
543 up_read(&mm->mmap_sem); 544 up_read(&mm->mmap_sem);
544 545
545 /*
546 * Drop our reference but make sure the mmput slow path is called from a
547 * different context because we shouldn't risk we get stuck there and
548 * put the oom_reaper out of the way.
549 */
550 mmput_async(mm);
551 trace_finish_task_reaping(tsk->pid); 546 trace_finish_task_reaping(tsk->pid);
552unlock_oom: 547unlock_oom:
553 mutex_unlock(&oom_lock); 548 mutex_unlock(&oom_lock);
@@ -824,7 +819,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
824 819
825 /* 820 /*
826 * If the task is already exiting, don't alarm the sysadmin or kill 821 * If the task is already exiting, don't alarm the sysadmin or kill
827 * its children or threads, just set TIF_MEMDIE so it can die quickly 822 * its children or threads, just give it access to memory reserves
823 * so it can die quickly
828 */ 824 */
829 task_lock(p); 825 task_lock(p);
830 if (task_will_free_mem(p)) { 826 if (task_will_free_mem(p)) {
@@ -889,9 +885,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
889 count_memcg_event_mm(mm, OOM_KILL); 885 count_memcg_event_mm(mm, OOM_KILL);
890 886
891 /* 887 /*
892 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent 888 * We should send SIGKILL before granting access to memory reserves
893 * the OOM victim from depleting the memory reserves from the user 889 * in order to prevent the OOM victim from depleting the memory
894 * space under its control. 890 * reserves from the user space under its control.
895 */ 891 */
896 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 892 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
897 mark_oom_victim(victim); 893 mark_oom_victim(victim);
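
The reaper no longer takes an mm_users reference; it relies on exit_mmap() setting MMF_OOM_SKIP and on both sides meeting on mmap_sem, as the new comment describes. A standalone model of that handshake using POSIX primitives (illustration only, not the kernel code): the exiting side raises the skip flag and cycles the write lock, so a reaper that passed the flag check under the read lock has finished before teardown starts, and any later reaper backs off.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
    static atomic_bool oom_skip;    /* stands in for MMF_OOM_SKIP */

    static void exit_mmap_side(void)
    {
        atomic_store(&oom_skip, true);
        pthread_rwlock_wrlock(&mmap_sem);   /* wait out any reaper holding the read lock */
        pthread_rwlock_unlock(&mmap_sem);
        /* ...safe to tear down the address space now... */
    }

    static bool reaper_side(void)
    {
        pthread_rwlock_rdlock(&mmap_sem);
        if (atomic_load(&oom_skip)) {       /* exit_mmap already owns the teardown */
            pthread_rwlock_unlock(&mmap_sem);
            return false;
        }
        /* ...unmap reclaimable ranges under the read lock... */
        pthread_rwlock_unlock(&mmap_sem);
        return true;
    }

    int main(void)
    {
        printf("reaped: %d\n", reaper_side());  /* 1 */
        exit_mmap_side();
        printf("reaped: %d\n", reaper_side());  /* 0 */
        return 0;
    }
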
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bf050ab025b7..0b9c5cbe8eba 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void)
363{ 363{
364 unsigned long x; 364 unsigned long x;
365 365
366 x = global_page_state(NR_FREE_PAGES); 366 x = global_zone_page_state(NR_FREE_PAGES);
367 /* 367 /*
368 * Pages reserved for the kernel should not be considered 368 * Pages reserved for the kernel should not be considered
369 * dirtyable, to prevent a situation where reclaim has to 369 * dirtyable, to prevent a situation where reclaim has to
@@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
1405 * will look to see if it needs to start dirty throttling. 1405 * will look to see if it needs to start dirty throttling.
1406 * 1406 *
1407 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1407 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
1408 * global_page_state() too often. So scale it near-sqrt to the safety margin 1408 * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
1409 * (the number of pages we may dirty without exceeding the dirty limits). 1409 * (the number of pages we may dirty without exceeding the dirty limits).
1410 */ 1410 */
1411static unsigned long dirty_poll_interval(unsigned long dirty, 1411static unsigned long dirty_poll_interval(unsigned long dirty,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9327a940e373..a9add06fe768 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2951{ 2951{
2952 long min = mark; 2952 long min = mark;
2953 int o; 2953 int o;
2954 const bool alloc_harder = (alloc_flags & ALLOC_HARDER); 2954 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
2955 2955
2956 /* free_pages may go negative - that's OK */ 2956 /* free_pages may go negative - that's OK */
2957 free_pages -= (1 << order) - 1; 2957 free_pages -= (1 << order) - 1;
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2964 * the high-atomic reserves. This will over-estimate the size of the 2964 * the high-atomic reserves. This will over-estimate the size of the
2965 * atomic reserve but it avoids a search. 2965 * atomic reserve but it avoids a search.
2966 */ 2966 */
2967 if (likely(!alloc_harder)) 2967 if (likely(!alloc_harder)) {
2968 free_pages -= z->nr_reserved_highatomic; 2968 free_pages -= z->nr_reserved_highatomic;
2969 else 2969 } else {
2970 min -= min / 4; 2970 /*
2971 * OOM victims can try even harder than normal ALLOC_HARDER
2972 * users on the grounds that it's definitely going to be in
2973 * the exit path shortly and free memory. Any allocation it
2974 * makes during the free path will be small and short-lived.
2975 */
2976 if (alloc_flags & ALLOC_OOM)
2977 min -= min / 2;
2978 else
2979 min -= min / 4;
2980 }
2981
2971 2982
2972#ifdef CONFIG_CMA 2983#ifdef CONFIG_CMA
2973 /* If allocation can't use CMA areas don't use free CMA pages */ 2984 /* If allocation can't use CMA areas don't use free CMA pages */
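
Worked numbers for the relaxed watermark, as a standalone model (the ALLOC_* values below are placeholders, not the kernel's): with min = 1024 pages, a normal request must leave 1024 pages free, an ALLOC_HARDER request 768 (min - min/4), and an ALLOC_OOM request only 512 (min - min/2).

    #include <stdio.h>

    #define ALLOC_HARDER 0x1   /* assumption: arbitrary flag values for the model */
    #define ALLOC_OOM    0x2

    static long effective_min(long min, unsigned alloc_flags)
    {
        if (alloc_flags & ALLOC_OOM)
            return min - min / 2;   /* OOM victims: half the watermark */
        if (alloc_flags & ALLOC_HARDER)
            return min - min / 4;   /* other hard allocations: three quarters */
        return min;
    }

    int main(void)
    {
        printf("%ld %ld %ld\n",
               effective_min(1024, 0),             /* 1024 */
               effective_min(1024, ALLOC_HARDER),  /* 768  */
               effective_min(1024, ALLOC_OOM));    /* 512  */
        return 0;
    }
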
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3205 * of allowed nodes. 3216 * of allowed nodes.
3206 */ 3217 */
3207 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3218 if (!(gfp_mask & __GFP_NOMEMALLOC))
3208 if (test_thread_flag(TIF_MEMDIE) || 3219 if (tsk_is_oom_victim(current) ||
3209 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3220 (current->flags & (PF_MEMALLOC | PF_EXITING)))
3210 filter &= ~SHOW_MEM_FILTER_NODES; 3221 filter &= ~SHOW_MEM_FILTER_NODES;
3211 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3222 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
3668 return alloc_flags; 3679 return alloc_flags;
3669} 3680}
3670 3681
3671bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 3682static bool oom_reserves_allowed(struct task_struct *tsk)
3672{ 3683{
3673 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 3684 if (!tsk_is_oom_victim(tsk))
3674 return false; 3685 return false;
3675 3686
3687 /*
3688 * !MMU doesn't have oom reaper so give access to memory reserves
3689 * only to the thread with TIF_MEMDIE set
3690 */
3691 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3692 return false;
3693
3694 return true;
3695}
3696
3697/*
3698 * Distinguish requests which really need access to full memory
3699 * reserves from oom victims which can live with a portion of it
3700 */
3701static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3702{
3703 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3704 return 0;
3676 if (gfp_mask & __GFP_MEMALLOC) 3705 if (gfp_mask & __GFP_MEMALLOC)
3677 return true; 3706 return ALLOC_NO_WATERMARKS;
3678 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 3707 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3679 return true; 3708 return ALLOC_NO_WATERMARKS;
3680 if (!in_interrupt() && 3709 if (!in_interrupt()) {
3681 ((current->flags & PF_MEMALLOC) || 3710 if (current->flags & PF_MEMALLOC)
3682 unlikely(test_thread_flag(TIF_MEMDIE)))) 3711 return ALLOC_NO_WATERMARKS;
3683 return true; 3712 else if (oom_reserves_allowed(current))
3713 return ALLOC_OOM;
3714 }
3684 3715
3685 return false; 3716 return 0;
3717}
3718
3719bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3720{
3721 return !!__gfp_pfmemalloc_flags(gfp_mask);
3686} 3722}
3687 3723
3688/* 3724/*
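
In short, __gfp_pfmemalloc_flags() grades the access instead of answering yes/no: __GFP_NOMEMALLOC always gets 0; __GFP_MEMALLOC, PF_MEMALLOC in softirq, or PF_MEMALLOC in process context still get ALLOC_NO_WATERMARKS; and an OOM victim (on !MMU only if it also carries TIF_MEMDIE) gets the weaker ALLOC_OOM. gfp_pfmemalloc_allowed() keeps its old meaning as "any of the above is non-zero".
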
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3835 unsigned long alloc_start = jiffies; 3871 unsigned long alloc_start = jiffies;
3836 unsigned int stall_timeout = 10 * HZ; 3872 unsigned int stall_timeout = 10 * HZ;
3837 unsigned int cpuset_mems_cookie; 3873 unsigned int cpuset_mems_cookie;
3874 int reserve_flags;
3838 3875
3839 /* 3876 /*
3840 * In the slowpath, we sanity check order to avoid ever trying to 3877 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3940,15 +3977,16 @@ retry:
3940 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3977 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3941 wake_all_kswapds(order, ac); 3978 wake_all_kswapds(order, ac);
3942 3979
3943 if (gfp_pfmemalloc_allowed(gfp_mask)) 3980 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
3944 alloc_flags = ALLOC_NO_WATERMARKS; 3981 if (reserve_flags)
3982 alloc_flags = reserve_flags;
3945 3983
3946 /* 3984 /*
3947 * Reset the zonelist iterators if memory policies can be ignored. 3985 * Reset the zonelist iterators if memory policies can be ignored.
3948 * These allocations are high priority and system rather than user 3986 * These allocations are high priority and system rather than user
3949 * orientated. 3987 * orientated.
3950 */ 3988 */
3951 if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { 3989 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
3952 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); 3990 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
3953 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3991 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3954 ac->high_zoneidx, ac->nodemask); 3992 ac->high_zoneidx, ac->nodemask);
@@ -4025,8 +4063,8 @@ retry:
4025 goto got_pg; 4063 goto got_pg;
4026 4064
4027 /* Avoid allocations with no watermarks from looping endlessly */ 4065 /* Avoid allocations with no watermarks from looping endlessly */
4028 if (test_thread_flag(TIF_MEMDIE) && 4066 if (tsk_is_oom_victim(current) &&
4029 (alloc_flags == ALLOC_NO_WATERMARKS || 4067 (alloc_flags == ALLOC_OOM ||
4030 (gfp_mask & __GFP_NOMEMALLOC))) 4068 (gfp_mask & __GFP_NOMEMALLOC)))
4031 goto nopage; 4069 goto nopage;
4032 4070
@@ -4509,7 +4547,7 @@ long si_mem_available(void)
4509 * Estimate the amount of memory available for userspace allocations, 4547 * Estimate the amount of memory available for userspace allocations,
4510 * without causing swapping. 4548 * without causing swapping.
4511 */ 4549 */
4512 available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; 4550 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
4513 4551
4514 /* 4552 /*
4515 * Not all the page cache can be freed, otherwise the system will 4553 * Not all the page cache can be freed, otherwise the system will
@@ -4538,7 +4576,7 @@ void si_meminfo(struct sysinfo *val)
4538{ 4576{
4539 val->totalram = totalram_pages; 4577 val->totalram = totalram_pages;
4540 val->sharedram = global_node_page_state(NR_SHMEM); 4578 val->sharedram = global_node_page_state(NR_SHMEM);
4541 val->freeram = global_page_state(NR_FREE_PAGES); 4579 val->freeram = global_zone_page_state(NR_FREE_PAGES);
4542 val->bufferram = nr_blockdev_pages(); 4580 val->bufferram = nr_blockdev_pages();
4543 val->totalhigh = totalhigh_pages; 4581 val->totalhigh = totalhigh_pages;
4544 val->freehigh = nr_free_highpages(); 4582 val->freehigh = nr_free_highpages();
@@ -4673,11 +4711,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4673 global_node_page_state(NR_SLAB_UNRECLAIMABLE), 4711 global_node_page_state(NR_SLAB_UNRECLAIMABLE),
4674 global_node_page_state(NR_FILE_MAPPED), 4712 global_node_page_state(NR_FILE_MAPPED),
4675 global_node_page_state(NR_SHMEM), 4713 global_node_page_state(NR_SHMEM),
4676 global_page_state(NR_PAGETABLE), 4714 global_zone_page_state(NR_PAGETABLE),
4677 global_page_state(NR_BOUNCE), 4715 global_zone_page_state(NR_BOUNCE),
4678 global_page_state(NR_FREE_PAGES), 4716 global_zone_page_state(NR_FREE_PAGES),
4679 free_pcp, 4717 free_pcp,
4680 global_page_state(NR_FREE_CMA_PAGES)); 4718 global_zone_page_state(NR_FREE_CMA_PAGES));
4681 4719
4682 for_each_online_pgdat(pgdat) { 4720 for_each_online_pgdat(pgdat) {
4683 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 4721 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4839,18 +4877,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4839 * 4877 *
4840 * Add all populated zones of a node to the zonelist. 4878 * Add all populated zones of a node to the zonelist.
4841 */ 4879 */
4842static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 4880static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
4843 int nr_zones)
4844{ 4881{
4845 struct zone *zone; 4882 struct zone *zone;
4846 enum zone_type zone_type = MAX_NR_ZONES; 4883 enum zone_type zone_type = MAX_NR_ZONES;
4884 int nr_zones = 0;
4847 4885
4848 do { 4886 do {
4849 zone_type--; 4887 zone_type--;
4850 zone = pgdat->node_zones + zone_type; 4888 zone = pgdat->node_zones + zone_type;
4851 if (managed_zone(zone)) { 4889 if (managed_zone(zone)) {
4852 zoneref_set_zone(zone, 4890 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
4853 &zonelist->_zonerefs[nr_zones++]);
4854 check_highest_zone(zone_type); 4891 check_highest_zone(zone_type);
4855 } 4892 }
4856 } while (zone_type); 4893 } while (zone_type);
@@ -4858,52 +4895,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
4858 return nr_zones; 4895 return nr_zones;
4859} 4896}
4860 4897
4861
4862/*
4863 * zonelist_order:
4864 * 0 = automatic detection of better ordering.
4865 * 1 = order by ([node] distance, -zonetype)
4866 * 2 = order by (-zonetype, [node] distance)
4867 *
4868 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
4869 * the same zonelist. So only NUMA can configure this param.
4870 */
4871#define ZONELIST_ORDER_DEFAULT 0
4872#define ZONELIST_ORDER_NODE 1
4873#define ZONELIST_ORDER_ZONE 2
4874
4875/* zonelist order in the kernel.
4876 * set_zonelist_order() will set this to NODE or ZONE.
4877 */
4878static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
4879static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
4880
4881
4882#ifdef CONFIG_NUMA 4898#ifdef CONFIG_NUMA
4883/* The value user specified ....changed by config */
4884static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
4885/* string for sysctl */
4886#define NUMA_ZONELIST_ORDER_LEN 16
4887char numa_zonelist_order[16] = "default";
4888
4889/*
4890 * interface for configure zonelist ordering.
4891 * command line option "numa_zonelist_order"
4892 * = "[dD]efault - default, automatic configuration.
4893 * = "[nN]ode - order by node locality, then by zone within node
4894 * = "[zZ]one - order by zone, then by locality within zone
4895 */
4896 4899
4897static int __parse_numa_zonelist_order(char *s) 4900static int __parse_numa_zonelist_order(char *s)
4898{ 4901{
4899 if (*s == 'd' || *s == 'D') { 4902 /*
4900 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 4903 * We used to support different zonelist modes but they turned
4901 } else if (*s == 'n' || *s == 'N') { 4904 * out to be just not useful. Let's keep the warning in place
4902 user_zonelist_order = ZONELIST_ORDER_NODE; 4905 * if somebody still uses the cmd line parameter so that we do
4903 } else if (*s == 'z' || *s == 'Z') { 4906 * not fail it silently
4904 user_zonelist_order = ZONELIST_ORDER_ZONE; 4907 */
4905 } else { 4908 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
4906 pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); 4909 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
4907 return -EINVAL; 4910 return -EINVAL;
4908 } 4911 }
4909 return 0; 4912 return 0;
@@ -4911,19 +4914,15 @@ static int __parse_numa_zonelist_order(char *s)
4911 4914
4912static __init int setup_numa_zonelist_order(char *s) 4915static __init int setup_numa_zonelist_order(char *s)
4913{ 4916{
4914 int ret;
4915
4916 if (!s) 4917 if (!s)
4917 return 0; 4918 return 0;
4918 4919
4919 ret = __parse_numa_zonelist_order(s); 4920 return __parse_numa_zonelist_order(s);
4920 if (ret == 0)
4921 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
4922
4923 return ret;
4924} 4921}
4925early_param("numa_zonelist_order", setup_numa_zonelist_order); 4922early_param("numa_zonelist_order", setup_numa_zonelist_order);
4926 4923
4924char numa_zonelist_order[] = "Node";
4925
4927/* 4926/*
4928 * sysctl handler for numa_zonelist_order 4927 * sysctl handler for numa_zonelist_order
4929 */ 4928 */
@@ -4931,42 +4930,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
4931 void __user *buffer, size_t *length, 4930 void __user *buffer, size_t *length,
4932 loff_t *ppos) 4931 loff_t *ppos)
4933{ 4932{
4934 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 4933 char *str;
4935 int ret; 4934 int ret;
4936 static DEFINE_MUTEX(zl_order_mutex);
4937 4935
4938 mutex_lock(&zl_order_mutex); 4936 if (!write)
4939 if (write) { 4937 return proc_dostring(table, write, buffer, length, ppos);
4940 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 4938 str = memdup_user_nul(buffer, 16);
4941 ret = -EINVAL; 4939 if (IS_ERR(str))
4942 goto out; 4940 return PTR_ERR(str);
4943 }
4944 strcpy(saved_string, (char *)table->data);
4945 }
4946 ret = proc_dostring(table, write, buffer, length, ppos);
4947 if (ret)
4948 goto out;
4949 if (write) {
4950 int oldval = user_zonelist_order;
4951 4941
4952 ret = __parse_numa_zonelist_order((char *)table->data); 4942 ret = __parse_numa_zonelist_order(str);
4953 if (ret) { 4943 kfree(str);
4954 /*
4955 * bogus value. restore saved string
4956 */
4957 strncpy((char *)table->data, saved_string,
4958 NUMA_ZONELIST_ORDER_LEN);
4959 user_zonelist_order = oldval;
4960 } else if (oldval != user_zonelist_order) {
4961 mem_hotplug_begin();
4962 mutex_lock(&zonelists_mutex);
4963 build_all_zonelists(NULL, NULL);
4964 mutex_unlock(&zonelists_mutex);
4965 mem_hotplug_done();
4966 }
4967 }
4968out:
4969 mutex_unlock(&zl_order_mutex);
4970 return ret; 4944 return ret;
4971} 4945}
4972 4946
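
The rewritten handler no longer needs the saved_string/zl_order_mutex machinery because there is nothing left to store or rebuild: reads still go through proc_dostring(), while writes copy at most 16 bytes with memdup_user_nul(), which returns a freshly allocated NUL-terminated buffer (or an ERR_PTR on a fault), parse it only to emit the warning, and free it.
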
@@ -5040,17 +5014,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
5040 * This results in maximum locality--normal zone overflows into local 5014 * This results in maximum locality--normal zone overflows into local
5041 * DMA zone, if any--but risks exhausting DMA zone. 5015 * DMA zone, if any--but risks exhausting DMA zone.
5042 */ 5016 */
5043static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 5017static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5018 unsigned nr_nodes)
5044{ 5019{
5045 int j; 5020 struct zoneref *zonerefs;
5046 struct zonelist *zonelist; 5021 int i;
5022
5023 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5024
5025 for (i = 0; i < nr_nodes; i++) {
5026 int nr_zones;
5047 5027
5048 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 5028 pg_data_t *node = NODE_DATA(node_order[i]);
5049 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 5029
5050 ; 5030 nr_zones = build_zonerefs_node(node, zonerefs);
5051 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5031 zonerefs += nr_zones;
5052 zonelist->_zonerefs[j].zone = NULL; 5032 }
5053 zonelist->_zonerefs[j].zone_idx = 0; 5033 zonerefs->zone = NULL;
5034 zonerefs->zone_idx = 0;
5054} 5035}
5055 5036
5056/* 5037/*
@@ -5058,13 +5039,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
5058 */ 5039 */
5059static void build_thisnode_zonelists(pg_data_t *pgdat) 5040static void build_thisnode_zonelists(pg_data_t *pgdat)
5060{ 5041{
5061 int j; 5042 struct zoneref *zonerefs;
5062 struct zonelist *zonelist; 5043 int nr_zones;
5063 5044
5064 zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; 5045 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5065 j = build_zonelists_node(pgdat, zonelist, 0); 5046 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5066 zonelist->_zonerefs[j].zone = NULL; 5047 zonerefs += nr_zones;
5067 zonelist->_zonerefs[j].zone_idx = 0; 5048 zonerefs->zone = NULL;
5049 zonerefs->zone_idx = 0;
5068} 5050}
5069 5051
5070/* 5052/*
@@ -5073,79 +5055,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
5073 * exhausted, but results in overflowing to remote node while memory 5055 * exhausted, but results in overflowing to remote node while memory
5074 * may still exist in local DMA zone. 5056 * may still exist in local DMA zone.
5075 */ 5057 */
5076static int node_order[MAX_NUMNODES];
5077
5078static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
5079{
5080 int pos, j, node;
5081 int zone_type; /* needs to be signed */
5082 struct zone *z;
5083 struct zonelist *zonelist;
5084
5085 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
5086 pos = 0;
5087 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
5088 for (j = 0; j < nr_nodes; j++) {
5089 node = node_order[j];
5090 z = &NODE_DATA(node)->node_zones[zone_type];
5091 if (managed_zone(z)) {
5092 zoneref_set_zone(z,
5093 &zonelist->_zonerefs[pos++]);
5094 check_highest_zone(zone_type);
5095 }
5096 }
5097 }
5098 zonelist->_zonerefs[pos].zone = NULL;
5099 zonelist->_zonerefs[pos].zone_idx = 0;
5100}
5101
5102#if defined(CONFIG_64BIT)
5103/*
5104 * Devices that require DMA32/DMA are relatively rare and do not justify a
5105 * penalty to every machine in case the specialised case applies. Default
5106 * to Node-ordering on 64-bit NUMA machines
5107 */
5108static int default_zonelist_order(void)
5109{
5110 return ZONELIST_ORDER_NODE;
5111}
5112#else
5113/*
5114 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
5115 * by the kernel. If processes running on node 0 deplete the low memory zone
5116 * then reclaim will occur more frequency increasing stalls and potentially
5117 * be easier to OOM if a large percentage of the zone is under writeback or
5118 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
5119 * Hence, default to zone ordering on 32-bit.
5120 */
5121static int default_zonelist_order(void)
5122{
5123 return ZONELIST_ORDER_ZONE;
5124}
5125#endif /* CONFIG_64BIT */
5126
5127static void set_zonelist_order(void)
5128{
5129 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
5130 current_zonelist_order = default_zonelist_order();
5131 else
5132 current_zonelist_order = user_zonelist_order;
5133}
5134 5058
5135static void build_zonelists(pg_data_t *pgdat) 5059static void build_zonelists(pg_data_t *pgdat)
5136{ 5060{
5137 int i, node, load; 5061 static int node_order[MAX_NUMNODES];
5062 int node, load, nr_nodes = 0;
5138 nodemask_t used_mask; 5063 nodemask_t used_mask;
5139 int local_node, prev_node; 5064 int local_node, prev_node;
5140 struct zonelist *zonelist;
5141 unsigned int order = current_zonelist_order;
5142
5143 /* initialize zonelists */
5144 for (i = 0; i < MAX_ZONELISTS; i++) {
5145 zonelist = pgdat->node_zonelists + i;
5146 zonelist->_zonerefs[0].zone = NULL;
5147 zonelist->_zonerefs[0].zone_idx = 0;
5148 }
5149 5065
5150 /* NUMA-aware ordering of nodes */ 5066 /* NUMA-aware ordering of nodes */
5151 local_node = pgdat->node_id; 5067 local_node = pgdat->node_id;
@@ -5154,8 +5070,6 @@ static void build_zonelists(pg_data_t *pgdat)
5154 nodes_clear(used_mask); 5070 nodes_clear(used_mask);
5155 5071
5156 memset(node_order, 0, sizeof(node_order)); 5072 memset(node_order, 0, sizeof(node_order));
5157 i = 0;
5158
5159 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5073 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5160 /* 5074 /*
5161 * We don't want to pressure a particular node. 5075 * We don't want to pressure a particular node.
@@ -5166,19 +5080,12 @@ static void build_zonelists(pg_data_t *pgdat)
5166 node_distance(local_node, prev_node)) 5080 node_distance(local_node, prev_node))
5167 node_load[node] = load; 5081 node_load[node] = load;
5168 5082
5083 node_order[nr_nodes++] = node;
5169 prev_node = node; 5084 prev_node = node;
5170 load--; 5085 load--;
5171 if (order == ZONELIST_ORDER_NODE)
5172 build_zonelists_in_node_order(pgdat, node);
5173 else
5174 node_order[i++] = node; /* remember order */
5175 }
5176
5177 if (order == ZONELIST_ORDER_ZONE) {
5178 /* calculate node order -- i.e., DMA last! */
5179 build_zonelists_in_zone_order(pgdat, i);
5180 } 5086 }
5181 5087
5088 build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5182 build_thisnode_zonelists(pgdat); 5089 build_thisnode_zonelists(pgdat);
5183} 5090}
5184 5091
@@ -5204,21 +5111,17 @@ static void setup_min_unmapped_ratio(void);
5204static void setup_min_slab_ratio(void); 5111static void setup_min_slab_ratio(void);
5205#else /* CONFIG_NUMA */ 5112#else /* CONFIG_NUMA */
5206 5113
5207static void set_zonelist_order(void)
5208{
5209 current_zonelist_order = ZONELIST_ORDER_ZONE;
5210}
5211
5212static void build_zonelists(pg_data_t *pgdat) 5114static void build_zonelists(pg_data_t *pgdat)
5213{ 5115{
5214 int node, local_node; 5116 int node, local_node;
5215 enum zone_type j; 5117 struct zoneref *zonerefs;
5216 struct zonelist *zonelist; 5118 int nr_zones;
5217 5119
5218 local_node = pgdat->node_id; 5120 local_node = pgdat->node_id;
5219 5121
5220 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 5122 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5221 j = build_zonelists_node(pgdat, zonelist, 0); 5123 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5124 zonerefs += nr_zones;
5222 5125
5223 /* 5126 /*
5224 * Now we build the zonelist so that it contains the zones 5127 * Now we build the zonelist so that it contains the zones
@@ -5231,16 +5134,18 @@ static void build_zonelists(pg_data_t *pgdat)
5231 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5134 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5232 if (!node_online(node)) 5135 if (!node_online(node))
5233 continue; 5136 continue;
5234 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5137 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5138 zonerefs += nr_zones;
5235 } 5139 }
5236 for (node = 0; node < local_node; node++) { 5140 for (node = 0; node < local_node; node++) {
5237 if (!node_online(node)) 5141 if (!node_online(node))
5238 continue; 5142 continue;
5239 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5143 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5144 zonerefs += nr_zones;
5240 } 5145 }
5241 5146
5242 zonelist->_zonerefs[j].zone = NULL; 5147 zonerefs->zone = NULL;
5243 zonelist->_zonerefs[j].zone_idx = 0; 5148 zonerefs->zone_idx = 0;
5244} 5149}
5245 5150
5246#endif /* CONFIG_NUMA */ 5151#endif /* CONFIG_NUMA */
@@ -5263,50 +5168,32 @@ static void build_zonelists(pg_data_t *pgdat)
5263static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 5168static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
5264static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 5169static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
5265static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 5170static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
5266static void setup_zone_pageset(struct zone *zone);
5267
5268/*
5269 * Global mutex to protect against size modification of zonelists
5270 * as well as to serialize pageset setup for the new populated zone.
5271 */
5272DEFINE_MUTEX(zonelists_mutex);
5273 5171
5274/* return values int ....just for stop_machine() */ 5172static void __build_all_zonelists(void *data)
5275static int __build_all_zonelists(void *data)
5276{ 5173{
5277 int nid; 5174 int nid;
5278 int cpu; 5175 int __maybe_unused cpu;
5279 pg_data_t *self = data; 5176 pg_data_t *self = data;
5177 static DEFINE_SPINLOCK(lock);
5178
5179 spin_lock(&lock);
5280 5180
5281#ifdef CONFIG_NUMA 5181#ifdef CONFIG_NUMA
5282 memset(node_load, 0, sizeof(node_load)); 5182 memset(node_load, 0, sizeof(node_load));
5283#endif 5183#endif
5284 5184
5185 /*
5186 * This node is hotadded and no memory is yet present. So just
5187 * building zonelists is fine - no need to touch other nodes.
5188 */
5285 if (self && !node_online(self->node_id)) { 5189 if (self && !node_online(self->node_id)) {
5286 build_zonelists(self); 5190 build_zonelists(self);
5287 } 5191 } else {
5288 5192 for_each_online_node(nid) {
5289 for_each_online_node(nid) { 5193 pg_data_t *pgdat = NODE_DATA(nid);
5290 pg_data_t *pgdat = NODE_DATA(nid);
5291
5292 build_zonelists(pgdat);
5293 }
5294 5194
5295 /* 5195 build_zonelists(pgdat);
5296 * Initialize the boot_pagesets that are going to be used 5196 }
5297 * for bootstrapping processors. The real pagesets for
5298 * each zone will be allocated later when the per cpu
5299 * allocator is available.
5300 *
5301 * boot_pagesets are used also for bootstrapping offline
5302 * cpus if the system is already booted because the pagesets
5303 * are needed to initialize allocators on a specific cpu too.
5304 * F.e. the percpu allocator needs the page allocator which
5305 * needs the percpu allocator in order to allocate its pagesets
5306 * (a chicken-egg dilemma).
5307 */
5308 for_each_possible_cpu(cpu) {
5309 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5310 5197
5311#ifdef CONFIG_HAVE_MEMORYLESS_NODES 5198#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5312 /* 5199 /*
@@ -5317,45 +5204,53 @@ static int __build_all_zonelists(void *data)
5317 * secondary cpus' numa_mem as they come on-line. During 5204 * secondary cpus' numa_mem as they come on-line. During
5318 * node/memory hotplug, we'll fixup all on-line cpus. 5205 * node/memory hotplug, we'll fixup all on-line cpus.
5319 */ 5206 */
5320 if (cpu_online(cpu)) 5207 for_each_online_cpu(cpu)
5321 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 5208 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5322#endif 5209#endif
5323 } 5210 }
5324 5211
5325 return 0; 5212 spin_unlock(&lock);
5326} 5213}
5327 5214
5328static noinline void __init 5215static noinline void __init
5329build_all_zonelists_init(void) 5216build_all_zonelists_init(void)
5330{ 5217{
5218 int cpu;
5219
5331 __build_all_zonelists(NULL); 5220 __build_all_zonelists(NULL);
5221
5222 /*
5223 * Initialize the boot_pagesets that are going to be used
5224 * for bootstrapping processors. The real pagesets for
5225 * each zone will be allocated later when the per cpu
5226 * allocator is available.
5227 *
5228 * boot_pagesets are used also for bootstrapping offline
5229 * cpus if the system is already booted because the pagesets
5230 * are needed to initialize allocators on a specific cpu too.
5231 * F.e. the percpu allocator needs the page allocator which
5232 * needs the percpu allocator in order to allocate its pagesets
5233 * (a chicken-egg dilemma).
5234 */
5235 for_each_possible_cpu(cpu)
5236 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5237
5332 mminit_verify_zonelist(); 5238 mminit_verify_zonelist();
5333 cpuset_init_current_mems_allowed(); 5239 cpuset_init_current_mems_allowed();
5334} 5240}
5335 5241
5336/* 5242/*
5337 * Called with zonelists_mutex held always
5338 * unless system_state == SYSTEM_BOOTING. 5243 * unless system_state == SYSTEM_BOOTING.
5339 * 5244 *
5340 * __ref due to (1) call of __meminit annotated setup_zone_pageset 5245 * __ref due to call of __init annotated helper build_all_zonelists_init
5341 * [we're only called with non-NULL zone through __meminit paths] and
5342 * (2) call of __init annotated helper build_all_zonelists_init
5343 * [protected by SYSTEM_BOOTING]. 5246 * [protected by SYSTEM_BOOTING].
5344 */ 5247 */
5345void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 5248void __ref build_all_zonelists(pg_data_t *pgdat)
5346{ 5249{
5347 set_zonelist_order();
5348
5349 if (system_state == SYSTEM_BOOTING) { 5250 if (system_state == SYSTEM_BOOTING) {
5350 build_all_zonelists_init(); 5251 build_all_zonelists_init();
5351 } else { 5252 } else {
5352#ifdef CONFIG_MEMORY_HOTPLUG 5253 __build_all_zonelists(pgdat);
5353 if (zone)
5354 setup_zone_pageset(zone);
5355#endif
5356 /* we have to stop all cpus to guarantee there is no user
5357 of zonelist */
5358 stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
5359 /* cpuset refresh routine should be here */ 5254 /* cpuset refresh routine should be here */
5360 } 5255 }
5361 vm_total_pages = nr_free_pagecache_pages(); 5256 vm_total_pages = nr_free_pagecache_pages();
@@ -5371,9 +5266,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
5371 else 5266 else
5372 page_group_by_mobility_disabled = 0; 5267 page_group_by_mobility_disabled = 0;
5373 5268
5374 pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", 5269 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
5375 nr_online_nodes, 5270 nr_online_nodes,
5376 zonelist_order_name[current_zonelist_order],
5377 page_group_by_mobility_disabled ? "off" : "on", 5271 page_group_by_mobility_disabled ? "off" : "on",
5378 vm_total_pages); 5272 vm_total_pages);
5379#ifdef CONFIG_NUMA 5273#ifdef CONFIG_NUMA
@@ -5627,7 +5521,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
5627 pageset_set_high_and_batch(zone, pcp); 5521 pageset_set_high_and_batch(zone, pcp);
5628} 5522}
5629 5523
5630static void __meminit setup_zone_pageset(struct zone *zone) 5524void __meminit setup_zone_pageset(struct zone *zone)
5631{ 5525{
5632 int cpu; 5526 int cpu;
5633 zone->pageset = alloc_percpu(struct per_cpu_pageset); 5527 zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -7081,9 +6975,11 @@ static void __setup_per_zone_wmarks(void)
7081 */ 6975 */
7082void setup_per_zone_wmarks(void) 6976void setup_per_zone_wmarks(void)
7083{ 6977{
7084 mutex_lock(&zonelists_mutex); 6978 static DEFINE_SPINLOCK(lock);
6979
6980 spin_lock(&lock);
7085 __setup_per_zone_wmarks(); 6981 __setup_per_zone_wmarks();
7086 mutex_unlock(&zonelists_mutex); 6982 spin_unlock(&lock);
7087} 6983}
7088 6984
7089/* 6985/*
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 88ccc044b09a..32f18911deda 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid)
222 return addr; 222 return addr;
223 } 223 }
224 224
225 if (node_state(nid, N_HIGH_MEMORY)) 225 addr = vzalloc_node(size, nid);
226 addr = vzalloc_node(size, nid);
227 else
228 addr = vzalloc(size);
229 226
230 return addr; 227 return addr;
231} 228}
@@ -409,6 +406,7 @@ void __init page_ext_init(void)
409 continue; 406 continue;
410 if (init_section_page_ext(pfn, nid)) 407 if (init_section_page_ext(pfn, nid))
411 goto oom; 408 goto oom;
409 cond_resched();
412 } 410 }
413 } 411 }
414 hotplug_memory_notifier(page_ext_callback, 0); 412 hotplug_memory_notifier(page_ext_callback, 0);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 1b0f48c62316..4bd03a8d809e 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = {
204 NULL, 204 NULL,
205}; 205};
206 206
207static struct attribute_group page_idle_attr_group = { 207static const struct attribute_group page_idle_attr_group = {
208 .bin_attrs = page_idle_bin_attrs, 208 .bin_attrs = page_idle_bin_attrs,
209 .name = "page_idle", 209 .name = "page_idle",
210}; 210};
diff --git a/mm/page_io.c b/mm/page_io.c
index 5f61b54ee1f3..20139b90125a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -28,16 +28,18 @@
28static struct bio *get_swap_bio(gfp_t gfp_flags, 28static struct bio *get_swap_bio(gfp_t gfp_flags,
29 struct page *page, bio_end_io_t end_io) 29 struct page *page, bio_end_io_t end_io)
30{ 30{
31 int i, nr = hpage_nr_pages(page);
31 struct bio *bio; 32 struct bio *bio;
32 33
33 bio = bio_alloc(gfp_flags, 1); 34 bio = bio_alloc(gfp_flags, nr);
34 if (bio) { 35 if (bio) {
35 bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); 36 bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
36 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; 37 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
37 bio->bi_end_io = end_io; 38 bio->bi_end_io = end_io;
38 39
39 bio_add_page(bio, page, PAGE_SIZE, 0); 40 for (i = 0; i < nr; i++)
40 BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); 41 bio_add_page(bio, page + i, PAGE_SIZE, 0);
42 VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
41 } 43 }
42 return bio; 44 return bio;
43} 45}
@@ -262,6 +264,15 @@ static sector_t swap_page_sector(struct page *page)
262 return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); 264 return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
263} 265}
264 266
267static inline void count_swpout_vm_event(struct page *page)
268{
269#ifdef CONFIG_TRANSPARENT_HUGEPAGE
270 if (unlikely(PageTransHuge(page)))
271 count_vm_event(THP_SWPOUT);
272#endif
273 count_vm_events(PSWPOUT, hpage_nr_pages(page));
274}
275
265int __swap_writepage(struct page *page, struct writeback_control *wbc, 276int __swap_writepage(struct page *page, struct writeback_control *wbc,
266 bio_end_io_t end_write_func) 277 bio_end_io_t end_write_func)
267{ 278{
@@ -313,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
313 324
314 ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); 325 ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
315 if (!ret) { 326 if (!ret) {
316 count_vm_event(PSWPOUT); 327 count_swpout_vm_event(page);
317 return 0; 328 return 0;
318 } 329 }
319 330
@@ -326,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
326 goto out; 337 goto out;
327 } 338 }
328 bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); 339 bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
329 count_vm_event(PSWPOUT); 340 count_swpout_vm_event(page);
330 set_page_writeback(page); 341 set_page_writeback(page);
331 unlock_page(page); 342 unlock_page(page);
332 submit_bio(bio); 343 submit_bio(bio);
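
The bio is now sized by hpage_nr_pages(), so a transparent huge page can be submitted as a single write: with 4 KiB base pages a 2 MiB THP gives nr = 512, the loop adds 512 subpages for a 2 MiB bio, and count_swpout_vm_event() bumps THP_SWPOUT once while PSWPOUT grows by 512. For an ordinary page nr is 1 and the behaviour is unchanged.
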
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 0fd9dcf2c5dc..8e2d7137510c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited);
30 30
31static depot_stack_handle_t dummy_handle; 31static depot_stack_handle_t dummy_handle;
32static depot_stack_handle_t failure_handle; 32static depot_stack_handle_t failure_handle;
33static depot_stack_handle_t early_handle;
33 34
34static void init_early_allocated_pages(void); 35static void init_early_allocated_pages(void);
35 36
@@ -53,7 +54,7 @@ static bool need_page_owner(void)
53 return true; 54 return true;
54} 55}
55 56
56static noinline void register_dummy_stack(void) 57static __always_inline depot_stack_handle_t create_dummy_stack(void)
57{ 58{
58 unsigned long entries[4]; 59 unsigned long entries[4];
59 struct stack_trace dummy; 60 struct stack_trace dummy;
@@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void)
64 dummy.skip = 0; 65 dummy.skip = 0;
65 66
66 save_stack_trace(&dummy); 67 save_stack_trace(&dummy);
67 dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); 68 return depot_save_stack(&dummy, GFP_KERNEL);
68} 69}
69 70
70static noinline void register_failure_stack(void) 71static noinline void register_dummy_stack(void)
71{ 72{
72 unsigned long entries[4]; 73 dummy_handle = create_dummy_stack();
73 struct stack_trace failure; 74}
74 75
75 failure.nr_entries = 0; 76static noinline void register_failure_stack(void)
76 failure.max_entries = ARRAY_SIZE(entries); 77{
77 failure.entries = &entries[0]; 78 failure_handle = create_dummy_stack();
78 failure.skip = 0; 79}
79 80
80 save_stack_trace(&failure); 81static noinline void register_early_stack(void)
81 failure_handle = depot_save_stack(&failure, GFP_KERNEL); 82{
83 early_handle = create_dummy_stack();
82} 84}
83 85
84static void init_page_owner(void) 86static void init_page_owner(void)
@@ -88,6 +90,7 @@ static void init_page_owner(void)
88 90
89 register_dummy_stack(); 91 register_dummy_stack();
90 register_failure_stack(); 92 register_failure_stack();
93 register_early_stack();
91 static_branch_enable(&page_owner_inited); 94 static_branch_enable(&page_owner_inited);
92 init_early_allocated_pages(); 95 init_early_allocated_pages();
93} 96}
@@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
165 return handle; 168 return handle;
166} 169}
167 170
168noinline void __set_page_owner(struct page *page, unsigned int order, 171static inline void __set_page_owner_handle(struct page_ext *page_ext,
169 gfp_t gfp_mask) 172 depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
170{ 173{
171 struct page_ext *page_ext = lookup_page_ext(page);
172 struct page_owner *page_owner; 174 struct page_owner *page_owner;
173 175
174 if (unlikely(!page_ext))
175 return;
176
177 page_owner = get_page_owner(page_ext); 176 page_owner = get_page_owner(page_ext);
178 page_owner->handle = save_stack(gfp_mask); 177 page_owner->handle = handle;
179 page_owner->order = order; 178 page_owner->order = order;
180 page_owner->gfp_mask = gfp_mask; 179 page_owner->gfp_mask = gfp_mask;
181 page_owner->last_migrate_reason = -1; 180 page_owner->last_migrate_reason = -1;
@@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
183 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 182 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
184} 183}
185 184
185noinline void __set_page_owner(struct page *page, unsigned int order,
186 gfp_t gfp_mask)
187{
188 struct page_ext *page_ext = lookup_page_ext(page);
189 depot_stack_handle_t handle;
190
191 if (unlikely(!page_ext))
192 return;
193
194 handle = save_stack(gfp_mask);
195 __set_page_owner_handle(page_ext, handle, order, gfp_mask);
196}
197
186void __set_page_owner_migrate_reason(struct page *page, int reason) 198void __set_page_owner_migrate_reason(struct page *page, int reason)
187{ 199{
188 struct page_ext *page_ext = lookup_page_ext(page); 200 struct page_ext *page_ext = lookup_page_ext(page);
@@ -550,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
550 continue; 562 continue;
551 563
552 /* 564 /*
553 * We are safe to check buddy flag and order, because 565 * To avoid having to grab zone->lock, be a little
554 * this is init stage and only single thread runs. 566 * careful when reading buddy page order. The only
567 * danger is that we skip too much and potentially miss
568 * some early allocated pages, which is better than
569 * heavy lock contention.
555 */ 570 */
556 if (PageBuddy(page)) { 571 if (PageBuddy(page)) {
557 pfn += (1UL << page_order(page)) - 1; 572 unsigned long order = page_order_unsafe(page);
573
574 if (order > 0 && order < MAX_ORDER)
575 pfn += (1UL << order) - 1;
558 continue; 576 continue;
559 } 577 }
560 578
@@ -565,14 +583,15 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
565 if (unlikely(!page_ext)) 583 if (unlikely(!page_ext))
566 continue; 584 continue;
567 585
568 /* Maybe overraping zone */ 586 /* Maybe overlapping zone */
569 if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 587 if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
570 continue; 588 continue;
571 589
572 /* Found early allocated page */ 590 /* Found early allocated page */
573 set_page_owner(page, 0, 0); 591 __set_page_owner_handle(page_ext, early_handle, 0, 0);
574 count++; 592 count++;
575 } 593 }
594 cond_resched();
576 } 595 }
577 596
578 pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", 597 pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
@@ -583,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat)
583{ 602{
584 struct zone *zone; 603 struct zone *zone;
585 struct zone *node_zones = pgdat->node_zones; 604 struct zone *node_zones = pgdat->node_zones;
586 unsigned long flags;
587 605
588 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 606 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
589 if (!populated_zone(zone)) 607 if (!populated_zone(zone))
590 continue; 608 continue;
591 609
592 spin_lock_irqsave(&zone->lock, flags);
593 init_pages_in_zone(pgdat, zone); 610 init_pages_in_zone(pgdat, zone);
594 spin_unlock_irqrestore(&zone->lock, flags);
595 } 611 }
596} 612}
597 613
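
Because init_pages_in_zone() now runs without zone->lock, the buddy order it reads may be stale, so it is treated as a hint: page_order_unsafe() fetches it and the (0, MAX_ORDER) range check decides whether skipping a whole buddy block is safe. A standalone model of that validate-before-use step, assuming the common MAX_ORDER of 11:

    #include <stdio.h>

    #define MAX_ORDER 11   /* assumption: the usual default */

    /* Use a possibly-stale buddy order only if it is in the valid range. */
    static unsigned long pages_to_skip(unsigned long racy_order)
    {
        if (racy_order > 0 && racy_order < MAX_ORDER)
            return (1UL << racy_order) - 1;   /* skip the rest of the buddy block */
        return 0;                             /* garbage value: just advance one page */
    }

    int main(void)
    {
        printf("%lu %lu %lu\n",
               pages_to_skip(3),    /* 7 */
               pages_to_skip(0),    /* 0 */
               pages_to_skip(64));  /* 0 - out of range, treated as noise */
        return 0;
    }
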
diff --git a/mm/shmem.c b/mm/shmem.c
index fbcb3c96a186..ace53a582be5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/uio.h> 35#include <linux/uio.h>
36#include <linux/khugepaged.h> 36#include <linux/khugepaged.h>
37#include <linux/hugetlb.h>
37 38
38#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ 39#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
39 40
@@ -188,6 +189,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
188 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); 189 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
189} 190}
190 191
192static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
193{
194 struct shmem_inode_info *info = SHMEM_I(inode);
195 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
196
197 if (shmem_acct_block(info->flags, pages))
198 return false;
199
200 if (sbinfo->max_blocks) {
201 if (percpu_counter_compare(&sbinfo->used_blocks,
202 sbinfo->max_blocks - pages) > 0)
203 goto unacct;
204 percpu_counter_add(&sbinfo->used_blocks, pages);
205 }
206
207 return true;
208
209unacct:
210 shmem_unacct_blocks(info->flags, pages);
211 return false;
212}
213
214static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
215{
216 struct shmem_inode_info *info = SHMEM_I(inode);
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218
219 if (sbinfo->max_blocks)
220 percpu_counter_sub(&sbinfo->used_blocks, pages);
221 shmem_unacct_blocks(info->flags, pages);
222}
223
191static const struct super_operations shmem_ops; 224static const struct super_operations shmem_ops;
192static const struct address_space_operations shmem_aops; 225static const struct address_space_operations shmem_aops;
193static const struct file_operations shmem_file_operations; 226static const struct file_operations shmem_file_operations;
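
shmem_inode_acct_block()/shmem_inode_unacct_blocks() centralise the two-level accounting that used to be open-coded at every call site: charge the per-inode quota, then the filesystem-wide used_blocks counter, and roll back the first charge if the second fails. A standalone model of that charge-both-or-neither shape (counters and limits invented for the example):

    #include <stdbool.h>
    #include <stdio.h>

    static long quota_used, quota_max = 100;      /* per-"inode" quota */
    static long blocks_used, blocks_max = 1000;   /* per-"filesystem" block count */

    /* Charge both counters, or neither. */
    static bool acct_block(long pages)
    {
        if (quota_used + pages > quota_max)
            return false;
        quota_used += pages;
        if (blocks_used + pages > blocks_max) {
            quota_used -= pages;                  /* roll back the first charge */
            return false;
        }
        blocks_used += pages;
        return true;
    }

    static void unacct_blocks(long pages)
    {
        blocks_used -= pages;
        quota_used -= pages;
    }

    int main(void)
    {
        if (acct_block(8)) {
            /* ...later failure on this path... */
            unacct_blocks(8);
        }
        printf("%ld %ld\n", quota_used, blocks_used);  /* both back to 0 */
        return 0;
    }
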
@@ -249,23 +282,20 @@ static void shmem_recalc_inode(struct inode *inode)
249 282
250 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 283 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
251 if (freed > 0) { 284 if (freed > 0) {
252 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
253 if (sbinfo->max_blocks)
254 percpu_counter_add(&sbinfo->used_blocks, -freed);
255 info->alloced -= freed; 285 info->alloced -= freed;
256 inode->i_blocks -= freed * BLOCKS_PER_PAGE; 286 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
257 shmem_unacct_blocks(info->flags, freed); 287 shmem_inode_unacct_blocks(inode, freed);
258 } 288 }
259} 289}
260 290
261bool shmem_charge(struct inode *inode, long pages) 291bool shmem_charge(struct inode *inode, long pages)
262{ 292{
263 struct shmem_inode_info *info = SHMEM_I(inode); 293 struct shmem_inode_info *info = SHMEM_I(inode);
264 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
265 unsigned long flags; 294 unsigned long flags;
266 295
267 if (shmem_acct_block(info->flags, pages)) 296 if (!shmem_inode_acct_block(inode, pages))
268 return false; 297 return false;
298
269 spin_lock_irqsave(&info->lock, flags); 299 spin_lock_irqsave(&info->lock, flags);
270 info->alloced += pages; 300 info->alloced += pages;
271 inode->i_blocks += pages * BLOCKS_PER_PAGE; 301 inode->i_blocks += pages * BLOCKS_PER_PAGE;
@@ -273,26 +303,12 @@ bool shmem_charge(struct inode *inode, long pages)
273 spin_unlock_irqrestore(&info->lock, flags); 303 spin_unlock_irqrestore(&info->lock, flags);
274 inode->i_mapping->nrpages += pages; 304 inode->i_mapping->nrpages += pages;
275 305
276 if (!sbinfo->max_blocks)
277 return true;
278 if (percpu_counter_compare(&sbinfo->used_blocks,
279 sbinfo->max_blocks - pages) > 0) {
280 inode->i_mapping->nrpages -= pages;
281 spin_lock_irqsave(&info->lock, flags);
282 info->alloced -= pages;
283 shmem_recalc_inode(inode);
284 spin_unlock_irqrestore(&info->lock, flags);
285 shmem_unacct_blocks(info->flags, pages);
286 return false;
287 }
288 percpu_counter_add(&sbinfo->used_blocks, pages);
289 return true; 306 return true;
290} 307}
291 308
292void shmem_uncharge(struct inode *inode, long pages) 309void shmem_uncharge(struct inode *inode, long pages)
293{ 310{
294 struct shmem_inode_info *info = SHMEM_I(inode); 311 struct shmem_inode_info *info = SHMEM_I(inode);
295 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
296 unsigned long flags; 312 unsigned long flags;
297 313
298 spin_lock_irqsave(&info->lock, flags); 314 spin_lock_irqsave(&info->lock, flags);
@@ -301,9 +317,7 @@ void shmem_uncharge(struct inode *inode, long pages)
301 shmem_recalc_inode(inode); 317 shmem_recalc_inode(inode);
302 spin_unlock_irqrestore(&info->lock, flags); 318 spin_unlock_irqrestore(&info->lock, flags);
303 319
304 if (sbinfo->max_blocks) 320 shmem_inode_unacct_blocks(inode, pages);
305 percpu_counter_sub(&sbinfo->used_blocks, pages);
306 shmem_unacct_blocks(info->flags, pages);
307} 321}
308 322
309/* 323/*
@@ -1452,9 +1466,10 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1452} 1466}
1453 1467
1454static struct page *shmem_alloc_and_acct_page(gfp_t gfp, 1468static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1455 struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, 1469 struct inode *inode,
1456 pgoff_t index, bool huge) 1470 pgoff_t index, bool huge)
1457{ 1471{
1472 struct shmem_inode_info *info = SHMEM_I(inode);
1458 struct page *page; 1473 struct page *page;
1459 int nr; 1474 int nr;
1460 int err = -ENOSPC; 1475 int err = -ENOSPC;
@@ -1463,14 +1478,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1463 huge = false; 1478 huge = false;
1464 nr = huge ? HPAGE_PMD_NR : 1; 1479 nr = huge ? HPAGE_PMD_NR : 1;
1465 1480
1466 if (shmem_acct_block(info->flags, nr)) 1481 if (!shmem_inode_acct_block(inode, nr))
1467 goto failed; 1482 goto failed;
1468 if (sbinfo->max_blocks) {
1469 if (percpu_counter_compare(&sbinfo->used_blocks,
1470 sbinfo->max_blocks - nr) > 0)
1471 goto unacct;
1472 percpu_counter_add(&sbinfo->used_blocks, nr);
1473 }
1474 1483
1475 if (huge) 1484 if (huge)
1476 page = shmem_alloc_hugepage(gfp, info, index); 1485 page = shmem_alloc_hugepage(gfp, info, index);
@@ -1483,10 +1492,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1483 } 1492 }
1484 1493
1485 err = -ENOMEM; 1494 err = -ENOMEM;
1486 if (sbinfo->max_blocks) 1495 shmem_inode_unacct_blocks(inode, nr);
1487 percpu_counter_add(&sbinfo->used_blocks, -nr);
1488unacct:
1489 shmem_unacct_blocks(info->flags, nr);
1490failed: 1496failed:
1491 return ERR_PTR(err); 1497 return ERR_PTR(err);
1492} 1498}
@@ -1644,7 +1650,7 @@ repeat:
1644 1650
1645 if (swap.val) { 1651 if (swap.val) {
1646 /* Look it up and read it in.. */ 1652 /* Look it up and read it in.. */
1647 page = lookup_swap_cache(swap); 1653 page = lookup_swap_cache(swap, NULL, 0);
1648 if (!page) { 1654 if (!page) {
1649 /* Or update major stats only when swapin succeeds?? */ 1655 /* Or update major stats only when swapin succeeds?? */
1650 if (fault_type) { 1656 if (fault_type) {
@@ -1751,10 +1757,9 @@ repeat:
1751 } 1757 }
1752 1758
1753alloc_huge: 1759alloc_huge:
1754 page = shmem_alloc_and_acct_page(gfp, info, sbinfo, 1760 page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1755 index, true);
1756 if (IS_ERR(page)) { 1761 if (IS_ERR(page)) {
1757alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, 1762alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
1758 index, false); 1763 index, false);
1759 } 1764 }
1760 if (IS_ERR(page)) { 1765 if (IS_ERR(page)) {
@@ -1876,10 +1881,7 @@ clear:
1876 * Error recovery. 1881 * Error recovery.
1877 */ 1882 */
1878unacct: 1883unacct:
1879 if (sbinfo->max_blocks) 1884 shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
1880 percpu_counter_sub(&sbinfo->used_blocks,
1881 1 << compound_order(page));
1882 shmem_unacct_blocks(info->flags, 1 << compound_order(page));
1883 1885
1884 if (PageTransHuge(page)) { 1886 if (PageTransHuge(page)) {
1885 unlock_page(page); 1887 unlock_page(page);
@@ -2206,16 +2208,16 @@ bool shmem_mapping(struct address_space *mapping)
2206 return mapping->a_ops == &shmem_aops; 2208 return mapping->a_ops == &shmem_aops;
2207} 2209}
2208 2210
2209int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, 2211static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2210 pmd_t *dst_pmd, 2212 pmd_t *dst_pmd,
2211 struct vm_area_struct *dst_vma, 2213 struct vm_area_struct *dst_vma,
2212 unsigned long dst_addr, 2214 unsigned long dst_addr,
2213 unsigned long src_addr, 2215 unsigned long src_addr,
2214 struct page **pagep) 2216 bool zeropage,
2217 struct page **pagep)
2215{ 2218{
2216 struct inode *inode = file_inode(dst_vma->vm_file); 2219 struct inode *inode = file_inode(dst_vma->vm_file);
2217 struct shmem_inode_info *info = SHMEM_I(inode); 2220 struct shmem_inode_info *info = SHMEM_I(inode);
2218 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2219 struct address_space *mapping = inode->i_mapping; 2221 struct address_space *mapping = inode->i_mapping;
2220 gfp_t gfp = mapping_gfp_mask(mapping); 2222 gfp_t gfp = mapping_gfp_mask(mapping);
2221 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2223 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -2227,33 +2229,30 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2227 int ret; 2229 int ret;
2228 2230
2229 ret = -ENOMEM; 2231 ret = -ENOMEM;
2230 if (shmem_acct_block(info->flags, 1)) 2232 if (!shmem_inode_acct_block(inode, 1))
2231 goto out; 2233 goto out;
2232 if (sbinfo->max_blocks) {
2233 if (percpu_counter_compare(&sbinfo->used_blocks,
2234 sbinfo->max_blocks) >= 0)
2235 goto out_unacct_blocks;
2236 percpu_counter_inc(&sbinfo->used_blocks);
2237 }
2238 2234
2239 if (!*pagep) { 2235 if (!*pagep) {
2240 page = shmem_alloc_page(gfp, info, pgoff); 2236 page = shmem_alloc_page(gfp, info, pgoff);
2241 if (!page) 2237 if (!page)
2242 goto out_dec_used_blocks; 2238 goto out_unacct_blocks;
2243 2239
2244 page_kaddr = kmap_atomic(page); 2240 if (!zeropage) { /* mcopy_atomic */
2245 ret = copy_from_user(page_kaddr, (const void __user *)src_addr, 2241 page_kaddr = kmap_atomic(page);
2246 PAGE_SIZE); 2242 ret = copy_from_user(page_kaddr,
2247 kunmap_atomic(page_kaddr); 2243 (const void __user *)src_addr,
2248 2244 PAGE_SIZE);
2249 /* fallback to copy_from_user outside mmap_sem */ 2245 kunmap_atomic(page_kaddr);
2250 if (unlikely(ret)) { 2246
2251 *pagep = page; 2247 /* fallback to copy_from_user outside mmap_sem */
2252 if (sbinfo->max_blocks) 2248 if (unlikely(ret)) {
2253 percpu_counter_add(&sbinfo->used_blocks, -1); 2249 *pagep = page;
2254 shmem_unacct_blocks(info->flags, 1); 2250 shmem_inode_unacct_blocks(inode, 1);
2255 /* don't free the page */ 2251 /* don't free the page */
2256 return -EFAULT; 2252 return -EFAULT;
2253 }
2254 } else { /* mfill_zeropage_atomic */
2255 clear_highpage(page);
2257 } 2256 }
2258 } else { 2257 } else {
2259 page = *pagep; 2258 page = *pagep;
@@ -2314,14 +2313,33 @@ out_release_uncharge:
2314out_release: 2313out_release:
2315 unlock_page(page); 2314 unlock_page(page);
2316 put_page(page); 2315 put_page(page);
2317out_dec_used_blocks:
2318 if (sbinfo->max_blocks)
2319 percpu_counter_add(&sbinfo->used_blocks, -1);
2320out_unacct_blocks: 2316out_unacct_blocks:
2321 shmem_unacct_blocks(info->flags, 1); 2317 shmem_inode_unacct_blocks(inode, 1);
2322 goto out; 2318 goto out;
2323} 2319}
2324 2320
2321int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2322 pmd_t *dst_pmd,
2323 struct vm_area_struct *dst_vma,
2324 unsigned long dst_addr,
2325 unsigned long src_addr,
2326 struct page **pagep)
2327{
2328 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2329 dst_addr, src_addr, false, pagep);
2330}
2331
2332int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2333 pmd_t *dst_pmd,
2334 struct vm_area_struct *dst_vma,
2335 unsigned long dst_addr)
2336{
2337 struct page *page = NULL;
2338
2339 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2340 dst_addr, 0, true, &page);
2341}
2342
2325#ifdef CONFIG_TMPFS 2343#ifdef CONFIG_TMPFS
2326static const struct inode_operations shmem_symlink_inode_operations; 2344static const struct inode_operations shmem_symlink_inode_operations;
2327static const struct inode_operations shmem_short_symlink_operations; 2345static const struct inode_operations shmem_short_symlink_operations;
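
shmem_mfill_zeropage_pte() is the shmem/tmpfs backend that lets userfaultfd resolve a missing-page fault with zeroes instead of a copied page. A hedged userspace fragment of the resolving side, assuming the uffd descriptor is already registered for the faulting range with UFFDIO_REGISTER_MODE_MISSING:

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    /* Resolve a fault at page-aligned 'addr' with zeroes. */
    static int resolve_with_zeropage(int uffd, unsigned long addr, unsigned long len)
    {
        struct uffdio_zeropage zp = {
            .range = { .start = addr, .len = len },
            .mode  = 0,
        };

        if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp) == -1) {
            perror("UFFDIO_ZEROPAGE");
            return -1;
        }
        return 0;
    }
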
@@ -3635,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3635#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 3653#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
3636#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 3654#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
3637 3655
3638#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) 3656#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
3639 3657
3640SYSCALL_DEFINE2(memfd_create, 3658SYSCALL_DEFINE2(memfd_create,
3641 const char __user *, uname, 3659 const char __user *, uname,
@@ -3647,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create,
3647 char *name; 3665 char *name;
3648 long len; 3666 long len;
3649 3667
3650 if (flags & ~(unsigned int)MFD_ALL_FLAGS) 3668 if (!(flags & MFD_HUGETLB)) {
3651 return -EINVAL; 3669 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
3670 return -EINVAL;
3671 } else {
3672 /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
3673 if (flags & MFD_ALLOW_SEALING)
3674 return -EINVAL;
3675 /* Allow huge page size encoding in flags. */
3676 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
3677 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
3678 return -EINVAL;
3679 }
3652 3680
3653 /* length includes terminating zero */ 3681 /* length includes terminating zero */
3654 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 3682 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
@@ -3679,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create,
3679 goto err_name; 3707 goto err_name;
3680 } 3708 }
3681 3709
3682 file = shmem_file_setup(name, 0, VM_NORESERVE); 3710 if (flags & MFD_HUGETLB) {
3711 struct user_struct *user = NULL;
3712
3713 file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
3714 HUGETLB_ANONHUGE_INODE,
3715 (flags >> MFD_HUGE_SHIFT) &
3716 MFD_HUGE_MASK);
3717 } else
3718 file = shmem_file_setup(name, 0, VM_NORESERVE);
3683 if (IS_ERR(file)) { 3719 if (IS_ERR(file)) {
3684 error = PTR_ERR(file); 3720 error = PTR_ERR(file);
3685 goto err_fd; 3721 goto err_fd;
3686 } 3722 }
3687 info = SHMEM_I(file_inode(file));
3688 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 3723 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
3689 file->f_flags |= O_RDWR | O_LARGEFILE; 3724 file->f_flags |= O_RDWR | O_LARGEFILE;
3690 if (flags & MFD_ALLOW_SEALING) 3725
3726 if (flags & MFD_ALLOW_SEALING) {
3727 /*
3728 * flags check at beginning of function ensures
3729 * this is not a hugetlbfs (MFD_HUGETLB) file.
3730 */
3731 info = SHMEM_I(file_inode(file));
3691 info->seals &= ~F_SEAL_SEAL; 3732 info->seals &= ~F_SEAL_SEAL;
3733 }
3692 3734
3693 fd_install(fd, file); 3735 fd_install(fd, file);
3694 kfree(name); 3736 kfree(name);
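The memfd_create() changes above let userspace ask for a hugetlbfs-backed descriptor: MFD_HUGETLB is accepted, sealing is rejected for it, and a huge page size may be encoded in the bits covered by MFD_HUGE_MASK << MFD_HUGE_SHIFT. A minimal userspace sketch of the new path follows; it assumes a kernel with this series applied, a uapi header exporting MFD_CLOEXEC and MFD_HUGETLB, and a default huge page size of 2 MB, so it is illustrative rather than authoritative.

/*
 * Sketch: create an anonymous hugetlbfs file through memfd_create().
 * MFD_ALLOW_SEALING combined with MFD_HUGETLB would fail with EINVAL,
 * as enforced by the flag check added above.
 */
#define _GNU_SOURCE
#include <linux/memfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int fd = syscall(SYS_memfd_create, "hugedemo",
			 MFD_CLOEXEC | MFD_HUGETLB);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	/* Size must be a multiple of the huge page size (2 MB assumed here). */
	if (ftruncate(fd, 2UL << 20) < 0)
		perror("ftruncate");
	close(fd);
	return 0;
}

Because sealing is refused for MFD_HUGETLB files, the F_SEAL_SEAL handling later in the function only ever runs for tmpfs-backed descriptors, which is why the SHMEM_I() lookup could be moved under the MFD_ALLOW_SEALING branch.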
diff --git a/mm/slub.c b/mm/slub.c
index e8b4e31162ca..ddb04576b342 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,7 @@
34#include <linux/stacktrace.h> 34#include <linux/stacktrace.h>
35#include <linux/prefetch.h> 35#include <linux/prefetch.h>
36#include <linux/memcontrol.h> 36#include <linux/memcontrol.h>
37#include <linux/random.h>
37 38
38#include <trace/events/kmem.h> 39#include <trace/events/kmem.h>
39 40
@@ -238,30 +239,62 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
238 * Core slab cache functions 239 * Core slab cache functions
239 *******************************************************************/ 240 *******************************************************************/
240 241
242/*
243 * Returns freelist pointer (ptr). With hardening, this is obfuscated
244 * with an XOR of the address where the pointer is held and a per-cache
245 * random number.
246 */
247static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
248 unsigned long ptr_addr)
249{
250#ifdef CONFIG_SLAB_FREELIST_HARDENED
251 return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
252#else
253 return ptr;
254#endif
255}
256
257/* Returns the freelist pointer recorded at location ptr_addr. */
258static inline void *freelist_dereference(const struct kmem_cache *s,
259 void *ptr_addr)
260{
261 return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
262 (unsigned long)ptr_addr);
263}
264
241static inline void *get_freepointer(struct kmem_cache *s, void *object) 265static inline void *get_freepointer(struct kmem_cache *s, void *object)
242{ 266{
243 return *(void **)(object + s->offset); 267 return freelist_dereference(s, object + s->offset);
244} 268}
245 269
246static void prefetch_freepointer(const struct kmem_cache *s, void *object) 270static void prefetch_freepointer(const struct kmem_cache *s, void *object)
247{ 271{
248 prefetch(object + s->offset); 272 if (object)
273 prefetch(freelist_dereference(s, object + s->offset));
249} 274}
250 275
251static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 276static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
252{ 277{
278 unsigned long freepointer_addr;
253 void *p; 279 void *p;
254 280
255 if (!debug_pagealloc_enabled()) 281 if (!debug_pagealloc_enabled())
256 return get_freepointer(s, object); 282 return get_freepointer(s, object);
257 283
258 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); 284 freepointer_addr = (unsigned long)object + s->offset;
259 return p; 285 probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
286 return freelist_ptr(s, p, freepointer_addr);
260} 287}
261 288
262static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 289static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
263{ 290{
264 *(void **)(object + s->offset) = fp; 291 unsigned long freeptr_addr = (unsigned long)object + s->offset;
292
293#ifdef CONFIG_SLAB_FREELIST_HARDENED
294 BUG_ON(object == fp); /* naive detection of double free or corruption */
295#endif
296
297 *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
265} 298}
266 299
267/* Loop over all objects in a slab */ 300/* Loop over all objects in a slab */
@@ -3358,8 +3391,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
3358 struct kmem_cache_node *n; 3391 struct kmem_cache_node *n;
3359 3392
3360 for_each_kmem_cache_node(s, node, n) { 3393 for_each_kmem_cache_node(s, node, n) {
3361 kmem_cache_free(kmem_cache_node, n);
3362 s->node[node] = NULL; 3394 s->node[node] = NULL;
3395 kmem_cache_free(kmem_cache_node, n);
3363 } 3396 }
3364} 3397}
3365 3398
@@ -3389,8 +3422,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
3389 return 0; 3422 return 0;
3390 } 3423 }
3391 3424
3392 s->node[node] = n;
3393 init_kmem_cache_node(n); 3425 init_kmem_cache_node(n);
3426 s->node[node] = n;
3394 } 3427 }
3395 return 1; 3428 return 1;
3396} 3429}
@@ -3563,6 +3596,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3563{ 3596{
3564 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3597 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3565 s->reserved = 0; 3598 s->reserved = 0;
3599#ifdef CONFIG_SLAB_FREELIST_HARDENED
3600 s->random = get_random_long();
3601#endif
3566 3602
3567 if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) 3603 if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
3568 s->reserved = sizeof(struct rcu_head); 3604 s->reserved = sizeof(struct rcu_head);
@@ -5423,7 +5459,7 @@ static struct attribute *slab_attrs[] = {
5423 NULL 5459 NULL
5424}; 5460};
5425 5461
5426static struct attribute_group slab_attr_group = { 5462static const struct attribute_group slab_attr_group = {
5427 .attrs = slab_attrs, 5463 .attrs = slab_attrs,
5428}; 5464};
5429 5465
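The comment on freelist_ptr() above explains CONFIG_SLAB_FREELIST_HARDENED: the next-free pointer stored inside a free object is XORed with a per-cache random value (s->random) and with the address where it is stored, so a leaked or overwritten freelist entry is much harder to exploit. A standalone sketch of just that transform, which is its own inverse, is:

/* Minimal userspace sketch of the SLUB freelist pointer obfuscation. */
#include <assert.h>
#include <stdint.h>

static uintptr_t cache_random;	/* stands in for s->random (get_random_long()) */

/* XOR with the secret and the storage address; applying it twice decodes. */
static void *freelist_ptr(void *ptr, uintptr_t ptr_addr)
{
	return (void *)((uintptr_t)ptr ^ cache_random ^ ptr_addr);
}

int main(void)
{
	void *slot[2];
	void *next = &slot[1];

	cache_random = (uintptr_t)0x9e3779b97f4a7c15ull;	/* fixed for the demo */

	/* set_freepointer(): store the obfuscated value at &slot[0]. */
	slot[0] = freelist_ptr(next, (uintptr_t)&slot[0]);

	/* get_freepointer(): applying the same XOR recovers the real pointer. */
	assert(freelist_ptr(slot[0], (uintptr_t)&slot[0]) == next);
	return 0;
}

The BUG_ON(object == fp) added to set_freepointer() is the cheap double-free check this layout makes possible: freeing the same object twice in a row would make the object's freelist pointer refer to itself.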
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d1a39b8051e0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
54 if (slab_is_available()) { 54 if (slab_is_available()) {
55 struct page *page; 55 struct page *page;
56 56
57 if (node_state(node, N_HIGH_MEMORY)) 57 page = alloc_pages_node(node,
58 page = alloc_pages_node( 58 GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
59 node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, 59 get_order(size));
60 get_order(size));
61 else
62 page = alloc_pages(
63 GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
64 get_order(size));
65 if (page) 60 if (page)
66 return page_address(page); 61 return page_address(page);
67 return NULL; 62 return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..a9783acf2bb9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid)
65 unsigned long array_size = SECTIONS_PER_ROOT * 65 unsigned long array_size = SECTIONS_PER_ROOT *
66 sizeof(struct mem_section); 66 sizeof(struct mem_section);
67 67
68 if (slab_is_available()) { 68 if (slab_is_available())
69 if (node_state(nid, N_HIGH_MEMORY)) 69 section = kzalloc_node(array_size, GFP_KERNEL, nid);
70 section = kzalloc_node(array_size, GFP_KERNEL, nid); 70 else
71 else
72 section = kzalloc(array_size, GFP_KERNEL);
73 } else {
74 section = memblock_virt_alloc_node(array_size, nid); 71 section = memblock_virt_alloc_node(array_size, nid);
75 }
76 72
77 return section; 73 return section;
78} 74}
diff --git a/mm/swap.c b/mm/swap.c
index 60b1d2a75852..62d96b8e5eb3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -946,28 +946,34 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
946} 946}
947 947
948/** 948/**
949 * pagevec_lookup - gang pagecache lookup 949 * pagevec_lookup_range - gang pagecache lookup
950 * @pvec: Where the resulting pages are placed 950 * @pvec: Where the resulting pages are placed
951 * @mapping: The address_space to search 951 * @mapping: The address_space to search
952 * @start: The starting page index 952 * @start: The starting page index
953 * @end: The final page index
953 * @nr_pages: The maximum number of pages 954 * @nr_pages: The maximum number of pages
954 * 955 *
955 * pagevec_lookup() will search for and return a group of up to @nr_pages pages 956 * pagevec_lookup_range() will search for and return a group of up to @nr_pages
956 * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a 957 * pages in the mapping starting from index @start and upto index @end
958 * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
957 * reference against the pages in @pvec. 959 * reference against the pages in @pvec.
958 * 960 *
959 * The search returns a group of mapping-contiguous pages with ascending 961 * The search returns a group of mapping-contiguous pages with ascending
960 * indexes. There may be holes in the indices due to not-present pages. 962 * indexes. There may be holes in the indices due to not-present pages. We
963 * also update @start to index the next page for the traversal.
961 * 964 *
962 * pagevec_lookup() returns the number of pages which were found. 965 * pagevec_lookup_range() returns the number of pages which were found. If this
966 * number is smaller than @nr_pages, the end of specified range has been
967 * reached.
963 */ 968 */
964unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 969unsigned pagevec_lookup_range(struct pagevec *pvec,
965 pgoff_t start, unsigned nr_pages) 970 struct address_space *mapping, pgoff_t *start, pgoff_t end)
966{ 971{
967 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); 972 pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
973 pvec->pages);
968 return pagevec_count(pvec); 974 return pagevec_count(pvec);
969} 975}
970EXPORT_SYMBOL(pagevec_lookup); 976EXPORT_SYMBOL(pagevec_lookup_range);
971 977
972unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 978unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
973 pgoff_t *index, int tag, unsigned nr_pages) 979 pgoff_t *index, int tag, unsigned nr_pages)
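The reworked kernel-doc above states that pagevec_lookup_range() walks [@start, @end], advances @start past the pages it returned, and signals the end of the range by returning fewer pages than fit in the pagevec. A hypothetical in-tree caller (a sketch under those documented semantics, not code from this patch) would loop like this:

/* Sketch: count the pages cached in [index, end] of a mapping. */
#include <linux/pagevec.h>
#include <linux/pagemap.h>

static unsigned long count_cached_pages(struct address_space *mapping,
					pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	unsigned long count = 0;
	unsigned nr;

	pagevec_init(&pvec, 0);
	/* pagevec_lookup_range() advances @index past the pages it returned. */
	while ((nr = pagevec_lookup_range(&pvec, mapping, &index, end))) {
		count += nr;
		pagevec_release(&pvec);	/* drop the references taken on each page */
		if (nr < PAGEVEC_SIZE)	/* short return: end of range reached */
			break;
	}
	return count;
}

The @start update is what lets such a loop move forward without recomputing the next index from the last page it saw.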
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b68c93014f50..71ce2d1ccbf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = {
37 37
38struct address_space *swapper_spaces[MAX_SWAPFILES]; 38struct address_space *swapper_spaces[MAX_SWAPFILES];
39static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; 39static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
40bool swap_vma_readahead = true;
41
42#define SWAP_RA_MAX_ORDER_DEFAULT 3
43
44static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
45
46#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
47#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
48#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
49#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
50
51#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
52#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
53#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)
54
55#define SWAP_RA_VAL(addr, win, hits) \
56 (((addr) & PAGE_MASK) | \
57 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
58 ((hits) & SWAP_RA_HITS_MASK))
59
60/* Initial readahead hits is 4 to start up with a small window */
61#define GET_SWAP_RA_VAL(vma) \
62 (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
40 63
41#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 64#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
42#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) 65#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
@@ -297,19 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
297 * lock getting page table operations atomic even if we drop the page 320 * lock getting page table operations atomic even if we drop the page
298 * lock before returning. 321 * lock before returning.
299 */ 322 */
300struct page * lookup_swap_cache(swp_entry_t entry) 323struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
324 unsigned long addr)
301{ 325{
302 struct page *page; 326 struct page *page;
327 unsigned long ra_info;
328 int win, hits, readahead;
303 329
304 page = find_get_page(swap_address_space(entry), swp_offset(entry)); 330 page = find_get_page(swap_address_space(entry), swp_offset(entry));
305 331
306 if (page && likely(!PageTransCompound(page))) { 332 INC_CACHE_INFO(find_total);
333 if (page) {
307 INC_CACHE_INFO(find_success); 334 INC_CACHE_INFO(find_success);
308 if (TestClearPageReadahead(page)) 335 if (unlikely(PageTransCompound(page)))
309 atomic_inc(&swapin_readahead_hits); 336 return page;
337 readahead = TestClearPageReadahead(page);
338 if (vma) {
339 ra_info = GET_SWAP_RA_VAL(vma);
340 win = SWAP_RA_WIN(ra_info);
341 hits = SWAP_RA_HITS(ra_info);
342 if (readahead)
343 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
344 atomic_long_set(&vma->swap_readahead_info,
345 SWAP_RA_VAL(addr, win, hits));
346 }
347 if (readahead) {
348 count_vm_event(SWAP_RA_HIT);
349 if (!vma)
350 atomic_inc(&swapin_readahead_hits);
351 }
310 } 352 }
311
312 INC_CACHE_INFO(find_total);
313 return page; 353 return page;
314} 354}
315 355
@@ -424,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
424 return retpage; 464 return retpage;
425} 465}
426 466
427static unsigned long swapin_nr_pages(unsigned long offset) 467static unsigned int __swapin_nr_pages(unsigned long prev_offset,
468 unsigned long offset,
469 int hits,
470 int max_pages,
471 int prev_win)
428{ 472{
429 static unsigned long prev_offset; 473 unsigned int pages, last_ra;
430 unsigned int pages, max_pages, last_ra;
431 static atomic_t last_readahead_pages;
432
433 max_pages = 1 << READ_ONCE(page_cluster);
434 if (max_pages <= 1)
435 return 1;
436 474
437 /* 475 /*
438 * This heuristic has been found to work well on both sequential and 476 * This heuristic has been found to work well on both sequential and
439 * random loads, swapping to hard disk or to SSD: please don't ask 477 * random loads, swapping to hard disk or to SSD: please don't ask
440 * what the "+ 2" means, it just happens to work well, that's all. 478 * what the "+ 2" means, it just happens to work well, that's all.
441 */ 479 */
442 pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; 480 pages = hits + 2;
443 if (pages == 2) { 481 if (pages == 2) {
444 /* 482 /*
445 * We can have no readahead hits to judge by: but must not get 483 * We can have no readahead hits to judge by: but must not get
@@ -448,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset)
448 */ 486 */
449 if (offset != prev_offset + 1 && offset != prev_offset - 1) 487 if (offset != prev_offset + 1 && offset != prev_offset - 1)
450 pages = 1; 488 pages = 1;
451 prev_offset = offset;
452 } else { 489 } else {
453 unsigned int roundup = 4; 490 unsigned int roundup = 4;
454 while (roundup < pages) 491 while (roundup < pages)
@@ -460,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
460 pages = max_pages; 497 pages = max_pages;
461 498
462 /* Don't shrink readahead too fast */ 499 /* Don't shrink readahead too fast */
463 last_ra = atomic_read(&last_readahead_pages) / 2; 500 last_ra = prev_win / 2;
464 if (pages < last_ra) 501 if (pages < last_ra)
465 pages = last_ra; 502 pages = last_ra;
503
504 return pages;
505}
506
507static unsigned long swapin_nr_pages(unsigned long offset)
508{
509 static unsigned long prev_offset;
510 unsigned int hits, pages, max_pages;
511 static atomic_t last_readahead_pages;
512
513 max_pages = 1 << READ_ONCE(page_cluster);
514 if (max_pages <= 1)
515 return 1;
516
517 hits = atomic_xchg(&swapin_readahead_hits, 0);
518 pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
519 atomic_read(&last_readahead_pages));
520 if (!hits)
521 prev_offset = offset;
466 atomic_set(&last_readahead_pages, pages); 522 atomic_set(&last_readahead_pages, pages);
467 523
468 return pages; 524 return pages;
@@ -496,7 +552,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
496 unsigned long start_offset, end_offset; 552 unsigned long start_offset, end_offset;
497 unsigned long mask; 553 unsigned long mask;
498 struct blk_plug plug; 554 struct blk_plug plug;
499 bool do_poll = true; 555 bool do_poll = true, page_allocated;
500 556
501 mask = swapin_nr_pages(offset) - 1; 557 mask = swapin_nr_pages(offset) - 1;
502 if (!mask) 558 if (!mask)
@@ -512,12 +568,19 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
512 blk_start_plug(&plug); 568 blk_start_plug(&plug);
513 for (offset = start_offset; offset <= end_offset ; offset++) { 569 for (offset = start_offset; offset <= end_offset ; offset++) {
514 /* Ok, do the async read-ahead now */ 570 /* Ok, do the async read-ahead now */
515 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 571 page = __read_swap_cache_async(
516 gfp_mask, vma, addr, false); 572 swp_entry(swp_type(entry), offset),
573 gfp_mask, vma, addr, &page_allocated);
517 if (!page) 574 if (!page)
518 continue; 575 continue;
519 if (offset != entry_offset && likely(!PageTransCompound(page))) 576 if (page_allocated) {
520 SetPageReadahead(page); 577 swap_readpage(page, false);
578 if (offset != entry_offset &&
579 likely(!PageTransCompound(page))) {
580 SetPageReadahead(page);
581 count_vm_event(SWAP_RA);
582 }
583 }
521 put_page(page); 584 put_page(page);
522 } 585 }
523 blk_finish_plug(&plug); 586 blk_finish_plug(&plug);
@@ -561,3 +624,210 @@ void exit_swap_address_space(unsigned int type)
561 synchronize_rcu(); 624 synchronize_rcu();
562 kvfree(spaces); 625 kvfree(spaces);
563} 626}
627
628static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
629 unsigned long faddr,
630 unsigned long lpfn,
631 unsigned long rpfn,
632 unsigned long *start,
633 unsigned long *end)
634{
635 *start = max3(lpfn, PFN_DOWN(vma->vm_start),
636 PFN_DOWN(faddr & PMD_MASK));
637 *end = min3(rpfn, PFN_DOWN(vma->vm_end),
638 PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
639}
640
641struct page *swap_readahead_detect(struct vm_fault *vmf,
642 struct vma_swap_readahead *swap_ra)
643{
644 struct vm_area_struct *vma = vmf->vma;
645 unsigned long swap_ra_info;
646 struct page *page;
647 swp_entry_t entry;
648 unsigned long faddr, pfn, fpfn;
649 unsigned long start, end;
650 pte_t *pte;
651 unsigned int max_win, hits, prev_win, win, left;
652#ifndef CONFIG_64BIT
653 pte_t *tpte;
654#endif
655
656 faddr = vmf->address;
657 entry = pte_to_swp_entry(vmf->orig_pte);
658 if ((unlikely(non_swap_entry(entry))))
659 return NULL;
660 page = lookup_swap_cache(entry, vma, faddr);
661 if (page)
662 return page;
663
664 max_win = 1 << READ_ONCE(swap_ra_max_order);
665 if (max_win == 1) {
666 swap_ra->win = 1;
667 return NULL;
668 }
669
670 fpfn = PFN_DOWN(faddr);
671 swap_ra_info = GET_SWAP_RA_VAL(vma);
672 pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
673 prev_win = SWAP_RA_WIN(swap_ra_info);
674 hits = SWAP_RA_HITS(swap_ra_info);
675 swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
676 max_win, prev_win);
677 atomic_long_set(&vma->swap_readahead_info,
678 SWAP_RA_VAL(faddr, win, 0));
679
680 if (win == 1)
681 return NULL;
682
683 /* Copy the PTEs because the page table may be unmapped */
684 if (fpfn == pfn + 1)
685 swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
686 else if (pfn == fpfn + 1)
687 swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
688 &start, &end);
689 else {
690 left = (win - 1) / 2;
691 swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
692 &start, &end);
693 }
694 swap_ra->nr_pte = end - start;
695 swap_ra->offset = fpfn - start;
696 pte = vmf->pte - swap_ra->offset;
697#ifdef CONFIG_64BIT
698 swap_ra->ptes = pte;
699#else
700 tpte = swap_ra->ptes;
701 for (pfn = start; pfn != end; pfn++)
702 *tpte++ = *pte++;
703#endif
704
705 return NULL;
706}
707
708struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
709 struct vm_fault *vmf,
710 struct vma_swap_readahead *swap_ra)
711{
712 struct blk_plug plug;
713 struct vm_area_struct *vma = vmf->vma;
714 struct page *page;
715 pte_t *pte, pentry;
716 swp_entry_t entry;
717 unsigned int i;
718 bool page_allocated;
719
720 if (swap_ra->win == 1)
721 goto skip;
722
723 blk_start_plug(&plug);
724 for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
725 i++, pte++) {
726 pentry = *pte;
727 if (pte_none(pentry))
728 continue;
729 if (pte_present(pentry))
730 continue;
731 entry = pte_to_swp_entry(pentry);
732 if (unlikely(non_swap_entry(entry)))
733 continue;
734 page = __read_swap_cache_async(entry, gfp_mask, vma,
735 vmf->address, &page_allocated);
736 if (!page)
737 continue;
738 if (page_allocated) {
739 swap_readpage(page, false);
740 if (i != swap_ra->offset &&
741 likely(!PageTransCompound(page))) {
742 SetPageReadahead(page);
743 count_vm_event(SWAP_RA);
744 }
745 }
746 put_page(page);
747 }
748 blk_finish_plug(&plug);
749 lru_add_drain();
750skip:
751 return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
752 swap_ra->win == 1);
753}
754
755#ifdef CONFIG_SYSFS
756static ssize_t vma_ra_enabled_show(struct kobject *kobj,
757 struct kobj_attribute *attr, char *buf)
758{
759 return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
760}
761static ssize_t vma_ra_enabled_store(struct kobject *kobj,
762 struct kobj_attribute *attr,
763 const char *buf, size_t count)
764{
765 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
766 swap_vma_readahead = true;
767 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
768 swap_vma_readahead = false;
769 else
770 return -EINVAL;
771
772 return count;
773}
774static struct kobj_attribute vma_ra_enabled_attr =
775 __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
776 vma_ra_enabled_store);
777
778static ssize_t vma_ra_max_order_show(struct kobject *kobj,
779 struct kobj_attribute *attr, char *buf)
780{
781 return sprintf(buf, "%d\n", swap_ra_max_order);
782}
783static ssize_t vma_ra_max_order_store(struct kobject *kobj,
784 struct kobj_attribute *attr,
785 const char *buf, size_t count)
786{
787 int err, v;
788
789 err = kstrtoint(buf, 10, &v);
790 if (err || v > SWAP_RA_ORDER_CEILING || v <= 0)
791 return -EINVAL;
792
793 swap_ra_max_order = v;
794
795 return count;
796}
797static struct kobj_attribute vma_ra_max_order_attr =
798 __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show,
799 vma_ra_max_order_store);
800
801static struct attribute *swap_attrs[] = {
802 &vma_ra_enabled_attr.attr,
803 &vma_ra_max_order_attr.attr,
804 NULL,
805};
806
807static struct attribute_group swap_attr_group = {
808 .attrs = swap_attrs,
809};
810
811static int __init swap_init_sysfs(void)
812{
813 int err;
814 struct kobject *swap_kobj;
815
816 swap_kobj = kobject_create_and_add("swap", mm_kobj);
817 if (!swap_kobj) {
818 pr_err("failed to create swap kobject\n");
819 return -ENOMEM;
820 }
821 err = sysfs_create_group(swap_kobj, &swap_attr_group);
822 if (err) {
823 pr_err("failed to register swap group\n");
824 goto delete_obj;
825 }
826 return 0;
827
828delete_obj:
829 kobject_put(swap_kobj);
830 return err;
831}
832subsys_initcall(swap_init_sysfs);
833#endif
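The SWAP_RA_* macros introduced above pack the per-VMA readahead state -- the page-aligned address of the last swap fault, the current readahead window, and the recent hit count -- into one word so lookup_swap_cache() and swap_readahead_detect() can update it with a single atomic_long_set(). The hit field takes the low PAGE_SHIFT/2 bits, the window the next PAGE_SHIFT/2 bits, and the address the page-aligned rest. A standalone sketch of that layout, assuming PAGE_SHIFT is 12, is:

/* Sketch of the SWAP_RA_VAL() bit packing, assuming PAGE_SHIFT == 12. */
#include <assert.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)		  /* 6 */
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)  /* bits 0..5: hits */
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK) /* bits 6..11: window */

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

int main(void)
{
	unsigned long v = SWAP_RA_VAL(0x7f1234567000UL, 8, 3);

	/* The three fields occupy disjoint bit ranges of a single long. */
	assert(SWAP_RA_ADDR(v) == 0x7f1234567000UL);
	assert(SWAP_RA_WIN(v)  == 8);
	assert(SWAP_RA_HITS(v) == 3);
	return 0;
}

Writers such as lookup_swap_cache() rebuild the whole word with SWAP_RA_VAL() and store it into vma->swap_readahead_info in one atomic_long_set(), so readers never observe a mix of old and new fields.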
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b..d483278ee35b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
60EXPORT_SYMBOL_GPL(nr_swap_pages); 60EXPORT_SYMBOL_GPL(nr_swap_pages);
61/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 61/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
62long total_swap_pages; 62long total_swap_pages;
63static int least_priority; 63static int least_priority = -1;
64 64
65static const char Bad_file[] = "Bad swap file entry "; 65static const char Bad_file[] = "Bad swap file entry ";
66static const char Unused_file[] = "Unused swap file entry "; 66static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
85 * is held and the locking order requires swap_lock to be taken 85 * is held and the locking order requires swap_lock to be taken
86 * before any swap_info_struct->lock. 86 * before any swap_info_struct->lock.
87 */ 87 */
88static PLIST_HEAD(swap_avail_head); 88struct plist_head *swap_avail_heads;
89static DEFINE_SPINLOCK(swap_avail_lock); 89static DEFINE_SPINLOCK(swap_avail_lock);
90 90
91struct swap_info_struct *swap_info[MAX_SWAPFILES]; 91struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
96/* Activity counter to indicate that a swapon or swapoff has occurred */ 96/* Activity counter to indicate that a swapon or swapoff has occurred */
97static atomic_t proc_poll_event = ATOMIC_INIT(0); 97static atomic_t proc_poll_event = ATOMIC_INIT(0);
98 98
99atomic_t nr_rotate_swap = ATOMIC_INIT(0);
100
99static inline unsigned char swap_count(unsigned char ent) 101static inline unsigned char swap_count(unsigned char ent)
100{ 102{
101 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 103 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
265 info->data = 0; 267 info->data = 0;
266} 268}
267 269
270static inline bool cluster_is_huge(struct swap_cluster_info *info)
271{
272 return info->flags & CLUSTER_FLAG_HUGE;
273}
274
275static inline void cluster_clear_huge(struct swap_cluster_info *info)
276{
277 info->flags &= ~CLUSTER_FLAG_HUGE;
278}
279
268static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, 280static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
269 unsigned long offset) 281 unsigned long offset)
270{ 282{
@@ -580,6 +592,21 @@ new_cluster:
580 return found_free; 592 return found_free;
581} 593}
582 594
595static void __del_from_avail_list(struct swap_info_struct *p)
596{
597 int nid;
598
599 for_each_node(nid)
600 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
601}
602
603static void del_from_avail_list(struct swap_info_struct *p)
604{
605 spin_lock(&swap_avail_lock);
606 __del_from_avail_list(p);
607 spin_unlock(&swap_avail_lock);
608}
609
583static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 610static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
584 unsigned int nr_entries) 611 unsigned int nr_entries)
585{ 612{
@@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
593 if (si->inuse_pages == si->pages) { 620 if (si->inuse_pages == si->pages) {
594 si->lowest_bit = si->max; 621 si->lowest_bit = si->max;
595 si->highest_bit = 0; 622 si->highest_bit = 0;
596 spin_lock(&swap_avail_lock); 623 del_from_avail_list(si);
597 plist_del(&si->avail_list, &swap_avail_head); 624 }
598 spin_unlock(&swap_avail_lock); 625}
626
627static void add_to_avail_list(struct swap_info_struct *p)
628{
629 int nid;
630
631 spin_lock(&swap_avail_lock);
632 for_each_node(nid) {
633 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
634 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
599 } 635 }
636 spin_unlock(&swap_avail_lock);
600} 637}
601 638
602static void swap_range_free(struct swap_info_struct *si, unsigned long offset, 639static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
611 bool was_full = !si->highest_bit; 648 bool was_full = !si->highest_bit;
612 649
613 si->highest_bit = end; 650 si->highest_bit = end;
614 if (was_full && (si->flags & SWP_WRITEOK)) { 651 if (was_full && (si->flags & SWP_WRITEOK))
615 spin_lock(&swap_avail_lock); 652 add_to_avail_list(si);
616 WARN_ON(!plist_node_empty(&si->avail_list));
617 if (plist_node_empty(&si->avail_list))
618 plist_add(&si->avail_list, &swap_avail_head);
619 spin_unlock(&swap_avail_lock);
620 }
621 } 653 }
622 atomic_long_add(nr_entries, &nr_swap_pages); 654 atomic_long_add(nr_entries, &nr_swap_pages);
623 si->inuse_pages -= nr_entries; 655 si->inuse_pages -= nr_entries;
@@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
846 offset = idx * SWAPFILE_CLUSTER; 878 offset = idx * SWAPFILE_CLUSTER;
847 ci = lock_cluster(si, offset); 879 ci = lock_cluster(si, offset);
848 alloc_cluster(si, idx); 880 alloc_cluster(si, idx);
849 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); 881 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
850 882
851 map = si->swap_map + offset; 883 map = si->swap_map + offset;
852 for (i = 0; i < SWAPFILE_CLUSTER; i++) 884 for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
898 struct swap_info_struct *si, *next; 930 struct swap_info_struct *si, *next;
899 long avail_pgs; 931 long avail_pgs;
900 int n_ret = 0; 932 int n_ret = 0;
933 int node;
901 934
902 /* Only single cluster request supported */ 935 /* Only single cluster request supported */
903 WARN_ON_ONCE(n_goal > 1 && cluster); 936 WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
917 spin_lock(&swap_avail_lock); 950 spin_lock(&swap_avail_lock);
918 951
919start_over: 952start_over:
920 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { 953 node = numa_node_id();
954 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
921 /* requeue si to after same-priority siblings */ 955 /* requeue si to after same-priority siblings */
922 plist_requeue(&si->avail_list, &swap_avail_head); 956 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
923 spin_unlock(&swap_avail_lock); 957 spin_unlock(&swap_avail_lock);
924 spin_lock(&si->lock); 958 spin_lock(&si->lock);
925 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 959 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
926 spin_lock(&swap_avail_lock); 960 spin_lock(&swap_avail_lock);
927 if (plist_node_empty(&si->avail_list)) { 961 if (plist_node_empty(&si->avail_lists[node])) {
928 spin_unlock(&si->lock); 962 spin_unlock(&si->lock);
929 goto nextsi; 963 goto nextsi;
930 } 964 }
@@ -934,13 +968,14 @@ start_over:
934 WARN(!(si->flags & SWP_WRITEOK), 968 WARN(!(si->flags & SWP_WRITEOK),
935 "swap_info %d in list but !SWP_WRITEOK\n", 969 "swap_info %d in list but !SWP_WRITEOK\n",
936 si->type); 970 si->type);
937 plist_del(&si->avail_list, &swap_avail_head); 971 __del_from_avail_list(si);
938 spin_unlock(&si->lock); 972 spin_unlock(&si->lock);
939 goto nextsi; 973 goto nextsi;
940 } 974 }
941 if (cluster) 975 if (cluster) {
942 n_ret = swap_alloc_cluster(si, swp_entries); 976 if (!(si->flags & SWP_FILE))
943 else 977 n_ret = swap_alloc_cluster(si, swp_entries);
978 } else
944 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, 979 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
945 n_goal, swp_entries); 980 n_goal, swp_entries);
946 spin_unlock(&si->lock); 981 spin_unlock(&si->lock);
@@ -962,7 +997,7 @@ nextsi:
962 * swap_avail_head list then try it, otherwise start over 997 * swap_avail_head list then try it, otherwise start over
963 * if we have not gotten any slots. 998 * if we have not gotten any slots.
964 */ 999 */
965 if (plist_node_empty(&next->avail_list)) 1000 if (plist_node_empty(&next->avail_lists[node]))
966 goto start_over; 1001 goto start_over;
967 } 1002 }
968 1003
@@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry)
1168 struct swap_cluster_info *ci; 1203 struct swap_cluster_info *ci;
1169 struct swap_info_struct *si; 1204 struct swap_info_struct *si;
1170 unsigned char *map; 1205 unsigned char *map;
1171 unsigned int i; 1206 unsigned int i, free_entries = 0;
1207 unsigned char val;
1172 1208
1173 si = swap_info_get(entry); 1209 si = _swap_info_get(entry);
1174 if (!si) 1210 if (!si)
1175 return; 1211 return;
1176 1212
1177 ci = lock_cluster(si, offset); 1213 ci = lock_cluster(si, offset);
1214 VM_BUG_ON(!cluster_is_huge(ci));
1178 map = si->swap_map + offset; 1215 map = si->swap_map + offset;
1179 for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1216 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1180 VM_BUG_ON(map[i] != SWAP_HAS_CACHE); 1217 val = map[i];
1181 map[i] = 0; 1218 VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1219 if (val == SWAP_HAS_CACHE)
1220 free_entries++;
1221 }
1222 if (!free_entries) {
1223 for (i = 0; i < SWAPFILE_CLUSTER; i++)
1224 map[i] &= ~SWAP_HAS_CACHE;
1182 } 1225 }
1226 cluster_clear_huge(ci);
1183 unlock_cluster(ci); 1227 unlock_cluster(ci);
1184 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); 1228 if (free_entries == SWAPFILE_CLUSTER) {
1185 swap_free_cluster(si, idx); 1229 spin_lock(&si->lock);
1186 spin_unlock(&si->lock); 1230 ci = lock_cluster(si, offset);
1231 memset(map, 0, SWAPFILE_CLUSTER);
1232 unlock_cluster(ci);
1233 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1234 swap_free_cluster(si, idx);
1235 spin_unlock(&si->lock);
1236 } else if (free_entries) {
1237 for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
1238 if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
1239 free_swap_slot(entry);
1240 }
1241 }
1242}
1243
1244int split_swap_cluster(swp_entry_t entry)
1245{
1246 struct swap_info_struct *si;
1247 struct swap_cluster_info *ci;
1248 unsigned long offset = swp_offset(entry);
1249
1250 si = _swap_info_get(entry);
1251 if (!si)
1252 return -EBUSY;
1253 ci = lock_cluster(si, offset);
1254 cluster_clear_huge(ci);
1255 unlock_cluster(ci);
1256 return 0;
1187} 1257}
1188#else 1258#else
1189static inline void swapcache_free_cluster(swp_entry_t entry) 1259static inline void swapcache_free_cluster(swp_entry_t entry)
@@ -1332,29 +1402,161 @@ out:
1332 return count; 1402 return count;
1333} 1403}
1334 1404
1405#ifdef CONFIG_THP_SWAP
1406static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1407 swp_entry_t entry)
1408{
1409 struct swap_cluster_info *ci;
1410 unsigned char *map = si->swap_map;
1411 unsigned long roffset = swp_offset(entry);
1412 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1413 int i;
1414 bool ret = false;
1415
1416 ci = lock_cluster_or_swap_info(si, offset);
1417 if (!ci || !cluster_is_huge(ci)) {
1418 if (map[roffset] != SWAP_HAS_CACHE)
1419 ret = true;
1420 goto unlock_out;
1421 }
1422 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1423 if (map[offset + i] != SWAP_HAS_CACHE) {
1424 ret = true;
1425 break;
1426 }
1427 }
1428unlock_out:
1429 unlock_cluster_or_swap_info(si, ci);
1430 return ret;
1431}
1432
1433static bool page_swapped(struct page *page)
1434{
1435 swp_entry_t entry;
1436 struct swap_info_struct *si;
1437
1438 if (likely(!PageTransCompound(page)))
1439 return page_swapcount(page) != 0;
1440
1441 page = compound_head(page);
1442 entry.val = page_private(page);
1443 si = _swap_info_get(entry);
1444 if (si)
1445 return swap_page_trans_huge_swapped(si, entry);
1446 return false;
1447}
1448
1449static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1450 int *total_swapcount)
1451{
1452 int i, map_swapcount, _total_mapcount, _total_swapcount;
1453 unsigned long offset = 0;
1454 struct swap_info_struct *si;
1455 struct swap_cluster_info *ci = NULL;
1456 unsigned char *map = NULL;
1457 int mapcount, swapcount = 0;
1458
1459 /* hugetlbfs shouldn't call it */
1460 VM_BUG_ON_PAGE(PageHuge(page), page);
1461
1462 if (likely(!PageTransCompound(page))) {
1463 mapcount = atomic_read(&page->_mapcount) + 1;
1464 if (total_mapcount)
1465 *total_mapcount = mapcount;
1466 if (PageSwapCache(page))
1467 swapcount = page_swapcount(page);
1468 if (total_swapcount)
1469 *total_swapcount = swapcount;
1470 return mapcount + swapcount;
1471 }
1472
1473 page = compound_head(page);
1474
1475 _total_mapcount = _total_swapcount = map_swapcount = 0;
1476 if (PageSwapCache(page)) {
1477 swp_entry_t entry;
1478
1479 entry.val = page_private(page);
1480 si = _swap_info_get(entry);
1481 if (si) {
1482 map = si->swap_map;
1483 offset = swp_offset(entry);
1484 }
1485 }
1486 if (map)
1487 ci = lock_cluster(si, offset);
1488 for (i = 0; i < HPAGE_PMD_NR; i++) {
1489 mapcount = atomic_read(&page[i]._mapcount) + 1;
1490 _total_mapcount += mapcount;
1491 if (map) {
1492 swapcount = swap_count(map[offset + i]);
1493 _total_swapcount += swapcount;
1494 }
1495 map_swapcount = max(map_swapcount, mapcount + swapcount);
1496 }
1497 unlock_cluster(ci);
1498 if (PageDoubleMap(page)) {
1499 map_swapcount -= 1;
1500 _total_mapcount -= HPAGE_PMD_NR;
1501 }
1502 mapcount = compound_mapcount(page);
1503 map_swapcount += mapcount;
1504 _total_mapcount += mapcount;
1505 if (total_mapcount)
1506 *total_mapcount = _total_mapcount;
1507 if (total_swapcount)
1508 *total_swapcount = _total_swapcount;
1509
1510 return map_swapcount;
1511}
1512#else
1513#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
1514#define page_swapped(page) (page_swapcount(page) != 0)
1515
1516static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1517 int *total_swapcount)
1518{
1519 int mapcount, swapcount = 0;
1520
1521 /* hugetlbfs shouldn't call it */
1522 VM_BUG_ON_PAGE(PageHuge(page), page);
1523
1524 mapcount = page_trans_huge_mapcount(page, total_mapcount);
1525 if (PageSwapCache(page))
1526 swapcount = page_swapcount(page);
1527 if (total_swapcount)
1528 *total_swapcount = swapcount;
1529 return mapcount + swapcount;
1530}
1531#endif
1532
1335/* 1533/*
1336 * We can write to an anon page without COW if there are no other references 1534 * We can write to an anon page without COW if there are no other references
1337 * to it. And as a side-effect, free up its swap: because the old content 1535 * to it. And as a side-effect, free up its swap: because the old content
1338 * on disk will never be read, and seeking back there to write new content 1536 * on disk will never be read, and seeking back there to write new content
1339 * later would only waste time away from clustering. 1537 * later would only waste time away from clustering.
1340 * 1538 *
1341 * NOTE: total_mapcount should not be relied upon by the caller if 1539 * NOTE: total_map_swapcount should not be relied upon by the caller if
1342 * reuse_swap_page() returns false, but it may be always overwritten 1540 * reuse_swap_page() returns false, but it may be always overwritten
1343 * (see the other implementation for CONFIG_SWAP=n). 1541 * (see the other implementation for CONFIG_SWAP=n).
1344 */ 1542 */
1345bool reuse_swap_page(struct page *page, int *total_mapcount) 1543bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1346{ 1544{
1347 int count; 1545 int count, total_mapcount, total_swapcount;
1348 1546
1349 VM_BUG_ON_PAGE(!PageLocked(page), page); 1547 VM_BUG_ON_PAGE(!PageLocked(page), page);
1350 if (unlikely(PageKsm(page))) 1548 if (unlikely(PageKsm(page)))
1351 return false; 1549 return false;
1352 count = page_trans_huge_mapcount(page, total_mapcount); 1550 count = page_trans_huge_map_swapcount(page, &total_mapcount,
1353 if (count <= 1 && PageSwapCache(page)) { 1551 &total_swapcount);
1354 count += page_swapcount(page); 1552 if (total_map_swapcount)
1355 if (count != 1) 1553 *total_map_swapcount = total_mapcount + total_swapcount;
1356 goto out; 1554 if (count == 1 && PageSwapCache(page) &&
1555 (likely(!PageTransCompound(page)) ||
1556 /* The remaining swap count will be freed soon */
1557 total_swapcount == page_swapcount(page))) {
1357 if (!PageWriteback(page)) { 1558 if (!PageWriteback(page)) {
1559 page = compound_head(page);
1358 delete_from_swap_cache(page); 1560 delete_from_swap_cache(page);
1359 SetPageDirty(page); 1561 SetPageDirty(page);
1360 } else { 1562 } else {
@@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
1370 spin_unlock(&p->lock); 1572 spin_unlock(&p->lock);
1371 } 1573 }
1372 } 1574 }
1373out: 1575
1374 return count <= 1; 1576 return count <= 1;
1375} 1577}
1376 1578
@@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page)
1386 return 0; 1588 return 0;
1387 if (PageWriteback(page)) 1589 if (PageWriteback(page))
1388 return 0; 1590 return 0;
1389 if (page_swapcount(page)) 1591 if (page_swapped(page))
1390 return 0; 1592 return 0;
1391 1593
1392 /* 1594 /*
@@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page)
1407 if (pm_suspended_storage()) 1609 if (pm_suspended_storage())
1408 return 0; 1610 return 0;
1409 1611
1612 page = compound_head(page);
1410 delete_from_swap_cache(page); 1613 delete_from_swap_cache(page);
1411 SetPageDirty(page); 1614 SetPageDirty(page);
1412 return 1; 1615 return 1;
@@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry)
1428 p = _swap_info_get(entry); 1631 p = _swap_info_get(entry);
1429 if (p) { 1632 if (p) {
1430 count = __swap_entry_free(p, entry, 1); 1633 count = __swap_entry_free(p, entry, 1);
1431 if (count == SWAP_HAS_CACHE) { 1634 if (count == SWAP_HAS_CACHE &&
1635 !swap_page_trans_huge_swapped(p, entry)) {
1432 page = find_get_page(swap_address_space(entry), 1636 page = find_get_page(swap_address_space(entry),
1433 swp_offset(entry)); 1637 swp_offset(entry));
1434 if (page && !trylock_page(page)) { 1638 if (page && !trylock_page(page)) {
@@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry)
1445 */ 1649 */
1446 if (PageSwapCache(page) && !PageWriteback(page) && 1650 if (PageSwapCache(page) && !PageWriteback(page) &&
1447 (!page_mapped(page) || mem_cgroup_swap_full(page)) && 1651 (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1448 !swap_swapcount(p, entry)) { 1652 !swap_page_trans_huge_swapped(p, entry)) {
1653 page = compound_head(page);
1449 delete_from_swap_cache(page); 1654 delete_from_swap_cache(page);
1450 SetPageDirty(page); 1655 SetPageDirty(page);
1451 } 1656 }
@@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
1999 .sync_mode = WB_SYNC_NONE, 2204 .sync_mode = WB_SYNC_NONE,
2000 }; 2205 };
2001 2206
2002 swap_writepage(page, &wbc); 2207 swap_writepage(compound_head(page), &wbc);
2003 lock_page(page); 2208 lock_page(page);
2004 wait_on_page_writeback(page); 2209 wait_on_page_writeback(page);
2005 } 2210 }
@@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
2012 * delete, since it may not have been written out to swap yet. 2217 * delete, since it may not have been written out to swap yet.
2013 */ 2218 */
2014 if (PageSwapCache(page) && 2219 if (PageSwapCache(page) &&
2015 likely(page_private(page) == entry.val)) 2220 likely(page_private(page) == entry.val) &&
2016 delete_from_swap_cache(page); 2221 !page_swapped(page))
2222 delete_from_swap_cache(compound_head(page));
2017 2223
2018 /* 2224 /*
2019 * So we could skip searching mms once swap count went 2225 * So we could skip searching mms once swap count went
@@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2226 return generic_swapfile_activate(sis, swap_file, span); 2432 return generic_swapfile_activate(sis, swap_file, span);
2227} 2433}
2228 2434
2435static int swap_node(struct swap_info_struct *p)
2436{
2437 struct block_device *bdev;
2438
2439 if (p->bdev)
2440 bdev = p->bdev;
2441 else
2442 bdev = p->swap_file->f_inode->i_sb->s_bdev;
2443
2444 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2445}
2446
2229static void _enable_swap_info(struct swap_info_struct *p, int prio, 2447static void _enable_swap_info(struct swap_info_struct *p, int prio,
2230 unsigned char *swap_map, 2448 unsigned char *swap_map,
2231 struct swap_cluster_info *cluster_info) 2449 struct swap_cluster_info *cluster_info)
2232{ 2450{
2451 int i;
2452
2233 if (prio >= 0) 2453 if (prio >= 0)
2234 p->prio = prio; 2454 p->prio = prio;
2235 else 2455 else
@@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
2239 * low-to-high, while swap ordering is high-to-low 2459 * low-to-high, while swap ordering is high-to-low
2240 */ 2460 */
2241 p->list.prio = -p->prio; 2461 p->list.prio = -p->prio;
2242 p->avail_list.prio = -p->prio; 2462 for_each_node(i) {
2463 if (p->prio >= 0)
2464 p->avail_lists[i].prio = -p->prio;
2465 else {
2466 if (swap_node(p) == i)
2467 p->avail_lists[i].prio = 1;
2468 else
2469 p->avail_lists[i].prio = -p->prio;
2470 }
2471 }
2243 p->swap_map = swap_map; 2472 p->swap_map = swap_map;
2244 p->cluster_info = cluster_info; 2473 p->cluster_info = cluster_info;
2245 p->flags |= SWP_WRITEOK; 2474 p->flags |= SWP_WRITEOK;
@@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
2258 * swap_info_struct. 2487 * swap_info_struct.
2259 */ 2488 */
2260 plist_add(&p->list, &swap_active_head); 2489 plist_add(&p->list, &swap_active_head);
2261 spin_lock(&swap_avail_lock); 2490 add_to_avail_list(p);
2262 plist_add(&p->avail_list, &swap_avail_head);
2263 spin_unlock(&swap_avail_lock);
2264} 2491}
2265 2492
2266static void enable_swap_info(struct swap_info_struct *p, int prio, 2493static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2345 spin_unlock(&swap_lock); 2572 spin_unlock(&swap_lock);
2346 goto out_dput; 2573 goto out_dput;
2347 } 2574 }
2348 spin_lock(&swap_avail_lock); 2575 del_from_avail_list(p);
2349 plist_del(&p->avail_list, &swap_avail_head);
2350 spin_unlock(&swap_avail_lock);
2351 spin_lock(&p->lock); 2576 spin_lock(&p->lock);
2352 if (p->prio < 0) { 2577 if (p->prio < 0) {
2353 struct swap_info_struct *si = p; 2578 struct swap_info_struct *si = p;
2579 int nid;
2354 2580
2355 plist_for_each_entry_continue(si, &swap_active_head, list) { 2581 plist_for_each_entry_continue(si, &swap_active_head, list) {
2356 si->prio++; 2582 si->prio++;
2357 si->list.prio--; 2583 si->list.prio--;
2358 si->avail_list.prio--; 2584 for_each_node(nid) {
2585 if (si->avail_lists[nid].prio != 1)
2586 si->avail_lists[nid].prio--;
2587 }
2359 } 2588 }
2360 least_priority++; 2589 least_priority++;
2361 } 2590 }
@@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2387 if (p->flags & SWP_CONTINUED) 2616 if (p->flags & SWP_CONTINUED)
2388 free_swap_count_continuations(p); 2617 free_swap_count_continuations(p);
2389 2618
2619 if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2620 atomic_dec(&nr_rotate_swap);
2621
2390 mutex_lock(&swapon_mutex); 2622 mutex_lock(&swapon_mutex);
2391 spin_lock(&swap_lock); 2623 spin_lock(&swap_lock);
2392 spin_lock(&p->lock); 2624 spin_lock(&p->lock);
@@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
2596{ 2828{
2597 struct swap_info_struct *p; 2829 struct swap_info_struct *p;
2598 unsigned int type; 2830 unsigned int type;
2831 int i;
2599 2832
2600 p = kzalloc(sizeof(*p), GFP_KERNEL); 2833 p = kzalloc(sizeof(*p), GFP_KERNEL);
2601 if (!p) 2834 if (!p)
@@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
2631 } 2864 }
2632 INIT_LIST_HEAD(&p->first_swap_extent.list); 2865 INIT_LIST_HEAD(&p->first_swap_extent.list);
2633 plist_node_init(&p->list, 0); 2866 plist_node_init(&p->list, 0);
2634 plist_node_init(&p->avail_list, 0); 2867 for_each_node(i)
2868 plist_node_init(&p->avail_lists[i], 0);
2635 p->flags = SWP_USED; 2869 p->flags = SWP_USED;
2636 spin_unlock(&swap_lock); 2870 spin_unlock(&swap_lock);
2637 spin_lock_init(&p->lock); 2871 spin_lock_init(&p->lock);
@@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2873 if (!capable(CAP_SYS_ADMIN)) 3107 if (!capable(CAP_SYS_ADMIN))
2874 return -EPERM; 3108 return -EPERM;
2875 3109
3110 if (!swap_avail_heads)
3111 return -ENOMEM;
3112
2876 p = alloc_swap_info(); 3113 p = alloc_swap_info();
2877 if (IS_ERR(p)) 3114 if (IS_ERR(p))
2878 return PTR_ERR(p); 3115 return PTR_ERR(p);
@@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2963 cluster = per_cpu_ptr(p->percpu_cluster, cpu); 3200 cluster = per_cpu_ptr(p->percpu_cluster, cpu);
2964 cluster_set_null(&cluster->index); 3201 cluster_set_null(&cluster->index);
2965 } 3202 }
2966 } 3203 } else
3204 atomic_inc(&nr_rotate_swap);
2967 3205
2968 error = swap_cgroup_swapon(p->type, maxpages); 3206 error = swap_cgroup_swapon(p->type, maxpages);
2969 if (error) 3207 if (error)
@@ -3457,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
3457 } 3695 }
3458 } 3696 }
3459} 3697}
3698
3699static int __init swapfile_init(void)
3700{
3701 int nid;
3702
3703 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3704 GFP_KERNEL);
3705 if (!swap_avail_heads) {
3706 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3707 return -ENOMEM;
3708 }
3709
3710 for_each_node(nid)
3711 plist_head_init(&swap_avail_heads[nid]);
3712
3713 return 0;
3714}
3715subsys_initcall(swapfile_init);
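With swap_avail_head replaced by the per-node swap_avail_heads[] array, _enable_swap_info() above chooses a separate plist priority for every node: a device whose priority was set explicitly keeps -prio everywhere, while an auto-prioritized device (negative prio) gets the strongly preferred value 1 on the node reported by swap_node() and -prio on all other nodes, so allocations prefer swap local to the allocating node. A small sketch of that rule, with hypothetical helper names, is:

/*
 * Sketch of the per-node plist priority picked in _enable_swap_info().
 * swap_prio mirrors p->prio, device_node mirrors swap_node(p); a lower
 * plist priority means the device is tried earlier on that node.
 */
#include <stdio.h>

static int avail_list_prio(int swap_prio, int device_node, int node)
{
	if (swap_prio >= 0)		/* explicit priority (swapon -p) */
		return -swap_prio;
	if (node == device_node)	/* auto priority: prefer the local device */
		return 1;
	return -swap_prio;		/* remote nodes: ordinary auto priority */
}

int main(void)
{
	int node;

	/* An auto-prioritized device (prio -2) attached to node 1, 4 nodes total. */
	for (node = 0; node < 4; node++)
		printf("node %d: plist prio %d\n", node,
		       avail_list_prio(-2, 1, node));
	return 0;
}

get_swap_pages() then walks swap_avail_heads[numa_node_id()], so the node-local device (prio 1) sorts ahead of remote auto-prioritized devices (prio 2 and up) on its own node while keeping its ordinary position everywhere else.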
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8bcb501bce60..81192701964d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -371,6 +371,36 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
371 bool zeropage); 371 bool zeropage);
372#endif /* CONFIG_HUGETLB_PAGE */ 372#endif /* CONFIG_HUGETLB_PAGE */
373 373
374static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
375 pmd_t *dst_pmd,
376 struct vm_area_struct *dst_vma,
377 unsigned long dst_addr,
378 unsigned long src_addr,
379 struct page **page,
380 bool zeropage)
381{
382 ssize_t err;
383
384 if (vma_is_anonymous(dst_vma)) {
385 if (!zeropage)
386 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
387 dst_addr, src_addr, page);
388 else
389 err = mfill_zeropage_pte(dst_mm, dst_pmd,
390 dst_vma, dst_addr);
391 } else {
392 if (!zeropage)
393 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
394 dst_vma, dst_addr,
395 src_addr, page);
396 else
397 err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
398 dst_vma, dst_addr);
399 }
400
401 return err;
402}
403
374static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, 404static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
375 unsigned long dst_start, 405 unsigned long dst_start,
376 unsigned long src_start, 406 unsigned long src_start,
@@ -487,22 +517,8 @@ retry:
487 BUG_ON(pmd_none(*dst_pmd)); 517 BUG_ON(pmd_none(*dst_pmd));
488 BUG_ON(pmd_trans_huge(*dst_pmd)); 518 BUG_ON(pmd_trans_huge(*dst_pmd));
489 519
490 if (vma_is_anonymous(dst_vma)) { 520 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
491 if (!zeropage) 521 src_addr, &page, zeropage);
492 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
493 dst_addr, src_addr,
494 &page);
495 else
496 err = mfill_zeropage_pte(dst_mm, dst_pmd,
497 dst_vma, dst_addr);
498 } else {
499 err = -EINVAL; /* if zeropage is true return -EINVAL */
500 if (likely(!zeropage))
501 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
502 dst_vma, dst_addr,
503 src_addr, &page);
504 }
505
506 cond_resched(); 522 cond_resched();
507 523
508 if (unlikely(err == -EFAULT)) { 524 if (unlikely(err == -EFAULT)) {
diff --git a/mm/util.c b/mm/util.c
index 9ecddf568fe3..34e57fae959d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
614 return 0; 614 return 0;
615 615
616 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 616 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
617 free = global_page_state(NR_FREE_PAGES); 617 free = global_zone_page_state(NR_FREE_PAGES);
618 free += global_node_page_state(NR_FILE_PAGES); 618 free += global_node_page_state(NR_FILE_PAGES);
619 619
620 /* 620 /*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a47e3894c775..8a43db6284eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -49,12 +49,10 @@ static void __vunmap(const void *, int);
49static void free_work(struct work_struct *w) 49static void free_work(struct work_struct *w)
50{ 50{
51 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); 51 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
52 struct llist_node *llnode = llist_del_all(&p->list); 52 struct llist_node *t, *llnode;
53 while (llnode) { 53
54 void *p = llnode; 54 llist_for_each_safe(llnode, t, llist_del_all(&p->list))
55 llnode = llist_next(llnode); 55 __vunmap((void *)llnode, 1);
56 __vunmap(p, 1);
57 }
58} 56}
59 57
60/*** Page table manipulation functions ***/ 58/*** Page table manipulation functions ***/
@@ -2482,7 +2480,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2482 * matching slot. While scanning, if any of the areas overlaps with 2480 * matching slot. While scanning, if any of the areas overlaps with
2483 * existing vmap_area, the base address is pulled down to fit the 2481 * existing vmap_area, the base address is pulled down to fit the
2484 * area. Scanning is repeated till all the areas fit and then all 2482 * area. Scanning is repeated till all the areas fit and then all
2485 * necessary data structres are inserted and the result is returned. 2483 * necessary data structures are inserted and the result is returned.
2486 */ 2484 */
2487struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2485struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2488 const size_t *sizes, int nr_vms, 2486 const size_t *sizes, int nr_vms,
@@ -2510,15 +2508,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2510 if (start > offsets[last_area]) 2508 if (start > offsets[last_area])
2511 last_area = area; 2509 last_area = area;
2512 2510
2513 for (area2 = 0; area2 < nr_vms; area2++) { 2511 for (area2 = area + 1; area2 < nr_vms; area2++) {
2514 unsigned long start2 = offsets[area2]; 2512 unsigned long start2 = offsets[area2];
2515 unsigned long end2 = start2 + sizes[area2]; 2513 unsigned long end2 = start2 + sizes[area2];
2516 2514
2517 if (area2 == area) 2515 BUG_ON(start2 < end && start < end2);
2518 continue;
2519
2520 BUG_ON(start2 >= start && start2 < end);
2521 BUG_ON(end2 <= end && end2 > start);
2522 } 2516 }
2523 } 2517 }
2524 last_end = offsets[last_area] + sizes[last_area]; 2518 last_end = offsets[last_area] + sizes[last_area];
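The pcpu_get_vm_areas() change above replaces the two asymmetric BUG_ON checks with the canonical half-open interval test (start2 < end && start < end2) and starts the inner loop at area + 1, so every unordered pair of requested areas is checked exactly once instead of twice. A standalone sketch of the same check is:

/* Sketch: pairwise overlap detection over half-open [offset, offset + size) ranges. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* [s1, e1) and [s2, e2) overlap iff each start precedes the other range's end. */
static bool ranges_overlap(unsigned long s1, unsigned long e1,
			   unsigned long s2, unsigned long e2)
{
	return s2 < e1 && s1 < e2;
}

static bool any_overlap(const unsigned long *offsets, const size_t *sizes, int nr)
{
	int i, j;

	for (i = 0; i < nr; i++)
		for (j = i + 1; j < nr; j++)	/* each unordered pair once */
			if (ranges_overlap(offsets[i], offsets[i] + sizes[i],
					   offsets[j], offsets[j] + sizes[j]))
				return true;
	return false;
}

int main(void)
{
	unsigned long offs[] = { 0x0, 0x2000, 0x800 };
	size_t sizes[]       = { 0x1000, 0x1000, 0x1000 };

	assert(!any_overlap(offs, sizes, 2));	/* [0,0x1000) and [0x2000,0x3000) are disjoint */
	assert(any_overlap(offs, sizes, 3));	/* [0x800,0x1800) intersects the first range */
	return 0;
}

The kernel function keeps BUG_ON rather than returning a boolean, treating overlapping percpu chunk offsets as a caller bug rather than a recoverable condition.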
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f957afe900ec..13d711dd8776 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
393 unsigned long nr_to_scan = min(batch_size, total_scan); 393 unsigned long nr_to_scan = min(batch_size, total_scan);
394 394
395 shrinkctl->nr_to_scan = nr_to_scan; 395 shrinkctl->nr_to_scan = nr_to_scan;
396 shrinkctl->nr_scanned = nr_to_scan;
396 ret = shrinker->scan_objects(shrinker, shrinkctl); 397 ret = shrinker->scan_objects(shrinker, shrinkctl);
397 if (ret == SHRINK_STOP) 398 if (ret == SHRINK_STOP)
398 break; 399 break;
399 freed += ret; 400 freed += ret;
400 401
401 count_vm_events(SLABS_SCANNED, nr_to_scan); 402 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
402 total_scan -= nr_to_scan; 403 total_scan -= shrinkctl->nr_scanned;
403 scanned += nr_to_scan; 404 scanned += shrinkctl->nr_scanned;
404 405
405 cond_resched(); 406 cond_resched();
406 } 407 }
@@ -535,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page)
535 * that isolated the page, the page cache radix tree and 536 * that isolated the page, the page cache radix tree and
536 * optional buffer heads at page->private. 537 * optional buffer heads at page->private.
537 */ 538 */
538 return page_count(page) - page_has_private(page) == 2; 539 int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
540 HPAGE_PMD_NR : 1;
541 return page_count(page) - page_has_private(page) == 1 + radix_pins;
539} 542}
540 543
541static int may_write_to_inode(struct inode *inode, struct scan_control *sc) 544static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -665,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
665 bool reclaimed) 668 bool reclaimed)
666{ 669{
667 unsigned long flags; 670 unsigned long flags;
671 int refcount;
668 672
669 BUG_ON(!PageLocked(page)); 673 BUG_ON(!PageLocked(page));
670 BUG_ON(mapping != page_mapping(page)); 674 BUG_ON(mapping != page_mapping(page));
@@ -695,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
695 * Note that if SetPageDirty is always performed via set_page_dirty, 699 * Note that if SetPageDirty is always performed via set_page_dirty,
696 * and thus under tree_lock, then this ordering is not required. 700 * and thus under tree_lock, then this ordering is not required.
697 */ 701 */
698 if (!page_ref_freeze(page, 2)) 702 if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
703 refcount = 1 + HPAGE_PMD_NR;
704 else
705 refcount = 2;
706 if (!page_ref_freeze(page, refcount))
699 goto cannot_free; 707 goto cannot_free;
700 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ 708 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
701 if (unlikely(PageDirty(page))) { 709 if (unlikely(PageDirty(page))) {
702 page_ref_unfreeze(page, 2); 710 page_ref_unfreeze(page, refcount);
703 goto cannot_free; 711 goto cannot_free;
704 } 712 }
705 713
@@ -1121,58 +1129,59 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1121 * Try to allocate it some swap space here. 1129 * Try to allocate it some swap space here.
1122 * Lazyfree page could be freed directly 1130 * Lazyfree page could be freed directly
1123 */ 1131 */
1124 if (PageAnon(page) && PageSwapBacked(page) && 1132 if (PageAnon(page) && PageSwapBacked(page)) {
1125 !PageSwapCache(page)) { 1133 if (!PageSwapCache(page)) {
1126 if (!(sc->gfp_mask & __GFP_IO)) 1134 if (!(sc->gfp_mask & __GFP_IO))
1127 goto keep_locked; 1135 goto keep_locked;
1128 if (PageTransHuge(page)) { 1136 if (PageTransHuge(page)) {
1129 /* cannot split THP, skip it */ 1137 /* cannot split THP, skip it */
1130 if (!can_split_huge_page(page, NULL)) 1138 if (!can_split_huge_page(page, NULL))
1131 goto activate_locked; 1139 goto activate_locked;
1132 /* 1140 /*
1133 * Split pages without a PMD map right 1141 * Split pages without a PMD map right
1134 * away. Chances are some or all of the 1142 * away. Chances are some or all of the
1135 * tail pages can be freed without IO. 1143 * tail pages can be freed without IO.
1136 */ 1144 */
1137 if (!compound_mapcount(page) && 1145 if (!compound_mapcount(page) &&
1138 split_huge_page_to_list(page, page_list)) 1146 split_huge_page_to_list(page,
1139 goto activate_locked; 1147 page_list))
1140 } 1148 goto activate_locked;
1141 if (!add_to_swap(page)) { 1149 }
1142 if (!PageTransHuge(page)) 1150 if (!add_to_swap(page)) {
1143 goto activate_locked; 1151 if (!PageTransHuge(page))
1144 /* Split THP and swap individual base pages */ 1152 goto activate_locked;
1145 if (split_huge_page_to_list(page, page_list)) 1153 /* Fallback to swap normal pages */
1146 goto activate_locked; 1154 if (split_huge_page_to_list(page,
1147 if (!add_to_swap(page)) 1155 page_list))
1148 goto activate_locked; 1156 goto activate_locked;
1149 } 1157#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1150 1158 count_vm_event(THP_SWPOUT_FALLBACK);
1151 /* XXX: We don't support THP writes */ 1159#endif
1152 if (PageTransHuge(page) && 1160 if (!add_to_swap(page))
1153 split_huge_page_to_list(page, page_list)) { 1161 goto activate_locked;
1154 delete_from_swap_cache(page); 1162 }
1155 goto activate_locked;
1156 }
1157 1163
1158 may_enter_fs = 1; 1164 may_enter_fs = 1;
1159 1165
1160 /* Adding to swap updated mapping */ 1166 /* Adding to swap updated mapping */
1161 mapping = page_mapping(page); 1167 mapping = page_mapping(page);
1168 }
1162 } else if (unlikely(PageTransHuge(page))) { 1169 } else if (unlikely(PageTransHuge(page))) {
1163 /* Split file THP */ 1170 /* Split file THP */
1164 if (split_huge_page_to_list(page, page_list)) 1171 if (split_huge_page_to_list(page, page_list))
1165 goto keep_locked; 1172 goto keep_locked;
1166 } 1173 }
1167 1174
1168 VM_BUG_ON_PAGE(PageTransHuge(page), page);
1169
1170 /* 1175 /*
1171 * The page is mapped into the page tables of one or more 1176 * The page is mapped into the page tables of one or more
1172 * processes. Try to unmap it here. 1177 * processes. Try to unmap it here.
1173 */ 1178 */
1174 if (page_mapped(page)) { 1179 if (page_mapped(page)) {
1175 if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { 1180 enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1181
1182 if (unlikely(PageTransHuge(page)))
1183 flags |= TTU_SPLIT_HUGE_PMD;
1184 if (!try_to_unmap(page, flags)) {
1176 nr_unmap_fail++; 1185 nr_unmap_fail++;
1177 goto activate_locked; 1186 goto activate_locked;
1178 } 1187 }
@@ -1312,7 +1321,11 @@ free_it:
1312 * Is there need to periodically free_page_list? It would 1321 * Is there need to periodically free_page_list? It would
1313 * appear not as the counts should be low 1322 * appear not as the counts should be low
1314 */ 1323 */
1315 list_add(&page->lru, &free_pages); 1324 if (unlikely(PageTransHuge(page))) {
1325 mem_cgroup_uncharge(page);
1326 (*get_compound_page_dtor(page))(page);
1327 } else
1328 list_add(&page->lru, &free_pages);
1316 continue; 1329 continue;
1317 1330
1318activate_locked: 1331activate_locked:
@@ -1742,9 +1755,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1742 int file = is_file_lru(lru); 1755 int file = is_file_lru(lru);
1743 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1756 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1744 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1757 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1758 bool stalled = false;
1745 1759
1746 while (unlikely(too_many_isolated(pgdat, file, sc))) { 1760 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1747 congestion_wait(BLK_RW_ASYNC, HZ/10); 1761 if (stalled)
1762 return 0;
1763
1764 /* wait a bit for the reclaimer. */
1765 msleep(100);
1766 stalled = true;
1748 1767
1749 /* We are about to die and free our memory. Return now. */ 1768 /* We are about to die and free our memory. Return now. */
1750 if (fatal_signal_pending(current)) 1769 if (fatal_signal_pending(current))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441bbeef2..c7e4b8458023 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
870{ 870{
871 unsigned long requested = 1UL << order; 871 unsigned long requested = 1UL << order;
872 872
873 if (WARN_ON_ONCE(order >= MAX_ORDER))
874 return 0;
875
873 if (!info->free_blocks_total) 876 if (!info->free_blocks_total)
874 return 0; 877 return 0;
875 878
@@ -1071,6 +1074,8 @@ const char * const vmstat_text[] = {
1071#endif 1074#endif
1072 "thp_zero_page_alloc", 1075 "thp_zero_page_alloc",
1073 "thp_zero_page_alloc_failed", 1076 "thp_zero_page_alloc_failed",
1077 "thp_swpout",
1078 "thp_swpout_fallback",
1074#endif 1079#endif
1075#ifdef CONFIG_MEMORY_BALLOON 1080#ifdef CONFIG_MEMORY_BALLOON
1076 "balloon_inflate", 1081 "balloon_inflate",
@@ -1093,6 +1098,10 @@ const char * const vmstat_text[] = {
1093 "vmacache_find_hits", 1098 "vmacache_find_hits",
1094 "vmacache_full_flushes", 1099 "vmacache_full_flushes",
1095#endif 1100#endif
1101#ifdef CONFIG_SWAP
1102 "swap_ra",
1103 "swap_ra_hit",
1104#endif
1096#endif /* CONFIG_VM_EVENTS_COUNTERS */ 1105#endif /* CONFIG_VM_EVENTS_COUNTERS */
1097}; 1106};
1098#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ 1107#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
@@ -1250,7 +1259,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1250 seq_putc(m, '\n'); 1259 seq_putc(m, '\n');
1251} 1260}
1252 1261
1253/* Print out the free pages at each order for each migratetype */ 1262/* Print out the number of pageblocks for each migratetype */
1254static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) 1263static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1255{ 1264{
1256 int mtype; 1265 int mtype;
@@ -1500,7 +1509,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
1500 if (!v) 1509 if (!v)
1501 return ERR_PTR(-ENOMEM); 1510 return ERR_PTR(-ENOMEM);
1502 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1511 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1503 v[i] = global_page_state(i); 1512 v[i] = global_zone_page_state(i);
1504 v += NR_VM_ZONE_STAT_ITEMS; 1513 v += NR_VM_ZONE_STAT_ITEMS;
1505 1514
1506 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 1515 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
@@ -1589,7 +1598,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
1589 * which can equally be echo'ed to or cat'ted from (by root), 1598 * which can equally be echo'ed to or cat'ted from (by root),
1590 * can be used to update the stats just before reading them. 1599 * can be used to update the stats just before reading them.
1591 * 1600 *
1592 * Oh, and since global_page_state() etc. are so careful to hide 1601 * Oh, and since global_zone_page_state() etc. are so careful to hide
1593 * transiently negative values, report an error here if any of 1602 * transiently negative values, report an error here if any of
1594 * the stats is negative, so we know to go looking for imbalance. 1603 * the stats is negative, so we know to go looking for imbalance.
1595 */ 1604 */
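
The vm_event names added above ("thp_swpout", "thp_swpout_fallback", "swap_ra", "swap_ra_hit") become visible in /proc/vmstat. A small self-contained sketch that reads them back; the counter names are an assumption that only holds on a kernel carrying these patches with CONFIG_TRANSPARENT_HUGEPAGE and CONFIG_SWAP enabled:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* counters introduced by this series; absent on older kernels */
	const char *keys[] = { "thp_swpout", "thp_swpout_fallback",
			       "swap_ra", "swap_ra_hit" };
	char name[64];
	unsigned long long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	/* each line of /proc/vmstat is "<name> <value>" */
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (!strcmp(name, keys[i]))
				printf("%-20s %llu\n", name, val);
	}
	fclose(f);
	return 0;
}
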
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 54f63c4a809a..486550df32be 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -23,10 +23,13 @@
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24 24
25#include <linux/atomic.h> 25#include <linux/atomic.h>
26#include <linux/sched.h>
26#include <linux/list.h> 27#include <linux/list.h>
27#include <linux/mm.h> 28#include <linux/mm.h>
28#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/percpu.h>
29#include <linux/preempt.h> 31#include <linux/preempt.h>
32#include <linux/workqueue.h>
30#include <linux/slab.h> 33#include <linux/slab.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/zpool.h> 35#include <linux/zpool.h>
@@ -48,11 +51,15 @@ enum buddy {
48}; 51};
49 52
50/* 53/*
51 * struct z3fold_header - z3fold page metadata occupying the first chunk of each 54 * struct z3fold_header - z3fold page metadata occupying first chunks of each
52 * z3fold page, except for HEADLESS pages 55 * z3fold page, except for HEADLESS pages
53 * @buddy: links the z3fold page into the relevant list in the pool 56 * @buddy: links the z3fold page into the relevant list in the
57 * pool
54 * @page_lock: per-page lock 58 * @page_lock: per-page lock
55 * @refcount: reference cound for the z3fold page 59 * @refcount: reference count for the z3fold page
60 * @work: work_struct for page layout optimization
61 * @pool: pointer to the pool which this page belongs to
62 * @cpu: CPU which this page "belongs" to
56 * @first_chunks: the size of the first buddy in chunks, 0 if free 63 * @first_chunks: the size of the first buddy in chunks, 0 if free
57 * @middle_chunks: the size of the middle buddy in chunks, 0 if free 64 * @middle_chunks: the size of the middle buddy in chunks, 0 if free
58 * @last_chunks: the size of the last buddy in chunks, 0 if free 65 * @last_chunks: the size of the last buddy in chunks, 0 if free
@@ -62,6 +69,9 @@ struct z3fold_header {
62 struct list_head buddy; 69 struct list_head buddy;
63 spinlock_t page_lock; 70 spinlock_t page_lock;
64 struct kref refcount; 71 struct kref refcount;
72 struct work_struct work;
73 struct z3fold_pool *pool;
74 short cpu;
65 unsigned short first_chunks; 75 unsigned short first_chunks;
66 unsigned short middle_chunks; 76 unsigned short middle_chunks;
67 unsigned short last_chunks; 77 unsigned short last_chunks;
@@ -92,28 +102,39 @@ struct z3fold_header {
92 102
93/** 103/**
94 * struct z3fold_pool - stores metadata for each z3fold pool 104 * struct z3fold_pool - stores metadata for each z3fold pool
95 * @lock: protects all pool fields and first|last_chunk fields of any 105 * @name: pool name
96 * z3fold page in the pool 106 * @lock: protects pool unbuddied/lru lists
97 * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; 107 * @stale_lock: protects pool stale page list
98 * the lists each z3fold page is added to depends on the size of 108 * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
99 * its free region. 109 * buddies; the list each z3fold page is added to depends on
110 * the size of its free region.
100 * @lru: list tracking the z3fold pages in LRU order by most recently 111 * @lru: list tracking the z3fold pages in LRU order by most recently
101 * added buddy. 112 * added buddy.
113 * @stale: list of pages marked for freeing
102 * @pages_nr: number of z3fold pages in the pool. 114 * @pages_nr: number of z3fold pages in the pool.
103 * @ops: pointer to a structure of user defined operations specified at 115 * @ops: pointer to a structure of user defined operations specified at
104 * pool creation time. 116 * pool creation time.
117 * @compact_wq: workqueue for page layout background optimization
118 * @release_wq: workqueue for safe page release
119 * @work: work_struct for safe page release
105 * 120 *
106 * This structure is allocated at pool creation time and maintains metadata 121 * This structure is allocated at pool creation time and maintains metadata
107 * pertaining to a particular z3fold pool. 122 * pertaining to a particular z3fold pool.
108 */ 123 */
109struct z3fold_pool { 124struct z3fold_pool {
125 const char *name;
110 spinlock_t lock; 126 spinlock_t lock;
111 struct list_head unbuddied[NCHUNKS]; 127 spinlock_t stale_lock;
128 struct list_head *unbuddied;
112 struct list_head lru; 129 struct list_head lru;
130 struct list_head stale;
113 atomic64_t pages_nr; 131 atomic64_t pages_nr;
114 const struct z3fold_ops *ops; 132 const struct z3fold_ops *ops;
115 struct zpool *zpool; 133 struct zpool *zpool;
116 const struct zpool_ops *zpool_ops; 134 const struct zpool_ops *zpool_ops;
135 struct workqueue_struct *compact_wq;
136 struct workqueue_struct *release_wq;
137 struct work_struct work;
117}; 138};
118 139
119/* 140/*
@@ -122,9 +143,10 @@ struct z3fold_pool {
122enum z3fold_page_flags { 143enum z3fold_page_flags {
123 PAGE_HEADLESS = 0, 144 PAGE_HEADLESS = 0,
124 MIDDLE_CHUNK_MAPPED, 145 MIDDLE_CHUNK_MAPPED,
146 NEEDS_COMPACTING,
147 PAGE_STALE
125}; 148};
126 149
127
128/***************** 150/*****************
129 * Helpers 151 * Helpers
130*****************/ 152*****************/
@@ -138,14 +160,19 @@ static int size_to_chunks(size_t size)
138#define for_each_unbuddied_list(_iter, _begin) \ 160#define for_each_unbuddied_list(_iter, _begin) \
139 for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) 161 for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
140 162
163static void compact_page_work(struct work_struct *w);
164
141/* Initializes the z3fold header of a newly allocated z3fold page */ 165/* Initializes the z3fold header of a newly allocated z3fold page */
142static struct z3fold_header *init_z3fold_page(struct page *page) 166static struct z3fold_header *init_z3fold_page(struct page *page,
167 struct z3fold_pool *pool)
143{ 168{
144 struct z3fold_header *zhdr = page_address(page); 169 struct z3fold_header *zhdr = page_address(page);
145 170
146 INIT_LIST_HEAD(&page->lru); 171 INIT_LIST_HEAD(&page->lru);
147 clear_bit(PAGE_HEADLESS, &page->private); 172 clear_bit(PAGE_HEADLESS, &page->private);
148 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); 173 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
174 clear_bit(NEEDS_COMPACTING, &page->private);
175 clear_bit(PAGE_STALE, &page->private);
149 176
150 spin_lock_init(&zhdr->page_lock); 177 spin_lock_init(&zhdr->page_lock);
151 kref_init(&zhdr->refcount); 178 kref_init(&zhdr->refcount);
@@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
154 zhdr->last_chunks = 0; 181 zhdr->last_chunks = 0;
155 zhdr->first_num = 0; 182 zhdr->first_num = 0;
156 zhdr->start_middle = 0; 183 zhdr->start_middle = 0;
184 zhdr->cpu = -1;
185 zhdr->pool = pool;
157 INIT_LIST_HEAD(&zhdr->buddy); 186 INIT_LIST_HEAD(&zhdr->buddy);
187 INIT_WORK(&zhdr->work, compact_page_work);
158 return zhdr; 188 return zhdr;
159} 189}
160 190
@@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page)
164 __free_page(page); 194 __free_page(page);
165} 195}
166 196
167static void release_z3fold_page(struct kref *ref)
168{
169 struct z3fold_header *zhdr;
170 struct page *page;
171
172 zhdr = container_of(ref, struct z3fold_header, refcount);
173 page = virt_to_page(zhdr);
174
175 if (!list_empty(&zhdr->buddy))
176 list_del(&zhdr->buddy);
177 if (!list_empty(&page->lru))
178 list_del(&page->lru);
179 free_z3fold_page(page);
180}
181
182/* Lock a z3fold page */ 197/* Lock a z3fold page */
183static inline void z3fold_page_lock(struct z3fold_header *zhdr) 198static inline void z3fold_page_lock(struct z3fold_header *zhdr)
184{ 199{
@@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle)
228 return (handle - zhdr->first_num) & BUDDY_MASK; 243 return (handle - zhdr->first_num) & BUDDY_MASK;
229} 244}
230 245
246static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
247{
248 struct page *page = virt_to_page(zhdr);
249 struct z3fold_pool *pool = zhdr->pool;
250
251 WARN_ON(!list_empty(&zhdr->buddy));
252 set_bit(PAGE_STALE, &page->private);
253 spin_lock(&pool->lock);
254 if (!list_empty(&page->lru))
255 list_del(&page->lru);
256 spin_unlock(&pool->lock);
257 if (locked)
258 z3fold_page_unlock(zhdr);
259 spin_lock(&pool->stale_lock);
260 list_add(&zhdr->buddy, &pool->stale);
261 queue_work(pool->release_wq, &pool->work);
262 spin_unlock(&pool->stale_lock);
263}
264
265static void __attribute__((__unused__))
266 release_z3fold_page(struct kref *ref)
267{
268 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
269 refcount);
270 __release_z3fold_page(zhdr, false);
271}
272
273static void release_z3fold_page_locked(struct kref *ref)
274{
275 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
276 refcount);
277 WARN_ON(z3fold_page_trylock(zhdr));
278 __release_z3fold_page(zhdr, true);
279}
280
281static void release_z3fold_page_locked_list(struct kref *ref)
282{
283 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
284 refcount);
285 spin_lock(&zhdr->pool->lock);
286 list_del_init(&zhdr->buddy);
287 spin_unlock(&zhdr->pool->lock);
288
289 WARN_ON(z3fold_page_trylock(zhdr));
290 __release_z3fold_page(zhdr, true);
291}
292
293static void free_pages_work(struct work_struct *w)
294{
295 struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
296
297 spin_lock(&pool->stale_lock);
298 while (!list_empty(&pool->stale)) {
299 struct z3fold_header *zhdr = list_first_entry(&pool->stale,
300 struct z3fold_header, buddy);
301 struct page *page = virt_to_page(zhdr);
302
303 list_del(&zhdr->buddy);
304 if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
305 continue;
306 clear_bit(NEEDS_COMPACTING, &page->private);
307 spin_unlock(&pool->stale_lock);
308 cancel_work_sync(&zhdr->work);
309 free_z3fold_page(page);
310 cond_resched();
311 spin_lock(&pool->stale_lock);
312 }
313 spin_unlock(&pool->stale_lock);
314}
315
231/* 316/*
232 * Returns the number of free chunks in a z3fold page. 317 * Returns the number of free chunks in a z3fold page.
233 * NB: can't be used with HEADLESS pages. 318 * NB: can't be used with HEADLESS pages.
@@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr)
252 return nfree; 337 return nfree;
253} 338}
254 339
255/*****************
256 * API Functions
257*****************/
258/**
259 * z3fold_create_pool() - create a new z3fold pool
260 * @gfp: gfp flags when allocating the z3fold pool structure
261 * @ops: user-defined operations for the z3fold pool
262 *
263 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
264 * failed.
265 */
266static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
267 const struct z3fold_ops *ops)
268{
269 struct z3fold_pool *pool;
270 int i;
271
272 pool = kzalloc(sizeof(struct z3fold_pool), gfp);
273 if (!pool)
274 return NULL;
275 spin_lock_init(&pool->lock);
276 for_each_unbuddied_list(i, 0)
277 INIT_LIST_HEAD(&pool->unbuddied[i]);
278 INIT_LIST_HEAD(&pool->lru);
279 atomic64_set(&pool->pages_nr, 0);
280 pool->ops = ops;
281 return pool;
282}
283
284/**
285 * z3fold_destroy_pool() - destroys an existing z3fold pool
286 * @pool: the z3fold pool to be destroyed
287 *
288 * The pool should be emptied before this function is called.
289 */
290static void z3fold_destroy_pool(struct z3fold_pool *pool)
291{
292 kfree(pool);
293}
294
295static inline void *mchunk_memmove(struct z3fold_header *zhdr, 340static inline void *mchunk_memmove(struct z3fold_header *zhdr,
296 unsigned short dst_chunk) 341 unsigned short dst_chunk)
297{ 342{
@@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
347 return 0; 392 return 0;
348} 393}
349 394
395static void do_compact_page(struct z3fold_header *zhdr, bool locked)
396{
397 struct z3fold_pool *pool = zhdr->pool;
398 struct page *page;
399 struct list_head *unbuddied;
400 int fchunks;
401
402 page = virt_to_page(zhdr);
403 if (locked)
404 WARN_ON(z3fold_page_trylock(zhdr));
405 else
406 z3fold_page_lock(zhdr);
407 if (test_bit(PAGE_STALE, &page->private) ||
408 !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) {
409 z3fold_page_unlock(zhdr);
410 return;
411 }
412 spin_lock(&pool->lock);
413 list_del_init(&zhdr->buddy);
414 spin_unlock(&pool->lock);
415
416 z3fold_compact_page(zhdr);
417 unbuddied = get_cpu_ptr(pool->unbuddied);
418 fchunks = num_free_chunks(zhdr);
419 if (fchunks < NCHUNKS &&
420 (!zhdr->first_chunks || !zhdr->middle_chunks ||
421 !zhdr->last_chunks)) {
422 /* the page's not completely free and it's unbuddied */
423 spin_lock(&pool->lock);
424 list_add(&zhdr->buddy, &unbuddied[fchunks]);
425 spin_unlock(&pool->lock);
426 zhdr->cpu = smp_processor_id();
427 }
428 put_cpu_ptr(pool->unbuddied);
429 z3fold_page_unlock(zhdr);
430}
431
432static void compact_page_work(struct work_struct *w)
433{
434 struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
435 work);
436
437 do_compact_page(zhdr, false);
438}
439
440
441/*
442 * API Functions
443 */
444
445/**
446 * z3fold_create_pool() - create a new z3fold pool
447 * @name: pool name
448 * @gfp: gfp flags when allocating the z3fold pool structure
449 * @ops: user-defined operations for the z3fold pool
450 *
451 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
452 * failed.
453 */
454static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
455 const struct z3fold_ops *ops)
456{
457 struct z3fold_pool *pool = NULL;
458 int i, cpu;
459
460 pool = kzalloc(sizeof(struct z3fold_pool), gfp);
461 if (!pool)
462 goto out;
463 spin_lock_init(&pool->lock);
464 spin_lock_init(&pool->stale_lock);
465 pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
466 for_each_possible_cpu(cpu) {
467 struct list_head *unbuddied =
468 per_cpu_ptr(pool->unbuddied, cpu);
469 for_each_unbuddied_list(i, 0)
470 INIT_LIST_HEAD(&unbuddied[i]);
471 }
472 INIT_LIST_HEAD(&pool->lru);
473 INIT_LIST_HEAD(&pool->stale);
474 atomic64_set(&pool->pages_nr, 0);
475 pool->name = name;
476 pool->compact_wq = create_singlethread_workqueue(pool->name);
477 if (!pool->compact_wq)
478 goto out;
479 pool->release_wq = create_singlethread_workqueue(pool->name);
480 if (!pool->release_wq)
481 goto out_wq;
482 INIT_WORK(&pool->work, free_pages_work);
483 pool->ops = ops;
484 return pool;
485
486out_wq:
487 destroy_workqueue(pool->compact_wq);
488out:
489 kfree(pool);
490 return NULL;
491}
492
493/**
494 * z3fold_destroy_pool() - destroys an existing z3fold pool
495 * @pool: the z3fold pool to be destroyed
496 *
497 * The pool should be emptied before this function is called.
498 */
499static void z3fold_destroy_pool(struct z3fold_pool *pool)
500{
501 destroy_workqueue(pool->release_wq);
502 destroy_workqueue(pool->compact_wq);
503 kfree(pool);
504}
505
350/** 506/**
351 * z3fold_alloc() - allocates a region of a given size 507 * z3fold_alloc() - allocates a region of a given size
352 * @pool: z3fold pool from which to allocate 508 * @pool: z3fold pool from which to allocate
@@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
371{ 527{
372 int chunks = 0, i, freechunks; 528 int chunks = 0, i, freechunks;
373 struct z3fold_header *zhdr = NULL; 529 struct z3fold_header *zhdr = NULL;
530 struct page *page = NULL;
374 enum buddy bud; 531 enum buddy bud;
375 struct page *page; 532 bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM;
376 533
377 if (!size || (gfp & __GFP_HIGHMEM)) 534 if (!size || (gfp & __GFP_HIGHMEM))
378 return -EINVAL; 535 return -EINVAL;
@@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
383 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) 540 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
384 bud = HEADLESS; 541 bud = HEADLESS;
385 else { 542 else {
543 struct list_head *unbuddied;
386 chunks = size_to_chunks(size); 544 chunks = size_to_chunks(size);
387 545
546lookup:
388 /* First, try to find an unbuddied z3fold page. */ 547 /* First, try to find an unbuddied z3fold page. */
389 zhdr = NULL; 548 unbuddied = get_cpu_ptr(pool->unbuddied);
390 for_each_unbuddied_list(i, chunks) { 549 for_each_unbuddied_list(i, chunks) {
391 spin_lock(&pool->lock); 550 struct list_head *l = &unbuddied[i];
392 zhdr = list_first_entry_or_null(&pool->unbuddied[i], 551
552 zhdr = list_first_entry_or_null(READ_ONCE(l),
393 struct z3fold_header, buddy); 553 struct z3fold_header, buddy);
394 if (!zhdr || !z3fold_page_trylock(zhdr)) { 554
395 spin_unlock(&pool->lock); 555 if (!zhdr)
396 continue; 556 continue;
557
558 /* Re-check under lock. */
559 spin_lock(&pool->lock);
560 l = &unbuddied[i];
561 if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
562 struct z3fold_header, buddy)) ||
563 !z3fold_page_trylock(zhdr)) {
564 spin_unlock(&pool->lock);
565 put_cpu_ptr(pool->unbuddied);
566 goto lookup;
397 } 567 }
398 kref_get(&zhdr->refcount);
399 list_del_init(&zhdr->buddy); 568 list_del_init(&zhdr->buddy);
569 zhdr->cpu = -1;
400 spin_unlock(&pool->lock); 570 spin_unlock(&pool->lock);
401 571
402 page = virt_to_page(zhdr); 572 page = virt_to_page(zhdr);
573 if (test_bit(NEEDS_COMPACTING, &page->private)) {
574 z3fold_page_unlock(zhdr);
575 zhdr = NULL;
576 put_cpu_ptr(pool->unbuddied);
577 if (can_sleep)
578 cond_resched();
579 goto lookup;
580 }
581
582 /*
583 * this page could not be removed from its unbuddied
584 * list while pool lock was held, and then we've taken
585 * page lock so kref_put could not be called before
586 * we got here, so it's safe to just call kref_get()
587 */
588 kref_get(&zhdr->refcount);
589 break;
590 }
591 put_cpu_ptr(pool->unbuddied);
592
593 if (zhdr) {
403 if (zhdr->first_chunks == 0) { 594 if (zhdr->first_chunks == 0) {
404 if (zhdr->middle_chunks != 0 && 595 if (zhdr->middle_chunks != 0 &&
405 chunks >= zhdr->start_middle) 596 chunks >= zhdr->start_middle)
@@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
411 else if (zhdr->middle_chunks == 0) 602 else if (zhdr->middle_chunks == 0)
412 bud = MIDDLE; 603 bud = MIDDLE;
413 else { 604 else {
414 z3fold_page_unlock(zhdr);
415 spin_lock(&pool->lock);
416 if (kref_put(&zhdr->refcount, 605 if (kref_put(&zhdr->refcount,
417 release_z3fold_page)) 606 release_z3fold_page_locked))
418 atomic64_dec(&pool->pages_nr); 607 atomic64_dec(&pool->pages_nr);
419 spin_unlock(&pool->lock); 608 else
609 z3fold_page_unlock(zhdr);
420 pr_err("No free chunks in unbuddied\n"); 610 pr_err("No free chunks in unbuddied\n");
421 WARN_ON(1); 611 WARN_ON(1);
422 continue; 612 goto lookup;
423 } 613 }
424 goto found; 614 goto found;
425 } 615 }
426 bud = FIRST; 616 bud = FIRST;
427 } 617 }
428 618
429 /* Couldn't find unbuddied z3fold page, create new one */ 619 spin_lock(&pool->stale_lock);
430 page = alloc_page(gfp); 620 zhdr = list_first_entry_or_null(&pool->stale,
621 struct z3fold_header, buddy);
622 /*
623 * Before allocating a page, let's see if we can take one from the
624 * stale pages list. cancel_work_sync() can sleep so we must make
625 * sure it won't be called in case we're in atomic context.
626 */
627 if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
628 !unlikely(work_busy(&zhdr->work)))) {
629 list_del(&zhdr->buddy);
630 clear_bit(NEEDS_COMPACTING, &page->private);
631 spin_unlock(&pool->stale_lock);
632 if (can_sleep)
633 cancel_work_sync(&zhdr->work);
634 page = virt_to_page(zhdr);
635 } else {
636 spin_unlock(&pool->stale_lock);
637 page = alloc_page(gfp);
638 }
639
431 if (!page) 640 if (!page)
432 return -ENOMEM; 641 return -ENOMEM;
433 642
434 atomic64_inc(&pool->pages_nr); 643 atomic64_inc(&pool->pages_nr);
435 zhdr = init_z3fold_page(page); 644 zhdr = init_z3fold_page(page, pool);
436 645
437 if (bud == HEADLESS) { 646 if (bud == HEADLESS) {
438 set_bit(PAGE_HEADLESS, &page->private); 647 set_bit(PAGE_HEADLESS, &page->private);
439 spin_lock(&pool->lock);
440 goto headless; 648 goto headless;
441 } 649 }
442 z3fold_page_lock(zhdr); 650 z3fold_page_lock(zhdr);
@@ -451,15 +659,21 @@ found:
451 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; 659 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
452 } 660 }
453 661
454 spin_lock(&pool->lock);
455 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || 662 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
456 zhdr->middle_chunks == 0) { 663 zhdr->middle_chunks == 0) {
664 struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
665
457 /* Add to unbuddied list */ 666 /* Add to unbuddied list */
458 freechunks = num_free_chunks(zhdr); 667 freechunks = num_free_chunks(zhdr);
459 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); 668 spin_lock(&pool->lock);
669 list_add(&zhdr->buddy, &unbuddied[freechunks]);
670 spin_unlock(&pool->lock);
671 zhdr->cpu = smp_processor_id();
672 put_cpu_ptr(pool->unbuddied);
460 } 673 }
461 674
462headless: 675headless:
676 spin_lock(&pool->lock);
463 /* Add/move z3fold page to beginning of LRU */ 677 /* Add/move z3fold page to beginning of LRU */
464 if (!list_empty(&page->lru)) 678 if (!list_empty(&page->lru))
465 list_del(&page->lru); 679 list_del(&page->lru);
@@ -487,7 +701,6 @@ headless:
487static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) 701static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
488{ 702{
489 struct z3fold_header *zhdr; 703 struct z3fold_header *zhdr;
490 int freechunks;
491 struct page *page; 704 struct page *page;
492 enum buddy bud; 705 enum buddy bud;
493 706
@@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
526 spin_unlock(&pool->lock); 739 spin_unlock(&pool->lock);
527 free_z3fold_page(page); 740 free_z3fold_page(page);
528 atomic64_dec(&pool->pages_nr); 741 atomic64_dec(&pool->pages_nr);
529 } else { 742 return;
530 if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || 743 }
531 zhdr->last_chunks != 0) { 744
532 z3fold_compact_page(zhdr); 745 if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
533 /* Add to the unbuddied list */ 746 atomic64_dec(&pool->pages_nr);
534 spin_lock(&pool->lock); 747 return;
535 if (!list_empty(&zhdr->buddy)) 748 }
536 list_del(&zhdr->buddy); 749 if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
537 freechunks = num_free_chunks(zhdr);
538 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
539 spin_unlock(&pool->lock);
540 }
541 z3fold_page_unlock(zhdr); 750 z3fold_page_unlock(zhdr);
751 return;
752 }
753 if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
542 spin_lock(&pool->lock); 754 spin_lock(&pool->lock);
543 if (kref_put(&zhdr->refcount, release_z3fold_page)) 755 list_del_init(&zhdr->buddy);
544 atomic64_dec(&pool->pages_nr);
545 spin_unlock(&pool->lock); 756 spin_unlock(&pool->lock);
757 zhdr->cpu = -1;
758 do_compact_page(zhdr, true);
759 return;
546 } 760 }
547 761 queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
762 z3fold_page_unlock(zhdr);
548} 763}
549 764
550/** 765/**
@@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
585 */ 800 */
586static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) 801static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
587{ 802{
588 int i, ret = 0, freechunks; 803 int i, ret = 0;
589 struct z3fold_header *zhdr; 804 struct z3fold_header *zhdr = NULL;
590 struct page *page; 805 struct page *page = NULL;
806 struct list_head *pos;
591 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; 807 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
592 808
593 spin_lock(&pool->lock); 809 spin_lock(&pool->lock);
@@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
600 spin_unlock(&pool->lock); 816 spin_unlock(&pool->lock);
601 return -EINVAL; 817 return -EINVAL;
602 } 818 }
603 page = list_last_entry(&pool->lru, struct page, lru); 819 list_for_each_prev(pos, &pool->lru) {
820 page = list_entry(pos, struct page, lru);
821 if (test_bit(PAGE_HEADLESS, &page->private))
822 /* candidate found */
823 break;
824
825 zhdr = page_address(page);
826 if (!z3fold_page_trylock(zhdr))
827 continue; /* can't evict at this point */
828 kref_get(&zhdr->refcount);
829 list_del_init(&zhdr->buddy);
830 zhdr->cpu = -1;
831 }
832
604 list_del_init(&page->lru); 833 list_del_init(&page->lru);
834 spin_unlock(&pool->lock);
605 835
606 zhdr = page_address(page);
607 if (!test_bit(PAGE_HEADLESS, &page->private)) { 836 if (!test_bit(PAGE_HEADLESS, &page->private)) {
608 if (!list_empty(&zhdr->buddy))
609 list_del_init(&zhdr->buddy);
610 kref_get(&zhdr->refcount);
611 spin_unlock(&pool->lock);
612 z3fold_page_lock(zhdr);
613 /* 837 /*
614 * We need encode the handles before unlocking, since 838 * We need encode the handles before unlocking, since
615 * we can race with free that will set 839 * we can race with free that will set
@@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
624 middle_handle = encode_handle(zhdr, MIDDLE); 848 middle_handle = encode_handle(zhdr, MIDDLE);
625 if (zhdr->last_chunks) 849 if (zhdr->last_chunks)
626 last_handle = encode_handle(zhdr, LAST); 850 last_handle = encode_handle(zhdr, LAST);
851 /*
852 * it's safe to unlock here because we hold a
853 * reference to this page
854 */
627 z3fold_page_unlock(zhdr); 855 z3fold_page_unlock(zhdr);
628 } else { 856 } else {
629 first_handle = encode_handle(zhdr, HEADLESS); 857 first_handle = encode_handle(zhdr, HEADLESS);
630 last_handle = middle_handle = 0; 858 last_handle = middle_handle = 0;
631 spin_unlock(&pool->lock);
632 } 859 }
633 860
634 /* Issue the eviction callback(s) */ 861 /* Issue the eviction callback(s) */
@@ -652,31 +879,12 @@ next:
652 if (ret == 0) { 879 if (ret == 0) {
653 free_z3fold_page(page); 880 free_z3fold_page(page);
654 return 0; 881 return 0;
655 } else {
656 spin_lock(&pool->lock);
657 }
658 } else {
659 z3fold_page_lock(zhdr);
660 if ((zhdr->first_chunks || zhdr->last_chunks ||
661 zhdr->middle_chunks) &&
662 !(zhdr->first_chunks && zhdr->last_chunks &&
663 zhdr->middle_chunks)) {
664 z3fold_compact_page(zhdr);
665 /* add to unbuddied list */
666 spin_lock(&pool->lock);
667 freechunks = num_free_chunks(zhdr);
668 list_add(&zhdr->buddy,
669 &pool->unbuddied[freechunks]);
670 spin_unlock(&pool->lock);
671 }
672 z3fold_page_unlock(zhdr);
673 spin_lock(&pool->lock);
674 if (kref_put(&zhdr->refcount, release_z3fold_page)) {
675 spin_unlock(&pool->lock);
676 atomic64_dec(&pool->pages_nr);
677 return 0;
678 } 882 }
883 } else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
884 atomic64_dec(&pool->pages_nr);
885 return 0;
679 } 886 }
887 spin_lock(&pool->lock);
680 888
681 /* 889 /*
682 * Add to the beginning of LRU. 890 * Add to the beginning of LRU.
@@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp,
795{ 1003{
796 struct z3fold_pool *pool; 1004 struct z3fold_pool *pool;
797 1005
798 pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL); 1006 pool = z3fold_create_pool(name, gfp,
1007 zpool_ops ? &z3fold_zpool_ops : NULL);
799 if (pool) { 1008 if (pool) {
800 pool->zpool = zpool; 1009 pool->zpool = zpool;
801 pool->zpool_ops = zpool_ops; 1010 pool->zpool_ops = zpool_ops;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 308acb9d814b..62457eb82330 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
1983 1983
1984 spin_lock(&class->lock); 1984 spin_lock(&class->lock);
1985 if (!get_zspage_inuse(zspage)) { 1985 if (!get_zspage_inuse(zspage)) {
1986 ret = -EBUSY; 1986 /*
1987                 goto unlock_class;        1987                 * Set "offset" to end of the page so that every loop
1988 * skips unnecessary object scanning.
1989 */
1990 offset = PAGE_SIZE;
1988 } 1991 }
1989 1992
1990 pos = offset; 1993 pos = offset;
@@ -2052,7 +2055,6 @@ unpin_objects:
2052 } 2055 }
2053 } 2056 }
2054 kunmap_atomic(s_addr); 2057 kunmap_atomic(s_addr);
2055unlock_class:
2056 spin_unlock(&class->lock); 2058 spin_unlock(&class->lock);
2057 migrate_write_unlock(zspage); 2059 migrate_write_unlock(zspage);
2058 2060
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 48397feb08fb..b920d186ad4a 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -261,7 +261,17 @@ static enum export export_no(const char *s)
261 return export_unknown; 261 return export_unknown;
262} 262}
263 263
264static const char *sec_name(struct elf_info *elf, int secindex); 264static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr)
265{
266 return (void *)elf->hdr +
267 elf->sechdrs[elf->secindex_strings].sh_offset +
268 sechdr->sh_name;
269}
270
271static const char *sec_name(struct elf_info *elf, int secindex)
272{
273 return sech_name(elf, &elf->sechdrs[secindex]);
274}
265 275
266#define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) 276#define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
267 277
@@ -775,21 +785,6 @@ static const char *sym_name(struct elf_info *elf, Elf_Sym *sym)
775 return "(unknown)"; 785 return "(unknown)";
776} 786}
777 787
778static const char *sec_name(struct elf_info *elf, int secindex)
779{
780 Elf_Shdr *sechdrs = elf->sechdrs;
781 return (void *)elf->hdr +
782 elf->sechdrs[elf->secindex_strings].sh_offset +
783 sechdrs[secindex].sh_name;
784}
785
786static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr)
787{
788 return (void *)elf->hdr +
789 elf->sechdrs[elf->secindex_strings].sh_offset +
790 sechdr->sh_name;
791}
792
793/* The pattern is an array of simple patterns. 788/* The pattern is an array of simple patterns.
794 * "foo" will match an exact string equal to "foo" 789 * "foo" will match an exact string equal to "foo"
795 * "*foo" will match a string that ends with "foo" 790 * "*foo" will match a string that ends with "foo"
diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile
index ad8a0897e47f..bc9d02d615da 100644
--- a/tools/testing/selftests/memfd/Makefile
+++ b/tools/testing/selftests/memfd/Makefile
@@ -3,7 +3,7 @@ CFLAGS += -I../../../../include/uapi/
3CFLAGS += -I../../../../include/ 3CFLAGS += -I../../../../include/
4CFLAGS += -I../../../../usr/include/ 4CFLAGS += -I../../../../usr/include/
5 5
6TEST_PROGS := run_fuse_test.sh 6TEST_PROGS := run_tests.sh
7TEST_GEN_FILES := memfd_test fuse_mnt fuse_test 7TEST_GEN_FILES := memfd_test fuse_mnt fuse_test
8 8
9fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) 9fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags)
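
The selftest changes below exercise memfd_create() with the new MFD_HUGETLB flag. As a minimal standalone illustration (not part of the patch; it assumes uapi headers from this series and at least one reserved huge page, e.g. vm.nr_hugepages > 0), creating and mapping a hugetlbfs-backed memfd looks roughly like this:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/memfd.h>        /* MFD_CLOEXEC; MFD_HUGETLB with this series */

#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U     /* fallback for older uapi headers */
#endif

int main(void)
{
	size_t size = 2UL << 20;    /* one 2 MiB huge page; must be a multiple */
	char *p;
	int fd = syscall(__NR_memfd_create, "hugetlb_demo",
			 MFD_CLOEXEC | MFD_HUGETLB);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (ftruncate(fd, size) < 0) {
		perror("ftruncate");
		return 1;
	}
	/* fails with ENOMEM if no huge pages are reserved */
	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(p, "hello from a hugetlbfs-backed memfd");
	printf("%s\n", p);
	munmap(p, size);
	close(fd);
	return 0;
}
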
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 26546892cd54..f94c6d1fb46f 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -18,12 +18,48 @@
18#include <sys/wait.h> 18#include <sys/wait.h>
19#include <unistd.h> 19#include <unistd.h>
20 20
21#define MEMFD_STR "memfd:"
22#define SHARED_FT_STR "(shared file-table)"
23
21#define MFD_DEF_SIZE 8192 24#define MFD_DEF_SIZE 8192
22#define STACK_SIZE 65536 25#define STACK_SIZE 65536
23 26
27/*
28 * Default is not to test hugetlbfs
29 */
30static int hugetlbfs_test;
31static size_t mfd_def_size = MFD_DEF_SIZE;
32
33/*
34 * Copied from mlock2-tests.c
35 */
36static unsigned long default_huge_page_size(void)
37{
38 unsigned long hps = 0;
39 char *line = NULL;
40 size_t linelen = 0;
41 FILE *f = fopen("/proc/meminfo", "r");
42
43 if (!f)
44 return 0;
45 while (getline(&line, &linelen, f) > 0) {
46 if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
47 hps <<= 10;
48 break;
49 }
50 }
51
52 free(line);
53 fclose(f);
54 return hps;
55}
56
24static int sys_memfd_create(const char *name, 57static int sys_memfd_create(const char *name,
25 unsigned int flags) 58 unsigned int flags)
26{ 59{
60 if (hugetlbfs_test)
61 flags |= MFD_HUGETLB;
62
27 return syscall(__NR_memfd_create, name, flags); 63 return syscall(__NR_memfd_create, name, flags);
28} 64}
29 65
@@ -150,7 +186,7 @@ static void *mfd_assert_mmap_shared(int fd)
150 void *p; 186 void *p;
151 187
152 p = mmap(NULL, 188 p = mmap(NULL,
153 MFD_DEF_SIZE, 189 mfd_def_size,
154 PROT_READ | PROT_WRITE, 190 PROT_READ | PROT_WRITE,
155 MAP_SHARED, 191 MAP_SHARED,
156 fd, 192 fd,
@@ -168,7 +204,7 @@ static void *mfd_assert_mmap_private(int fd)
168 void *p; 204 void *p;
169 205
170 p = mmap(NULL, 206 p = mmap(NULL,
171 MFD_DEF_SIZE, 207 mfd_def_size,
172 PROT_READ, 208 PROT_READ,
173 MAP_PRIVATE, 209 MAP_PRIVATE,
174 fd, 210 fd,
@@ -223,7 +259,7 @@ static void mfd_assert_read(int fd)
223 259
224 /* verify PROT_READ *is* allowed */ 260 /* verify PROT_READ *is* allowed */
225 p = mmap(NULL, 261 p = mmap(NULL,
226 MFD_DEF_SIZE, 262 mfd_def_size,
227 PROT_READ, 263 PROT_READ,
228 MAP_PRIVATE, 264 MAP_PRIVATE,
229 fd, 265 fd,
@@ -232,11 +268,11 @@ static void mfd_assert_read(int fd)
232 printf("mmap() failed: %m\n"); 268 printf("mmap() failed: %m\n");
233 abort(); 269 abort();
234 } 270 }
235 munmap(p, MFD_DEF_SIZE); 271 munmap(p, mfd_def_size);
236 272
237 /* verify MAP_PRIVATE is *always* allowed (even writable) */ 273 /* verify MAP_PRIVATE is *always* allowed (even writable) */
238 p = mmap(NULL, 274 p = mmap(NULL,
239 MFD_DEF_SIZE, 275 mfd_def_size,
240 PROT_READ | PROT_WRITE, 276 PROT_READ | PROT_WRITE,
241 MAP_PRIVATE, 277 MAP_PRIVATE,
242 fd, 278 fd,
@@ -245,7 +281,7 @@ static void mfd_assert_read(int fd)
245 printf("mmap() failed: %m\n"); 281 printf("mmap() failed: %m\n");
246 abort(); 282 abort();
247 } 283 }
248 munmap(p, MFD_DEF_SIZE); 284 munmap(p, mfd_def_size);
249} 285}
250 286
251static void mfd_assert_write(int fd) 287static void mfd_assert_write(int fd)
@@ -254,16 +290,22 @@ static void mfd_assert_write(int fd)
254 void *p; 290 void *p;
255 int r; 291 int r;
256 292
257 /* verify write() succeeds */ 293 /*
 258         /* verify write() succeeds */        294          * hugetlbfs does not support write, but we want to
259 if (l != 4) { 295 * verify everything else here.
260 printf("write() failed: %m\n"); 296 */
261 abort(); 297 if (!hugetlbfs_test) {
298 /* verify write() succeeds */
299 l = write(fd, "\0\0\0\0", 4);
300 if (l != 4) {
301 printf("write() failed: %m\n");
302 abort();
303 }
262 } 304 }
263 305
264 /* verify PROT_READ | PROT_WRITE is allowed */ 306 /* verify PROT_READ | PROT_WRITE is allowed */
265 p = mmap(NULL, 307 p = mmap(NULL,
266 MFD_DEF_SIZE, 308 mfd_def_size,
267 PROT_READ | PROT_WRITE, 309 PROT_READ | PROT_WRITE,
268 MAP_SHARED, 310 MAP_SHARED,
269 fd, 311 fd,
@@ -273,11 +315,11 @@ static void mfd_assert_write(int fd)
273 abort(); 315 abort();
274 } 316 }
275 *(char *)p = 0; 317 *(char *)p = 0;
276 munmap(p, MFD_DEF_SIZE); 318 munmap(p, mfd_def_size);
277 319
278 /* verify PROT_WRITE is allowed */ 320 /* verify PROT_WRITE is allowed */
279 p = mmap(NULL, 321 p = mmap(NULL,
280 MFD_DEF_SIZE, 322 mfd_def_size,
281 PROT_WRITE, 323 PROT_WRITE,
282 MAP_SHARED, 324 MAP_SHARED,
283 fd, 325 fd,
@@ -287,12 +329,12 @@ static void mfd_assert_write(int fd)
287 abort(); 329 abort();
288 } 330 }
289 *(char *)p = 0; 331 *(char *)p = 0;
290 munmap(p, MFD_DEF_SIZE); 332 munmap(p, mfd_def_size);
291 333
292 /* verify PROT_READ with MAP_SHARED is allowed and a following 334 /* verify PROT_READ with MAP_SHARED is allowed and a following
293 * mprotect(PROT_WRITE) allows writing */ 335 * mprotect(PROT_WRITE) allows writing */
294 p = mmap(NULL, 336 p = mmap(NULL,
295 MFD_DEF_SIZE, 337 mfd_def_size,
296 PROT_READ, 338 PROT_READ,
297 MAP_SHARED, 339 MAP_SHARED,
298 fd, 340 fd,
@@ -302,20 +344,20 @@ static void mfd_assert_write(int fd)
302 abort(); 344 abort();
303 } 345 }
304 346
305 r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); 347 r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
306 if (r < 0) { 348 if (r < 0) {
307 printf("mprotect() failed: %m\n"); 349 printf("mprotect() failed: %m\n");
308 abort(); 350 abort();
309 } 351 }
310 352
311 *(char *)p = 0; 353 *(char *)p = 0;
312 munmap(p, MFD_DEF_SIZE); 354 munmap(p, mfd_def_size);
313 355
314 /* verify PUNCH_HOLE works */ 356 /* verify PUNCH_HOLE works */
315 r = fallocate(fd, 357 r = fallocate(fd,
316 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 358 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
317 0, 359 0,
318 MFD_DEF_SIZE); 360 mfd_def_size);
319 if (r < 0) { 361 if (r < 0) {
320 printf("fallocate(PUNCH_HOLE) failed: %m\n"); 362 printf("fallocate(PUNCH_HOLE) failed: %m\n");
321 abort(); 363 abort();
@@ -337,7 +379,7 @@ static void mfd_fail_write(int fd)
337 379
338 /* verify PROT_READ | PROT_WRITE is not allowed */ 380 /* verify PROT_READ | PROT_WRITE is not allowed */
339 p = mmap(NULL, 381 p = mmap(NULL,
340 MFD_DEF_SIZE, 382 mfd_def_size,
341 PROT_READ | PROT_WRITE, 383 PROT_READ | PROT_WRITE,
342 MAP_SHARED, 384 MAP_SHARED,
343 fd, 385 fd,
@@ -349,7 +391,7 @@ static void mfd_fail_write(int fd)
349 391
350 /* verify PROT_WRITE is not allowed */ 392 /* verify PROT_WRITE is not allowed */
351 p = mmap(NULL, 393 p = mmap(NULL,
352 MFD_DEF_SIZE, 394 mfd_def_size,
353 PROT_WRITE, 395 PROT_WRITE,
354 MAP_SHARED, 396 MAP_SHARED,
355 fd, 397 fd,
@@ -362,13 +404,13 @@ static void mfd_fail_write(int fd)
362 /* Verify PROT_READ with MAP_SHARED with a following mprotect is not 404 /* Verify PROT_READ with MAP_SHARED with a following mprotect is not
363 * allowed. Note that for r/w the kernel already prevents the mmap. */ 405 * allowed. Note that for r/w the kernel already prevents the mmap. */
364 p = mmap(NULL, 406 p = mmap(NULL,
365 MFD_DEF_SIZE, 407 mfd_def_size,
366 PROT_READ, 408 PROT_READ,
367 MAP_SHARED, 409 MAP_SHARED,
368 fd, 410 fd,
369 0); 411 0);
370 if (p != MAP_FAILED) { 412 if (p != MAP_FAILED) {
371 r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); 413 r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
372 if (r >= 0) { 414 if (r >= 0) {
373 printf("mmap()+mprotect() didn't fail as expected\n"); 415 printf("mmap()+mprotect() didn't fail as expected\n");
374 abort(); 416 abort();
@@ -379,7 +421,7 @@ static void mfd_fail_write(int fd)
379 r = fallocate(fd, 421 r = fallocate(fd,
380 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 422 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
381 0, 423 0,
382 MFD_DEF_SIZE); 424 mfd_def_size);
383 if (r >= 0) { 425 if (r >= 0) {
384 printf("fallocate(PUNCH_HOLE) didn't fail as expected\n"); 426 printf("fallocate(PUNCH_HOLE) didn't fail as expected\n");
385 abort(); 427 abort();
@@ -390,13 +432,13 @@ static void mfd_assert_shrink(int fd)
390{ 432{
391 int r, fd2; 433 int r, fd2;
392 434
393 r = ftruncate(fd, MFD_DEF_SIZE / 2); 435 r = ftruncate(fd, mfd_def_size / 2);
394 if (r < 0) { 436 if (r < 0) {
395 printf("ftruncate(SHRINK) failed: %m\n"); 437 printf("ftruncate(SHRINK) failed: %m\n");
396 abort(); 438 abort();
397 } 439 }
398 440
399 mfd_assert_size(fd, MFD_DEF_SIZE / 2); 441 mfd_assert_size(fd, mfd_def_size / 2);
400 442
401 fd2 = mfd_assert_open(fd, 443 fd2 = mfd_assert_open(fd,
402 O_RDWR | O_CREAT | O_TRUNC, 444 O_RDWR | O_CREAT | O_TRUNC,
@@ -410,7 +452,7 @@ static void mfd_fail_shrink(int fd)
410{ 452{
411 int r; 453 int r;
412 454
413 r = ftruncate(fd, MFD_DEF_SIZE / 2); 455 r = ftruncate(fd, mfd_def_size / 2);
414 if (r >= 0) { 456 if (r >= 0) {
415 printf("ftruncate(SHRINK) didn't fail as expected\n"); 457 printf("ftruncate(SHRINK) didn't fail as expected\n");
416 abort(); 458 abort();
@@ -425,31 +467,31 @@ static void mfd_assert_grow(int fd)
425{ 467{
426 int r; 468 int r;
427 469
428 r = ftruncate(fd, MFD_DEF_SIZE * 2); 470 r = ftruncate(fd, mfd_def_size * 2);
429 if (r < 0) { 471 if (r < 0) {
430 printf("ftruncate(GROW) failed: %m\n"); 472 printf("ftruncate(GROW) failed: %m\n");
431 abort(); 473 abort();
432 } 474 }
433 475
434 mfd_assert_size(fd, MFD_DEF_SIZE * 2); 476 mfd_assert_size(fd, mfd_def_size * 2);
435 477
436 r = fallocate(fd, 478 r = fallocate(fd,
437 0, 479 0,
438 0, 480 0,
439 MFD_DEF_SIZE * 4); 481 mfd_def_size * 4);
440 if (r < 0) { 482 if (r < 0) {
441 printf("fallocate(ALLOC) failed: %m\n"); 483 printf("fallocate(ALLOC) failed: %m\n");
442 abort(); 484 abort();
443 } 485 }
444 486
445 mfd_assert_size(fd, MFD_DEF_SIZE * 4); 487 mfd_assert_size(fd, mfd_def_size * 4);
446} 488}
447 489
448static void mfd_fail_grow(int fd) 490static void mfd_fail_grow(int fd)
449{ 491{
450 int r; 492 int r;
451 493
452 r = ftruncate(fd, MFD_DEF_SIZE * 2); 494 r = ftruncate(fd, mfd_def_size * 2);
453 if (r >= 0) { 495 if (r >= 0) {
454 printf("ftruncate(GROW) didn't fail as expected\n"); 496 printf("ftruncate(GROW) didn't fail as expected\n");
455 abort(); 497 abort();
@@ -458,7 +500,7 @@ static void mfd_fail_grow(int fd)
458 r = fallocate(fd, 500 r = fallocate(fd,
459 0, 501 0,
460 0, 502 0,
461 MFD_DEF_SIZE * 4); 503 mfd_def_size * 4);
462 if (r >= 0) { 504 if (r >= 0) {
463 printf("fallocate(ALLOC) didn't fail as expected\n"); 505 printf("fallocate(ALLOC) didn't fail as expected\n");
464 abort(); 506 abort();
@@ -467,25 +509,37 @@ static void mfd_fail_grow(int fd)
467 509
468static void mfd_assert_grow_write(int fd) 510static void mfd_assert_grow_write(int fd)
469{ 511{
470 static char buf[MFD_DEF_SIZE * 8]; 512 static char *buf;
471 ssize_t l; 513 ssize_t l;
472 514
473 l = pwrite(fd, buf, sizeof(buf), 0); 515 buf = malloc(mfd_def_size * 8);
474 if (l != sizeof(buf)) { 516 if (!buf) {
 517                 printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
518 abort();
519 }
520
521 l = pwrite(fd, buf, mfd_def_size * 8, 0);
522 if (l != (mfd_def_size * 8)) {
475 printf("pwrite() failed: %m\n"); 523 printf("pwrite() failed: %m\n");
476 abort(); 524 abort();
477 } 525 }
478 526
479 mfd_assert_size(fd, MFD_DEF_SIZE * 8); 527 mfd_assert_size(fd, mfd_def_size * 8);
480} 528}
481 529
482static void mfd_fail_grow_write(int fd) 530static void mfd_fail_grow_write(int fd)
483{ 531{
484 static char buf[MFD_DEF_SIZE * 8]; 532 static char *buf;
485 ssize_t l; 533 ssize_t l;
486 534
487 l = pwrite(fd, buf, sizeof(buf), 0); 535 buf = malloc(mfd_def_size * 8);
488 if (l == sizeof(buf)) { 536 if (!buf) {
 537                 printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
538 abort();
539 }
540
541 l = pwrite(fd, buf, mfd_def_size * 8, 0);
542 if (l == (mfd_def_size * 8)) {
489 printf("pwrite() didn't fail as expected\n"); 543 printf("pwrite() didn't fail as expected\n");
490 abort(); 544 abort();
491 } 545 }
@@ -543,6 +597,8 @@ static void test_create(void)
543 char buf[2048]; 597 char buf[2048];
544 int fd; 598 int fd;
545 599
600 printf("%s CREATE\n", MEMFD_STR);
601
546 /* test NULL name */ 602 /* test NULL name */
547 mfd_fail_new(NULL, 0); 603 mfd_fail_new(NULL, 0);
548 604
@@ -570,13 +626,18 @@ static void test_create(void)
570 fd = mfd_assert_new("", 0, MFD_CLOEXEC); 626 fd = mfd_assert_new("", 0, MFD_CLOEXEC);
571 close(fd); 627 close(fd);
572 628
573 /* verify MFD_ALLOW_SEALING is allowed */ 629 if (!hugetlbfs_test) {
574 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); 630 /* verify MFD_ALLOW_SEALING is allowed */
575 close(fd); 631 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
576 632 close(fd);
577 /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ 633
578 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); 634 /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
579 close(fd); 635 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
636 close(fd);
637 } else {
638 /* sealing is not supported on hugetlbfs */
639 mfd_fail_new("", MFD_ALLOW_SEALING);
640 }
580} 641}
581 642
582/* 643/*
@@ -587,8 +648,14 @@ static void test_basic(void)
587{ 648{
588 int fd; 649 int fd;
589 650
651 /* hugetlbfs does not contain sealing support */
652 if (hugetlbfs_test)
653 return;
654
655 printf("%s BASIC\n", MEMFD_STR);
656
590 fd = mfd_assert_new("kern_memfd_basic", 657 fd = mfd_assert_new("kern_memfd_basic",
591 MFD_DEF_SIZE, 658 mfd_def_size,
592 MFD_CLOEXEC | MFD_ALLOW_SEALING); 659 MFD_CLOEXEC | MFD_ALLOW_SEALING);
593 660
594 /* add basic seals */ 661 /* add basic seals */
@@ -619,7 +686,7 @@ static void test_basic(void)
619 686
620 /* verify sealing does not work without MFD_ALLOW_SEALING */ 687 /* verify sealing does not work without MFD_ALLOW_SEALING */
621 fd = mfd_assert_new("kern_memfd_basic", 688 fd = mfd_assert_new("kern_memfd_basic",
622 MFD_DEF_SIZE, 689 mfd_def_size,
623 MFD_CLOEXEC); 690 MFD_CLOEXEC);
624 mfd_assert_has_seals(fd, F_SEAL_SEAL); 691 mfd_assert_has_seals(fd, F_SEAL_SEAL);
625 mfd_fail_add_seals(fd, F_SEAL_SHRINK | 692 mfd_fail_add_seals(fd, F_SEAL_SHRINK |
@@ -630,6 +697,28 @@ static void test_basic(void)
630} 697}
631 698
632/* 699/*
700 * hugetlbfs doesn't support seals or write, so just verify grow and shrink
701 * on a hugetlbfs file created via memfd_create.
702 */
703static void test_hugetlbfs_grow_shrink(void)
704{
705 int fd;
706
707 printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR);
708
709 fd = mfd_assert_new("kern_memfd_seal_write",
710 mfd_def_size,
711 MFD_CLOEXEC);
712
713 mfd_assert_read(fd);
714 mfd_assert_write(fd);
715 mfd_assert_shrink(fd);
716 mfd_assert_grow(fd);
717
718 close(fd);
719}
720
721/*
633 * Test SEAL_WRITE 722 * Test SEAL_WRITE
634 * Test whether SEAL_WRITE actually prevents modifications. 723 * Test whether SEAL_WRITE actually prevents modifications.
635 */ 724 */
@@ -637,8 +726,17 @@ static void test_seal_write(void)
637{ 726{
638 int fd; 727 int fd;
639 728
729 /*
730 * hugetlbfs does not contain sealing or write support. Just test
731 * basic grow and shrink via test_hugetlbfs_grow_shrink.
732 */
733 if (hugetlbfs_test)
734 return test_hugetlbfs_grow_shrink();
735
736 printf("%s SEAL-WRITE\n", MEMFD_STR);
737
640 fd = mfd_assert_new("kern_memfd_seal_write", 738 fd = mfd_assert_new("kern_memfd_seal_write",
641 MFD_DEF_SIZE, 739 mfd_def_size,
642 MFD_CLOEXEC | MFD_ALLOW_SEALING); 740 MFD_CLOEXEC | MFD_ALLOW_SEALING);
643 mfd_assert_has_seals(fd, 0); 741 mfd_assert_has_seals(fd, 0);
644 mfd_assert_add_seals(fd, F_SEAL_WRITE); 742 mfd_assert_add_seals(fd, F_SEAL_WRITE);
@@ -661,8 +759,14 @@ static void test_seal_shrink(void)
661{ 759{
662 int fd; 760 int fd;
663 761
762 /* hugetlbfs does not contain sealing support */
763 if (hugetlbfs_test)
764 return;
765
766 printf("%s SEAL-SHRINK\n", MEMFD_STR);
767
664 fd = mfd_assert_new("kern_memfd_seal_shrink", 768 fd = mfd_assert_new("kern_memfd_seal_shrink",
665 MFD_DEF_SIZE, 769 mfd_def_size,
666 MFD_CLOEXEC | MFD_ALLOW_SEALING); 770 MFD_CLOEXEC | MFD_ALLOW_SEALING);
667 mfd_assert_has_seals(fd, 0); 771 mfd_assert_has_seals(fd, 0);
668 mfd_assert_add_seals(fd, F_SEAL_SHRINK); 772 mfd_assert_add_seals(fd, F_SEAL_SHRINK);
@@ -685,8 +789,14 @@ static void test_seal_grow(void)
685{ 789{
686 int fd; 790 int fd;
687 791
792 /* hugetlbfs does not support sealing */
793 if (hugetlbfs_test)
794 return;
795
796 printf("%s SEAL-GROW\n", MEMFD_STR);
797
688 fd = mfd_assert_new("kern_memfd_seal_grow", 798 fd = mfd_assert_new("kern_memfd_seal_grow",
689 MFD_DEF_SIZE, 799 mfd_def_size,
690 MFD_CLOEXEC | MFD_ALLOW_SEALING); 800 MFD_CLOEXEC | MFD_ALLOW_SEALING);
691 mfd_assert_has_seals(fd, 0); 801 mfd_assert_has_seals(fd, 0);
692 mfd_assert_add_seals(fd, F_SEAL_GROW); 802 mfd_assert_add_seals(fd, F_SEAL_GROW);
@@ -709,8 +819,14 @@ static void test_seal_resize(void)
709{ 819{
710 int fd; 820 int fd;
711 821
822 /* hugetlbfs does not support sealing */
823 if (hugetlbfs_test)
824 return;
825
826 printf("%s SEAL-RESIZE\n", MEMFD_STR);
827
712 fd = mfd_assert_new("kern_memfd_seal_resize", 828 fd = mfd_assert_new("kern_memfd_seal_resize",
713 MFD_DEF_SIZE, 829 mfd_def_size,
714 MFD_CLOEXEC | MFD_ALLOW_SEALING); 830 MFD_CLOEXEC | MFD_ALLOW_SEALING);
715 mfd_assert_has_seals(fd, 0); 831 mfd_assert_has_seals(fd, 0);
716 mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); 832 mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
@@ -726,15 +842,52 @@ static void test_seal_resize(void)
726} 842}
727 843
728/* 844/*
845 * hugetlbfs does not support sealing. Basic test: dup() the memfd-created
846 * fd and perform some basic operations on it.
847 */
848static void hugetlbfs_dup(char *b_suffix)
849{
850 int fd, fd2;
851
852 printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix);
853
854 fd = mfd_assert_new("kern_memfd_share_dup",
855 mfd_def_size,
856 MFD_CLOEXEC);
857
858 fd2 = mfd_assert_dup(fd);
859
860 mfd_assert_read(fd);
861 mfd_assert_write(fd);
862
863 mfd_assert_shrink(fd2);
864 mfd_assert_grow(fd2);
865
866 close(fd2);
867 close(fd);
868}
869
870/*
729 * Test sharing via dup() 871 * Test sharing via dup()
730 * Test that seals are shared between dupped FDs and they're all equal. 872 * Test that seals are shared between dupped FDs and they're all equal.
731 */ 873 */
732static void test_share_dup(void) 874static void test_share_dup(char *banner, char *b_suffix)
733{ 875{
734 int fd, fd2; 876 int fd, fd2;
735 877
878 /*
879 * hugetlbfs does not support sealing. Perform some
880 * basic testing on the dup'ed fd instead via hugetlbfs_dup.
881 */
882 if (hugetlbfs_test) {
883 hugetlbfs_dup(b_suffix);
884 return;
885 }
886
887 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
888
736 fd = mfd_assert_new("kern_memfd_share_dup", 889 fd = mfd_assert_new("kern_memfd_share_dup",
737 MFD_DEF_SIZE, 890 mfd_def_size,
738 MFD_CLOEXEC | MFD_ALLOW_SEALING); 891 MFD_CLOEXEC | MFD_ALLOW_SEALING);
739 mfd_assert_has_seals(fd, 0); 892 mfd_assert_has_seals(fd, 0);
740 893
@@ -768,13 +921,19 @@ static void test_share_dup(void)
768 * Test sealing with active mmap()s 921 * Test sealing with active mmap()s
769 * Modifying seals is only allowed if no other mmap() refs exist. 922 * Modifying seals is only allowed if no other mmap() refs exist.
770 */ 923 */
771static void test_share_mmap(void) 924static void test_share_mmap(char *banner, char *b_suffix)
772{ 925{
773 int fd; 926 int fd;
774 void *p; 927 void *p;
775 928
929 /* hugetlbfs does not support sealing */
930 if (hugetlbfs_test)
931 return;
932
933 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
934
776 fd = mfd_assert_new("kern_memfd_share_mmap", 935 fd = mfd_assert_new("kern_memfd_share_mmap",
777 MFD_DEF_SIZE, 936 mfd_def_size,
778 MFD_CLOEXEC | MFD_ALLOW_SEALING); 937 MFD_CLOEXEC | MFD_ALLOW_SEALING);
779 mfd_assert_has_seals(fd, 0); 938 mfd_assert_has_seals(fd, 0);
780 939
@@ -784,14 +943,40 @@ static void test_share_mmap(void)
784 mfd_assert_has_seals(fd, 0); 943 mfd_assert_has_seals(fd, 0);
785 mfd_assert_add_seals(fd, F_SEAL_SHRINK); 944 mfd_assert_add_seals(fd, F_SEAL_SHRINK);
786 mfd_assert_has_seals(fd, F_SEAL_SHRINK); 945 mfd_assert_has_seals(fd, F_SEAL_SHRINK);
787 munmap(p, MFD_DEF_SIZE); 946 munmap(p, mfd_def_size);
788 947
789 /* readable ref allows sealing */ 948 /* readable ref allows sealing */
790 p = mfd_assert_mmap_private(fd); 949 p = mfd_assert_mmap_private(fd);
791 mfd_assert_add_seals(fd, F_SEAL_WRITE); 950 mfd_assert_add_seals(fd, F_SEAL_WRITE);
792 mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); 951 mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
793 munmap(p, MFD_DEF_SIZE); 952 munmap(p, mfd_def_size);
953
954 close(fd);
955}
956
957/*
958 * Basic test to make sure we can open the hugetlbfs fd via /proc and
959 * perform some simple operations on it.
960 */
961static void hugetlbfs_proc_open(char *b_suffix)
962{
963 int fd, fd2;
964
965 printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix);
794 966
967 fd = mfd_assert_new("kern_memfd_share_open",
968 mfd_def_size,
969 MFD_CLOEXEC);
970
971 fd2 = mfd_assert_open(fd, O_RDWR, 0);
972
973 mfd_assert_read(fd);
974 mfd_assert_write(fd);
975
976 mfd_assert_shrink(fd2);
977 mfd_assert_grow(fd2);
978
979 close(fd2);
795 close(fd); 980 close(fd);
796} 981}
797 982
@@ -801,12 +986,23 @@ static void test_share_mmap(void)
801 * This is *not* like dup(), but like a real separate open(). Make sure the 986 * This is *not* like dup(), but like a real separate open(). Make sure the
802 * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR. 987 * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR.
803 */ 988 */
804static void test_share_open(void) 989static void test_share_open(char *banner, char *b_suffix)
805{ 990{
806 int fd, fd2; 991 int fd, fd2;
807 992
993 /*
994 * hugetlbfs does not support sealing, so test basic
995 * functionality of the fd reopened via /proc in hugetlbfs_proc_open.
996 */
997 if (hugetlbfs_test) {
998 hugetlbfs_proc_open(b_suffix);
999 return;
1000 }
1001
1002 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
1003
808 fd = mfd_assert_new("kern_memfd_share_open", 1004 fd = mfd_assert_new("kern_memfd_share_open",
809 MFD_DEF_SIZE, 1005 mfd_def_size,
810 MFD_CLOEXEC | MFD_ALLOW_SEALING); 1006 MFD_CLOEXEC | MFD_ALLOW_SEALING);
811 mfd_assert_has_seals(fd, 0); 1007 mfd_assert_has_seals(fd, 0);
812 1008
@@ -841,13 +1037,19 @@ static void test_share_open(void)
841 * Test sharing via fork() 1037 * Test sharing via fork()
842 * Test whether seal modifications work as expected with forked children. 1038
843 */ 1039 */
844static void test_share_fork(void) 1040static void test_share_fork(char *banner, char *b_suffix)
845{ 1041{
846 int fd; 1042 int fd;
847 pid_t pid; 1043 pid_t pid;
848 1044
1045 /* hugetlbfs does not support sealing */
1046 if (hugetlbfs_test)
1047 return;
1048
1049 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
1050
849 fd = mfd_assert_new("kern_memfd_share_fork", 1051 fd = mfd_assert_new("kern_memfd_share_fork",
850 MFD_DEF_SIZE, 1052 mfd_def_size,
851 MFD_CLOEXEC | MFD_ALLOW_SEALING); 1053 MFD_CLOEXEC | MFD_ALLOW_SEALING);
852 mfd_assert_has_seals(fd, 0); 1054 mfd_assert_has_seals(fd, 0);
853 1055
@@ -870,40 +1072,40 @@ int main(int argc, char **argv)
870{ 1072{
871 pid_t pid; 1073 pid_t pid;
872 1074
873 printf("memfd: CREATE\n"); 1075 if (argc == 2) {
1076 if (!strcmp(argv[1], "hugetlbfs")) {
1077 unsigned long hpage_size = default_huge_page_size();
1078
1079 if (!hpage_size) {
1080 printf("Unable to determine huge page size\n");
1081 abort();
1082 }
1083
1084 hugetlbfs_test = 1;
1085 mfd_def_size = hpage_size * 2;
1086 }
1087 }
1088
874 test_create(); 1089 test_create();
875 printf("memfd: BASIC\n");
876 test_basic(); 1090 test_basic();
877 1091
878 printf("memfd: SEAL-WRITE\n");
879 test_seal_write(); 1092 test_seal_write();
880 printf("memfd: SEAL-SHRINK\n");
881 test_seal_shrink(); 1093 test_seal_shrink();
882 printf("memfd: SEAL-GROW\n");
883 test_seal_grow(); 1094 test_seal_grow();
884 printf("memfd: SEAL-RESIZE\n");
885 test_seal_resize(); 1095 test_seal_resize();
886 1096
887 printf("memfd: SHARE-DUP\n"); 1097 test_share_dup("SHARE-DUP", "");
888 test_share_dup(); 1098 test_share_mmap("SHARE-MMAP", "");
889 printf("memfd: SHARE-MMAP\n"); 1099 test_share_open("SHARE-OPEN", "");
890 test_share_mmap(); 1100 test_share_fork("SHARE-FORK", "");
891 printf("memfd: SHARE-OPEN\n");
892 test_share_open();
893 printf("memfd: SHARE-FORK\n");
894 test_share_fork();
895 1101
896 /* Run test-suite in a multi-threaded environment with a shared 1102 /* Run test-suite in a multi-threaded environment with a shared
897 * file-table. */ 1103 * file-table. */
898 pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM); 1104 pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM);
899 printf("memfd: SHARE-DUP (shared file-table)\n"); 1105 test_share_dup("SHARE-DUP", SHARED_FT_STR);
900 test_share_dup(); 1106 test_share_mmap("SHARE-MMAP", SHARED_FT_STR);
901 printf("memfd: SHARE-MMAP (shared file-table)\n"); 1107 test_share_open("SHARE-OPEN", SHARED_FT_STR);
902 test_share_mmap(); 1108 test_share_fork("SHARE-FORK", SHARED_FT_STR);
903 printf("memfd: SHARE-OPEN (shared file-table)\n");
904 test_share_open();
905 printf("memfd: SHARE-FORK (shared file-table)\n");
906 test_share_fork();
907 join_idle_thread(pid); 1109 join_idle_thread(pid);
908 1110
909 printf("memfd: DONE\n"); 1111 printf("memfd: DONE\n");
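main() now sizes the hugetlbfs test files as twice the default huge page size, obtained from default_huge_page_size(). That helper is not shown in this hunk; a plausible implementation, sketched here as an assumption rather than the actual selftest code, parses the Hugepagesize: line of /proc/meminfo:

/* Sketch of an assumed helper; the selftest's real implementation may differ. */
#include <stdio.h>

static unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f)) {
		/* e.g. "Hugepagesize:       2048 kB" */
		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
			hps <<= 10;	/* kB -> bytes */
			break;
		}
	}
	fclose(f);
	return hps;
}

With the common 2 MB default huge page size this makes mfd_def_size 4 MB, i.e. the hugetlbfs tests operate on two-huge-page files.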
diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh
new file mode 100644
index 000000000000..daabb350697c
--- /dev/null
+++ b/tools/testing/selftests/memfd/run_tests.sh
@@ -0,0 +1,69 @@
1#!/bin/bash
2# please run as root
3
4#
5# Normal tests requiring no special resources
6#
7./run_fuse_test.sh
8./memfd_test
9
10#
11# To test memfd_create with hugetlbfs, there need to be at least hpages_test
12# free huge pages. Attempt to allocate enough pages to run the test.
13#
14hpages_test=8
15
16#
17# Get count of free huge pages from /proc/meminfo
18#
19while read name size unit; do
20 if [ "$name" = "HugePages_Free:" ]; then
21 freepgs=$size
22 fi
23done < /proc/meminfo
24
25#
26# If not enough free huge pages for test, attempt to increase
27#
28if [ -n "$freepgs" ] && [ $freepgs -lt $hpages_test ]; then
29 nr_hugepgs=`cat /proc/sys/vm/nr_hugepages`
30 hpages_needed=`expr $hpages_test - $freepgs`
31
32 echo 3 > /proc/sys/vm/drop_caches
33 echo $(( $hpages_needed + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages
34 if [ $? -ne 0 ]; then
35 echo "Please run this test as root"
36 exit 1
37 fi
38 while read name size unit; do
39 if [ "$name" = "HugePages_Free:" ]; then
40 freepgs=$size
41 fi
42 done < /proc/meminfo
43fi
44
45#
46# If still not enough huge pages available, exit. But, give back any huge
47# pages potentially allocated above.
48#
49if [ $freepgs -lt $hpages_test ]; then
50	# nr_hugepgs is set only if we attempted to increase
51 if [ -n "$nr_hugepgs" ]; then
52 echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
53 fi
54 printf "Not enough huge pages available (%d < %d)\n" \
55		$freepgs $hpages_test
56 exit 1
57fi
58
59#
60# Run the hugetlbfs test
61#
62./memfd_test hugetlbfs
63
64#
65# Give back any huge pages allocated for the test
66#
67if [ -n "$nr_hugepgs" ]; then
68 echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
69fi
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 1eae79ae5b4e..a2c53a3d223d 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -66,6 +66,8 @@
66#include <sys/wait.h> 66#include <sys/wait.h>
67#include <pthread.h> 67#include <pthread.h>
68#include <linux/userfaultfd.h> 68#include <linux/userfaultfd.h>
69#include <setjmp.h>
70#include <stdbool.h>
69 71
70#ifdef __NR_userfaultfd 72#ifdef __NR_userfaultfd
71 73
@@ -82,11 +84,17 @@ static int bounces;
82#define TEST_SHMEM 3 84#define TEST_SHMEM 3
83static int test_type; 85static int test_type;
84 86
87/* re-enable the test_uffdio_*_eexist retry paths every ALARM_INTERVAL_SECS */
88#define ALARM_INTERVAL_SECS 10
89static volatile bool test_uffdio_copy_eexist = true;
90static volatile bool test_uffdio_zeropage_eexist = true;
91
92static bool map_shared;
85static int huge_fd; 93static int huge_fd;
86static char *huge_fd_off0; 94static char *huge_fd_off0;
87static unsigned long long *count_verify; 95static unsigned long long *count_verify;
88static int uffd, uffd_flags, finished, *pipefd; 96static int uffd, uffd_flags, finished, *pipefd;
89static char *area_src, *area_dst; 97static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
90static char *zeropage; 98static char *zeropage;
91pthread_attr_t attr; 99pthread_attr_t attr;
92 100
@@ -125,6 +133,9 @@ static void anon_allocate_area(void **alloc_area)
125 } 133 }
126} 134}
127 135
136static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
137{
138}
128 139
129/* HugeTLB memory */ 140/* HugeTLB memory */
130static int hugetlb_release_pages(char *rel_area) 141static int hugetlb_release_pages(char *rel_area)
@@ -145,17 +156,51 @@ static int hugetlb_release_pages(char *rel_area)
145 156
146static void hugetlb_allocate_area(void **alloc_area) 157static void hugetlb_allocate_area(void **alloc_area)
147{ 158{
159 void *area_alias = NULL;
160 char **alloc_area_alias;
148 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 161 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
149 MAP_PRIVATE | MAP_HUGETLB, huge_fd, 162 (map_shared ? MAP_SHARED : MAP_PRIVATE) |
150 *alloc_area == area_src ? 0 : 163 MAP_HUGETLB,
151 nr_pages * page_size); 164 huge_fd, *alloc_area == area_src ? 0 :
165 nr_pages * page_size);
152 if (*alloc_area == MAP_FAILED) { 166 if (*alloc_area == MAP_FAILED) {
153 fprintf(stderr, "mmap of hugetlbfs file failed\n"); 167 fprintf(stderr, "mmap of hugetlbfs file failed\n");
154 *alloc_area = NULL; 168 *alloc_area = NULL;
155 } 169 }
156 170
157 if (*alloc_area == area_src) 171 if (map_shared) {
172 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
173 MAP_SHARED | MAP_HUGETLB,
174 huge_fd, *alloc_area == area_src ? 0 :
175 nr_pages * page_size);
176 if (area_alias == MAP_FAILED) {
177 if (munmap(*alloc_area, nr_pages * page_size) < 0)
178 perror("hugetlb munmap"), exit(1);
179 *alloc_area = NULL;
180 return;
181 }
182 }
183 if (*alloc_area == area_src) {
158 huge_fd_off0 = *alloc_area; 184 huge_fd_off0 = *alloc_area;
185 alloc_area_alias = &area_src_alias;
186 } else {
187 alloc_area_alias = &area_dst_alias;
188 }
189 if (area_alias)
190 *alloc_area_alias = area_alias;
191}
192
193static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
194{
195 if (!map_shared)
196 return;
197 /*
198 * We can't zap just the pagetable with hugetlbfs because
199 * MADV_DONTNEED won't work. So exercise -EEXIST on an alias
200 * mapping where the pagetables are not established initially,
201 * this way we'll exercise the -EEXIST check at the fs level.
202 */
203 *start = (unsigned long) area_dst_alias + offset;
159} 204}
160 205
161/* Shared memory */ 206/* Shared memory */
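The point of the alias mapping added above is that two MAP_SHARED mappings of the same file reference the same pages while keeping separate page tables: populating one mapping leaves the alias unpopulated, so a later UFFDIO_COPY aimed at the alias finds the page already present in the file and fails with -EEXIST. A minimal sketch of such an alias pair (illustrative helper, shown with an ordinary shared fd; hugetlbfs additionally requires huge-page-aligned lengths):

/* Illustrative helper, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static int alias_demo(int fd, size_t len)
{
	char *a = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	char *b = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (a == MAP_FAILED || b == MAP_FAILED)
		return -1;
	memset(a, 0x5a, len);			/* populate via the first mapping */
	printf("alias sees 0x%x\n", b[0]);	/* same pages, separate page tables */
	munmap(a, len);
	munmap(b, len);
	return 0;
}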
@@ -185,6 +230,7 @@ struct uffd_test_ops {
185 unsigned long expected_ioctls; 230 unsigned long expected_ioctls;
186 void (*allocate_area)(void **alloc_area); 231 void (*allocate_area)(void **alloc_area);
187 int (*release_pages)(char *rel_area); 232 int (*release_pages)(char *rel_area);
233 void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
188}; 234};
189 235
190#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ 236#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
@@ -195,18 +241,21 @@ static struct uffd_test_ops anon_uffd_test_ops = {
195 .expected_ioctls = ANON_EXPECTED_IOCTLS, 241 .expected_ioctls = ANON_EXPECTED_IOCTLS,
196 .allocate_area = anon_allocate_area, 242 .allocate_area = anon_allocate_area,
197 .release_pages = anon_release_pages, 243 .release_pages = anon_release_pages,
244 .alias_mapping = noop_alias_mapping,
198}; 245};
199 246
200static struct uffd_test_ops shmem_uffd_test_ops = { 247static struct uffd_test_ops shmem_uffd_test_ops = {
201 .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, 248 .expected_ioctls = ANON_EXPECTED_IOCTLS,
202 .allocate_area = shmem_allocate_area, 249 .allocate_area = shmem_allocate_area,
203 .release_pages = shmem_release_pages, 250 .release_pages = shmem_release_pages,
251 .alias_mapping = noop_alias_mapping,
204}; 252};
205 253
206static struct uffd_test_ops hugetlb_uffd_test_ops = { 254static struct uffd_test_ops hugetlb_uffd_test_ops = {
207 .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, 255 .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
208 .allocate_area = hugetlb_allocate_area, 256 .allocate_area = hugetlb_allocate_area,
209 .release_pages = hugetlb_release_pages, 257 .release_pages = hugetlb_release_pages,
258 .alias_mapping = hugetlb_alias_mapping,
210}; 259};
211 260
212static struct uffd_test_ops *uffd_test_ops; 261static struct uffd_test_ops *uffd_test_ops;
@@ -331,6 +380,23 @@ static void *locking_thread(void *arg)
331 return NULL; 380 return NULL;
332} 381}
333 382
383static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
384 unsigned long offset)
385{
386 uffd_test_ops->alias_mapping(&uffdio_copy->dst,
387 uffdio_copy->len,
388 offset);
389 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
390		/* real retval in uffdio_copy.copy */
391 if (uffdio_copy->copy != -EEXIST)
392 fprintf(stderr, "UFFDIO_COPY retry error %Ld\n",
393 uffdio_copy->copy), exit(1);
394 } else {
395 fprintf(stderr, "UFFDIO_COPY retry unexpected %Ld\n",
396 uffdio_copy->copy), exit(1);
397 }
398}
399
334static int copy_page(int ufd, unsigned long offset) 400static int copy_page(int ufd, unsigned long offset)
335{ 401{
336 struct uffdio_copy uffdio_copy; 402 struct uffdio_copy uffdio_copy;
@@ -351,8 +417,13 @@ static int copy_page(int ufd, unsigned long offset)
351 } else if (uffdio_copy.copy != page_size) { 417 } else if (uffdio_copy.copy != page_size) {
352 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", 418 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
353 uffdio_copy.copy), exit(1); 419 uffdio_copy.copy), exit(1);
354 } else 420 } else {
421 if (test_uffdio_copy_eexist) {
422 test_uffdio_copy_eexist = false;
423 retry_copy_page(ufd, &uffdio_copy, offset);
424 }
355 return 1; 425 return 1;
426 }
356 return 0; 427 return 0;
357} 428}
358 429
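As the retry comment notes, UFFDIO_COPY reports its outcome in the copy field: the number of bytes copied on success, or a negative errno such as -EEXIST when the destination page is already mapped, even though the ioctl() itself may return -1. A hedged sketch of that calling convention (helper name is illustrative):

/* Illustrative helper, not part of the patch. */
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>

static long long demo_uffdio_copy(int uffd, unsigned long dst,
				  unsigned long src, unsigned long len)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	copy.dst = dst;
	copy.src = src;
	copy.len = len;

	ioctl(uffd, UFFDIO_COPY, &copy);
	/* bytes copied on success, negative errno (e.g. -EEXIST) on failure */
	return copy.copy;
}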
@@ -408,6 +479,7 @@ static void *uffd_poll_thread(void *arg)
408 userfaults++; 479 userfaults++;
409 break; 480 break;
410 case UFFD_EVENT_FORK: 481 case UFFD_EVENT_FORK:
482 close(uffd);
411 uffd = msg.arg.fork.ufd; 483 uffd = msg.arg.fork.ufd;
412 pollfd[0].fd = uffd; 484 pollfd[0].fd = uffd;
413 break; 485 break;
@@ -572,6 +644,17 @@ static int userfaultfd_open(int features)
572 return 0; 644 return 0;
573} 645}
574 646
647sigjmp_buf jbuf, *sigbuf;
648
649static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
650{
651 if (sig == SIGBUS) {
652 if (sigbuf)
653 siglongjmp(*sigbuf, 1);
654 abort();
655 }
656}
657
575/* 658/*
576 * For non-cooperative userfaultfd test we fork() a process that will 659 * For non-cooperative userfaultfd test we fork() a process that will
577 * generate pagefaults, will mremap the area monitored by the 660 * generate pagefaults, will mremap the area monitored by the
@@ -585,19 +668,59 @@ static int userfaultfd_open(int features)
585 * The release of the pages currently generates event for shmem and 668 * The release of the pages currently generates event for shmem and
586 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked 669 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
587 * for hugetlb. 670 * for hugetlb.
671 * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: register the
672 * monitored area, generate page faults and test that the signal is delivered.
673 * Use UFFDIO_COPY to allocate the missing page and retry. For signal_test = 2,
674 * test the robustness use case: release the monitored area, fork a process
675 * that will generate page faults, and verify that the signal is delivered.
676 * This also exercises UFFD_FEATURE_EVENT_FORK together with the signal
677 * feature; using the monitor thread, verify that no userfault events are generated.
588 */ 678 */
589static int faulting_process(void) 679static int faulting_process(int signal_test)
590{ 680{
591 unsigned long nr; 681 unsigned long nr;
592 unsigned long long count; 682 unsigned long long count;
593 unsigned long split_nr_pages; 683 unsigned long split_nr_pages;
684 unsigned long lastnr;
685 struct sigaction act;
686 unsigned long signalled = 0;
594 687
595 if (test_type != TEST_HUGETLB) 688 if (test_type != TEST_HUGETLB)
596 split_nr_pages = (nr_pages + 1) / 2; 689 split_nr_pages = (nr_pages + 1) / 2;
597 else 690 else
598 split_nr_pages = nr_pages; 691 split_nr_pages = nr_pages;
599 692
693 if (signal_test) {
694 sigbuf = &jbuf;
695 memset(&act, 0, sizeof(act));
696 act.sa_sigaction = sighndl;
697 act.sa_flags = SA_SIGINFO;
698 if (sigaction(SIGBUS, &act, 0)) {
699 perror("sigaction");
700 return 1;
701 }
702 lastnr = (unsigned long)-1;
703 }
704
600 for (nr = 0; nr < split_nr_pages; nr++) { 705 for (nr = 0; nr < split_nr_pages; nr++) {
706 if (signal_test) {
707 if (sigsetjmp(*sigbuf, 1) != 0) {
708 if (nr == lastnr) {
709 fprintf(stderr, "Signal repeated\n");
710 return 1;
711 }
712
713 lastnr = nr;
714 if (signal_test == 1) {
715 if (copy_page(uffd, nr * page_size))
716 signalled++;
717 } else {
718 signalled++;
719 continue;
720 }
721 }
722 }
723
601 count = *area_count(area_dst, nr); 724 count = *area_count(area_dst, nr);
602 if (count != count_verify[nr]) { 725 if (count != count_verify[nr]) {
603 fprintf(stderr, 726 fprintf(stderr,
@@ -607,6 +730,9 @@ static int faulting_process(void)
607 } 730 }
608 } 731 }
609 732
733 if (signal_test)
734 return signalled != split_nr_pages;
735
610 if (test_type == TEST_HUGETLB) 736 if (test_type == TEST_HUGETLB)
611 return 0; 737 return 0;
612 738
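userfaultfd_open() is not shown in this hunk; for the signal test it is assumed to negotiate UFFD_FEATURE_SIGBUS (and UFFD_FEATURE_EVENT_FORK) through UFFDIO_API, after which a fault on a missing page delivers SIGBUS to the faulting task instead of queueing a userfault message. A hedged sketch of that negotiation (helper name is illustrative):

/* Illustrative helper, not part of the patch. */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_uffd_sigbus(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_SIGBUS | UFFD_FEATURE_EVENT_FORK,
	};
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0)
		return -1;
	if (ioctl(fd, UFFDIO_API, &api) < 0) {	/* negotiate the features */
		close(fd);
		return -1;
	}
	return fd;
}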
@@ -636,6 +762,23 @@ static int faulting_process(void)
636 return 0; 762 return 0;
637} 763}
638 764
765static void retry_uffdio_zeropage(int ufd,
766 struct uffdio_zeropage *uffdio_zeropage,
767 unsigned long offset)
768{
769 uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
770 uffdio_zeropage->range.len,
771 offset);
772 if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
773 if (uffdio_zeropage->zeropage != -EEXIST)
774 fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n",
775 uffdio_zeropage->zeropage), exit(1);
776 } else {
777 fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n",
778 uffdio_zeropage->zeropage), exit(1);
779 }
780}
781
639static int uffdio_zeropage(int ufd, unsigned long offset) 782static int uffdio_zeropage(int ufd, unsigned long offset)
640{ 783{
641 struct uffdio_zeropage uffdio_zeropage; 784 struct uffdio_zeropage uffdio_zeropage;
@@ -670,8 +813,14 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
670 if (uffdio_zeropage.zeropage != page_size) { 813 if (uffdio_zeropage.zeropage != page_size) {
671 fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", 814 fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
672 uffdio_zeropage.zeropage), exit(1); 815 uffdio_zeropage.zeropage), exit(1);
673 } else 816 } else {
817 if (test_uffdio_zeropage_eexist) {
818 test_uffdio_zeropage_eexist = false;
819 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
820 offset);
821 }
674 return 1; 822 return 1;
823 }
675 } else { 824 } else {
676 fprintf(stderr, 825 fprintf(stderr,
677 "UFFDIO_ZEROPAGE succeeded %Ld\n", 826 "UFFDIO_ZEROPAGE succeeded %Ld\n",
@@ -761,7 +910,7 @@ static int userfaultfd_events_test(void)
761 perror("fork"), exit(1); 910 perror("fork"), exit(1);
762 911
763 if (!pid) 912 if (!pid)
764 return faulting_process(); 913 return faulting_process(0);
765 914
766 waitpid(pid, &err, 0); 915 waitpid(pid, &err, 0);
767 if (err) 916 if (err)
@@ -778,6 +927,72 @@ static int userfaultfd_events_test(void)
778 return userfaults != nr_pages; 927 return userfaults != nr_pages;
779} 928}
780 929
930static int userfaultfd_sig_test(void)
931{
932 struct uffdio_register uffdio_register;
933 unsigned long expected_ioctls;
934 unsigned long userfaults;
935 pthread_t uffd_mon;
936 int err, features;
937 pid_t pid;
938 char c;
939
940 printf("testing signal delivery: ");
941 fflush(stdout);
942
943 if (uffd_test_ops->release_pages(area_dst))
944 return 1;
945
946 features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
947 if (userfaultfd_open(features) < 0)
948 return 1;
949 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
950
951 uffdio_register.range.start = (unsigned long) area_dst;
952 uffdio_register.range.len = nr_pages * page_size;
953 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
954 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
955 fprintf(stderr, "register failure\n"), exit(1);
956
957 expected_ioctls = uffd_test_ops->expected_ioctls;
958 if ((uffdio_register.ioctls & expected_ioctls) !=
959 expected_ioctls)
960 fprintf(stderr,
961 "unexpected missing ioctl for anon memory\n"),
962 exit(1);
963
964 if (faulting_process(1))
965 fprintf(stderr, "faulting process failed\n"), exit(1);
966
967 if (uffd_test_ops->release_pages(area_dst))
968 return 1;
969
970 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
971 perror("uffd_poll_thread create"), exit(1);
972
973 pid = fork();
974 if (pid < 0)
975 perror("fork"), exit(1);
976
977 if (!pid)
978 exit(faulting_process(2));
979
980 waitpid(pid, &err, 0);
981 if (err)
982 fprintf(stderr, "faulting process failed\n"), exit(1);
983
984 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
985 perror("pipe write"), exit(1);
986 if (pthread_join(uffd_mon, (void **)&userfaults))
987 return 1;
988
989 printf("done.\n");
990 if (userfaults)
991 fprintf(stderr, "Signal test failed, userfaults: %ld\n",
992 userfaults);
993 close(uffd);
994 return userfaults != 0;
995}
781static int userfaultfd_stress(void) 996static int userfaultfd_stress(void)
782{ 997{
783 void *area; 998 void *area;
@@ -879,6 +1094,15 @@ static int userfaultfd_stress(void)
879 return 1; 1094 return 1;
880 } 1095 }
881 1096
1097 if (area_dst_alias) {
1098 uffdio_register.range.start = (unsigned long)
1099 area_dst_alias;
1100 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1101 fprintf(stderr, "register failure alias\n");
1102 return 1;
1103 }
1104 }
1105
882 /* 1106 /*
883 * The madvise done previously isn't enough: some 1107 * The madvise done previously isn't enough: some
884 * uffd_thread could have read userfaults (one of 1108 * uffd_thread could have read userfaults (one of
@@ -912,9 +1136,17 @@ static int userfaultfd_stress(void)
912 1136
913 /* unregister */ 1137 /* unregister */
914 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { 1138 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
915 fprintf(stderr, "register failure\n"); 1139 fprintf(stderr, "unregister failure\n");
916 return 1; 1140 return 1;
917 } 1141 }
1142 if (area_dst_alias) {
1143 uffdio_register.range.start = (unsigned long) area_dst;
1144 if (ioctl(uffd, UFFDIO_UNREGISTER,
1145 &uffdio_register.range)) {
1146 fprintf(stderr, "unregister failure alias\n");
1147 return 1;
1148 }
1149 }
918 1150
919 /* verification */ 1151 /* verification */
920 if (bounces & BOUNCE_VERIFY) { 1152 if (bounces & BOUNCE_VERIFY) {
@@ -936,6 +1168,10 @@ static int userfaultfd_stress(void)
936 area_src = area_dst; 1168 area_src = area_dst;
937 area_dst = tmp_area; 1169 area_dst = tmp_area;
938 1170
1171 tmp_area = area_src_alias;
1172 area_src_alias = area_dst_alias;
1173 area_dst_alias = tmp_area;
1174
939 printf("userfaults:"); 1175 printf("userfaults:");
940 for (cpu = 0; cpu < nr_cpus; cpu++) 1176 for (cpu = 0; cpu < nr_cpus; cpu++)
941 printf(" %lu", userfaults[cpu]); 1177 printf(" %lu", userfaults[cpu]);
@@ -946,7 +1182,8 @@ static int userfaultfd_stress(void)
946 return err; 1182 return err;
947 1183
948 close(uffd); 1184 close(uffd);
949 return userfaultfd_zeropage_test() || userfaultfd_events_test(); 1185 return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1186 || userfaultfd_events_test();
950} 1187}
951 1188
952/* 1189/*
@@ -981,7 +1218,12 @@ static void set_test_type(const char *type)
981 } else if (!strcmp(type, "hugetlb")) { 1218 } else if (!strcmp(type, "hugetlb")) {
982 test_type = TEST_HUGETLB; 1219 test_type = TEST_HUGETLB;
983 uffd_test_ops = &hugetlb_uffd_test_ops; 1220 uffd_test_ops = &hugetlb_uffd_test_ops;
1221 } else if (!strcmp(type, "hugetlb_shared")) {
1222 map_shared = true;
1223 test_type = TEST_HUGETLB;
1224 uffd_test_ops = &hugetlb_uffd_test_ops;
984 } else if (!strcmp(type, "shmem")) { 1225 } else if (!strcmp(type, "shmem")) {
1226 map_shared = true;
985 test_type = TEST_SHMEM; 1227 test_type = TEST_SHMEM;
986 uffd_test_ops = &shmem_uffd_test_ops; 1228 uffd_test_ops = &shmem_uffd_test_ops;
987 } else { 1229 } else {
@@ -1001,12 +1243,25 @@ static void set_test_type(const char *type)
1001 fprintf(stderr, "Impossible to run this test\n"), exit(2); 1243 fprintf(stderr, "Impossible to run this test\n"), exit(2);
1002} 1244}
1003 1245
1246static void sigalrm(int sig)
1247{
1248 if (sig != SIGALRM)
1249 abort();
1250 test_uffdio_copy_eexist = true;
1251 test_uffdio_zeropage_eexist = true;
1252 alarm(ALARM_INTERVAL_SECS);
1253}
1254
1004int main(int argc, char **argv) 1255int main(int argc, char **argv)
1005{ 1256{
1006 if (argc < 4) 1257 if (argc < 4)
1007 fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), 1258 fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
1008 exit(1); 1259 exit(1);
1009 1260
1261 if (signal(SIGALRM, sigalrm) == SIG_ERR)
1262 fprintf(stderr, "failed to arm SIGALRM"), exit(1);
1263 alarm(ALARM_INTERVAL_SECS);
1264
1010 set_test_type(argv[1]); 1265 set_test_type(argv[1]);
1011 1266
1012 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 1267 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);