author     Linus Torvalds <torvalds@linux-foundation.org>   2017-09-06 23:49:49 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-09-06 23:49:49 -0400
commit     d34fc1adf01ff87026da85fb972dc259dc347540 (patch)
tree       27356073d423187157b7cdb69da32b53102fb9e7
parent     1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33 (diff)
parent     d2cd9ede6e193dd7d88b6d27399e96229a551b19 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- various misc bits
- DAX updates
- OCFS2
- most of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits)
mm,fork: introduce MADV_WIPEONFORK
x86,mpx: make mpx depend on x86-64 to free up VMA flag
mm: add /proc/pid/smaps_rollup
mm: hugetlb: clear target sub-page last when clearing huge page
mm: oom: let oom_reap_task and exit_mmap run concurrently
swap: choose swap device according to numa node
mm: replace TIF_MEMDIE checks by tsk_is_oom_victim
mm, oom: do not rely on TIF_MEMDIE for memory reserves access
z3fold: use per-cpu unbuddied lists
mm, swap: don't use VMA based swap readahead if HDD is used as swap
mm, swap: add sysfs interface for VMA based swap readahead
mm, swap: VMA based swap readahead
mm, swap: fix swap readahead marking
mm, swap: add swap readahead hit statistics
mm/vmalloc.c: don't reinvent the wheel but use existing llist API
mm/vmstat.c: fix wrong comment
selftests/memfd: add memfd_create hugetlbfs selftest
mm/shmem: add hugetlbfs support to memfd_create()
mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups
mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas()
...
139 files changed, 3960 insertions, 2068 deletions
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup new file mode 100644 index 000000000000..0a54ed0d63c9 --- /dev/null +++ b/Documentation/ABI/testing/procfs-smaps_rollup | |||
@@ -0,0 +1,31 @@ | |||
1 | What: /proc/pid/smaps_rollup | ||
2 | Date: August 2017 | ||
3 | Contact: Daniel Colascione <dancol@google.com> | ||
4 | Description: | ||
5 | This file provides pre-summed memory information for a | ||
6 | process. The format is identical to /proc/pid/smaps, | ||
7 | except instead of an entry for each VMA in a process, | ||
8 | smaps_rollup has a single entry (tagged "[rollup]") | ||
9 | for which each field is the sum of the corresponding | ||
10 | fields from all the maps in /proc/pid/smaps. | ||
11 | For more details, see the procfs man page. | ||
12 | |||
13 | Typical output looks like this: | ||
14 | |||
15 | 00100000-ff709000 ---p 00000000 00:00 0 [rollup] | ||
16 | Rss: 884 kB | ||
17 | Pss: 385 kB | ||
18 | Shared_Clean: 696 kB | ||
19 | Shared_Dirty: 0 kB | ||
20 | Private_Clean: 120 kB | ||
21 | Private_Dirty: 68 kB | ||
22 | Referenced: 884 kB | ||
23 | Anonymous: 68 kB | ||
24 | LazyFree: 0 kB | ||
25 | AnonHugePages: 0 kB | ||
26 | ShmemPmdMapped: 0 kB | ||
27 | Shared_Hugetlb: 0 kB | ||
28 | Private_Hugetlb: 0 kB | ||
29 | Swap: 0 kB | ||
30 | SwapPss: 0 kB | ||
31 | Locked: 385 kB | ||
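
As a rough illustration of how userspace could consume the new file (this snippet is
not part of the series), the following C sketch reads /proc/self/smaps_rollup and
prints the Pss line; a real consumer would parse every field and handle partial reads.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* smaps_rollup uses the same "Field:   value kB" layout as smaps */
            FILE *f = fopen("/proc/self/smaps_rollup", "r");
            char line[256];

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            while (fgets(line, sizeof(line), f)) {
                    if (strncmp(line, "Pss:", 4) == 0)
                            fputs(line, stdout);    /* e.g. "Pss:    385 kB" */
            }
            fclose(f);
            return 0;
    }
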
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 451b6d882b2c..c1513c756af1 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram | |||
@@ -90,3 +90,11 @@ Description: | |||
90 | device's debugging info useful for kernel developers. Its | 90 | device's debugging info useful for kernel developers. Its |
91 | format is not documented intentionally and may change | 91 | format is not documented intentionally and may change |
92 | anytime without any notice. | 92 | anytime without any notice. |
93 | |||
94 | What: /sys/block/zram<id>/backing_dev | ||
95 | Date: June 2017 | ||
96 | Contact: Minchan Kim <minchan@kernel.org> | ||
97 | Description: | ||
98 | The backing_dev file is read-write and sets up the backing | ||
99 | device for zram to write out incompressible pages. | ||
100 | To use this feature, the user should enable CONFIG_ZRAM_WRITEBACK. | ||
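
The attribute is plain text, so a setup tool can simply write a block device path
to it. A minimal C sketch follows; /dev/sdb1 and zram0 are hypothetical names,
CONFIG_ZRAM_WRITEBACK must be enabled, and (per the zram.txt change further below)
the write has to happen before the disksize is set.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *dev = "/dev/sdb1";  /* hypothetical spare partition */
            int fd = open("/sys/block/zram0/backing_dev", O_WRONLY);

            if (fd < 0) {
                    perror("open backing_dev");
                    return 1;
            }
            if (write(fd, dev, strlen(dev)) < 0)
                    perror("write backing_dev");
            close(fd);
            return 0;
    }
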
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-swap b/Documentation/ABI/testing/sysfs-kernel-mm-swap new file mode 100644 index 000000000000..587db52084c7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-swap | |||
@@ -0,0 +1,26 @@ | |||
1 | What: /sys/kernel/mm/swap/ | ||
2 | Date: August 2017 | ||
3 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
4 | Description: Interface for swapping | ||
5 | |||
6 | What: /sys/kernel/mm/swap/vma_ra_enabled | ||
7 | Date: August 2017 | ||
8 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
9 | Description: Enable/disable VMA based swap readahead. | ||
10 | |||
11 | If set to true, the VMA based swap readahead algorithm | ||
12 | will be used for swappable anonymous pages mapped in a | ||
13 | VMA, and the global swap readahead algorithm will | ||
14 | still be used for other users such as tmpfs. If set to | ||
15 | false, the global swap readahead algorithm will be | ||
16 | used for all swappable pages. | ||
17 | |||
18 | What: /sys/kernel/mm/swap/vma_ra_max_order | ||
19 | Date: August 2017 | ||
20 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
21 | Description: The maximum readahead size, as a page order, for VMA based swap readahead | ||
22 | |||
23 | VMA based swap readahead algorithm will readahead at | ||
24 | most 1 << max_order pages for each readahead. The | ||
25 | real readahead size for each readahead will be scaled | ||
26 | according to the estimation algorithm. | ||
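
An illustrative sketch (not part of the series) of querying the two knobs above
from userspace; note that a vma_ra_max_order of, say, 3 means at most 1 << 3 = 8
pages are read ahead at a time.

    #include <stdio.h>

    static void show(const char *path)
    {
            char buf[64];
            FILE *f = fopen(path, "r");

            if (f && fgets(buf, sizeof(buf), f))
                    printf("%s = %s", path, buf);
            if (f)
                    fclose(f);
    }

    int main(void)
    {
            show("/sys/kernel/mm/swap/vma_ra_enabled");
            show("/sys/kernel/mm/swap/vma_ra_max_order");
            return 0;
    }
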
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6996b7727b85..86b0e8ec8ad7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt | |||
@@ -2783,7 +2783,7 @@ | |||
2783 | Allowed values are enable and disable | 2783 | Allowed values are enable and disable |
2784 | 2784 | ||
2785 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. | 2785 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. |
2786 | one of ['zone', 'node', 'default'] can be specified | 2786 | 'node', 'default' can be specified |
2787 | This can be set from sysctl after boot. | 2787 | This can be set from sysctl after boot. |
2788 | See Documentation/sysctl/vm.txt for details. | 2788 | See Documentation/sysctl/vm.txt for details. |
2789 | 2789 | ||
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 4fced8a21307..257e65714c6a 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt | |||
@@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations | |||
168 | comp_algorithm RW show and change the compression algorithm | 168 | comp_algorithm RW show and change the compression algorithm |
169 | compact WO trigger memory compaction | 169 | compact WO trigger memory compaction |
170 | debug_stat RO this file is used for zram debugging purposes | 170 | debug_stat RO this file is used for zram debugging purposes |
171 | backing_dev RW set up backend storage for zram to write out | ||
171 | 172 | ||
172 | 173 | ||
173 | User space is advised to use the following files to read the device statistics. | 174 | User space is advised to use the following files to read the device statistics. |
@@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace: | |||
231 | resets the disksize to zero. You must set the disksize again | 232 | resets the disksize to zero. You must set the disksize again |
232 | before reusing the device. | 233 | before reusing the device. |
233 | 234 | ||
235 | * Optional Feature | ||
236 | |||
237 | = writeback | ||
238 | |||
239 | With incompressible pages, there is no memory saving with zram. | ||
240 | Instead, with CONFIG_ZRAM_WRITEBACK, zram can write incompressible pages | ||
241 | to backing storage rather than keeping them in memory. | ||
242 | The user should set up the backing device via /sys/block/zramX/backing_dev | ||
243 | before setting the disksize. | ||
244 | |||
234 | Nitin Gupta | 245 | Nitin Gupta |
235 | ngupta@vflare.org | 246 | ngupta@vflare.org |
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt index aed6b94160b1..0eb31de3a2c1 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.txt | |||
@@ -151,8 +151,6 @@ To define an object, a structure of the following type should be filled out: | |||
151 | void (*mark_pages_cached)(void *cookie_netfs_data, | 151 | void (*mark_pages_cached)(void *cookie_netfs_data, |
152 | struct address_space *mapping, | 152 | struct address_space *mapping, |
153 | struct pagevec *cached_pvec); | 153 | struct pagevec *cached_pvec); |
154 | |||
155 | void (*now_uncached)(void *cookie_netfs_data); | ||
156 | }; | 154 | }; |
157 | 155 | ||
158 | This has the following fields: | 156 | This has the following fields: |
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index a7e6e14aeb08..3be3b266be41 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt | |||
@@ -63,9 +63,8 @@ Filesystem support consists of | |||
63 | - implementing an mmap file operation for DAX files which sets the | 63 | - implementing an mmap file operation for DAX files which sets the |
64 | VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to | 64 | VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to |
65 | include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These | 65 | include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These |
66 | handlers should probably call dax_iomap_fault() (for fault and page_mkwrite | 66 | handlers should probably call dax_iomap_fault() passing the appropriate |
67 | handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate | 67 | fault size and iomap operations. |
68 | iomap operations. | ||
69 | - calling iomap_zero_range() passing appropriate iomap operations instead of | 68 | - calling iomap_zero_range() passing appropriate iomap operations instead of |
70 | block_truncate_page() for DAX files | 69 | block_truncate_page() for DAX files |
71 | - ensuring that there is sufficient locking between reads, writes, | 70 | - ensuring that there is sufficient locking between reads, writes, |
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 48244c42ff52..9baf66a9ef4e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information. | |||
572 | 572 | ||
573 | numa_zonelist_order | 573 | numa_zonelist_order |
574 | 574 | ||
575 | This sysctl is only for NUMA. | 575 | This sysctl is only for NUMA and it is deprecated. Anything but |
576 | Node order will fail! | ||
577 | |||
576 | 'where the memory is allocated from' is controlled by zonelists. | 578 | 'where the memory is allocated from' is controlled by zonelists. |
577 | (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. | 579 | (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. |
578 | you may be able to read ZONE_DMA as ZONE_DMA32...) | 580 | you may be able to read ZONE_DMA as ZONE_DMA32...) |
diff --git a/Documentation/vm/numa b/Documentation/vm/numa index a08f71647714..a31b85b9bb88 100644 --- a/Documentation/vm/numa +++ b/Documentation/vm/numa | |||
@@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations | |||
79 | fall back to the same zone type on a different node, or to a different zone | 79 | fall back to the same zone type on a different node, or to a different zone |
80 | type on the same node. This is an important consideration because some zones, | 80 | type on the same node. This is an important consideration because some zones, |
81 | such as DMA or DMA32, represent relatively scarce resources. Linux chooses | 81 | such as DMA or DMA32, represent relatively scarce resources. Linux chooses |
82 | a default zonelist order based on the sizes of the various zone types relative | 82 | a default Node ordered zonelist. This means it tries to fallback to other zones |
83 | to the total memory of the node and the total memory of the system. The | 83 | from the same node before using remote nodes which are ordered by NUMA distance. |
84 | default zonelist order may be overridden using the numa_zonelist_order kernel | ||
85 | boot parameter or sysctl. [see Documentation/admin-guide/kernel-parameters.rst and | ||
86 | Documentation/sysctl/vm.txt] | ||
87 | 84 | ||
88 | By default, Linux will attempt to satisfy memory allocation requests from the | 85 | By default, Linux will attempt to satisfy memory allocation requests from the |
89 | node to which the CPU that executes the request is assigned. Specifically, | 86 | node to which the CPU that executes the request is assigned. Specifically, |
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt new file mode 100644 index 000000000000..d5960c9124f5 --- /dev/null +++ b/Documentation/vm/swap_numa.txt | |||
@@ -0,0 +1,69 @@ | |||
1 | Automatically bind swap device to numa node | ||
2 | ------------------------------------------- | ||
3 | |||
4 | If the system has more than one swap device and swap device has the node | ||
5 | information, we can make use of this information to decide which swap | ||
6 | device to use in get_swap_pages() to get better performance. | ||
7 | |||
8 | |||
9 | How to use this feature | ||
10 | ----------------------- | ||
11 | |||
12 | Each swap device has a priority, which decides the order in which devices are | ||
13 | used. To make use of automatic binding, there is no need to manipulate priority | ||
14 | settings for swap devices. For example, on a 2 node machine, assume 2 swap | ||
15 | devices, swapA attached to node 0 and swapB attached to node 1, are going | ||
16 | to be swapped on. Simply swap them on by doing: | ||
17 | # swapon /dev/swapA | ||
18 | # swapon /dev/swapB | ||
19 | |||
20 | Then node 0 will use the two swap devices in the order of swapA then swapB and | ||
21 | node 1 will use the two swap devices in the order of swapB then swapA. Note | ||
22 | that the order of them being swapped on doesn't matter. | ||
23 | |||
24 | A more complex example on a 4 node machine. Assume 6 swap devices are going to | ||
25 | be swapped on: swapA and swapB are attached to node 0, swapC is attached to | ||
26 | node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3. | ||
27 | The way to swap them on is the same as above: | ||
28 | # swapon /dev/swapA | ||
29 | # swapon /dev/swapB | ||
30 | # swapon /dev/swapC | ||
31 | # swapon /dev/swapD | ||
32 | # swapon /dev/swapE | ||
33 | # swapon /dev/swapF | ||
34 | |||
35 | Then node 0 will use them in the order of: | ||
36 | swapA/swapB -> swapC -> swapD -> swapE -> swapF | ||
37 | swapA and swapB will be used in a round robin mode before any other swap device. | ||
38 | |||
39 | node 1 will use them in the order of: | ||
40 | swapC -> swapA -> swapB -> swapD -> swapE -> swapF | ||
41 | |||
42 | node 2 will use them in the order of: | ||
43 | swapD/swapE -> swapA -> swapB -> swapC -> swapF | ||
44 | Similarly, swapD and swapE will be used in a round robin mode before any | ||
45 | other swap devices. | ||
46 | |||
47 | node 3 will use them in the order of: | ||
48 | swapF -> swapA -> swapB -> swapC -> swapD -> swapE | ||
49 | |||
50 | |||
51 | Implementation details | ||
52 | ---------------------- | ||
53 | |||
54 | The current code uses a priority based list, swap_avail_list, to decide | ||
55 | which swap device to use and if multiple swap devices share the same | ||
56 | priority, they are used round robin. This change replaces the single | ||
57 | global swap_avail_list with a per-numa-node list, i.e. each numa node | ||
58 | sees its own priority based list of available swap devices. A swap | ||
59 | device's priority can be promoted on its matching node's swap_avail_list. | ||
60 | |||
61 | Currently, a swap device's priority is set as follows: the user can set a | ||
62 | value >= 0, or the system will pick one starting from -1 and going downwards. | ||
63 | The priority value in the swap_avail_list is the negation of the swap device's | ||
64 | priority, because plists are sorted from low to high. The new policy doesn't | ||
65 | change the semantics for the priority >= 0 cases; the previous "starting from | ||
66 | -1 then downwards" now becomes "starting from -2 then downwards", and -1 is | ||
67 | reserved as the promoted value. So if multiple swap devices are attached to the same | ||
68 | node, they will all be promoted to priority -1 on that node's plist and will | ||
69 | be used round robin before any other swap devices. | ||
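
To make the priority arithmetic concrete, here is a toy C model of the rule
described above (not the kernel's code or data structures, just the effective
plist key a device would get on a given node's list):

    /* hypothetical helper: lower returned key == tried earlier on that node */
    static int plist_key(int dev_prio, int dev_node, int node)
    {
            /* system-assigned priorities (< 0) are promoted to -1 on the
             * device's own node; user-set priorities (>= 0) keep the old
             * semantics */
            if (dev_prio < 0 && dev_node == node)
                    dev_prio = -1;
            /* plist sorts low to high, so store the negated priority:
             * user prio 5 -> -5, promoted -1 -> 1, system -2 -> 2, ... */
            return -dev_prio;
    }
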
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 02760f6e6ca4..3b26cc62dadb 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h | |||
@@ -64,20 +64,12 @@ | |||
64 | overrides the coredump filter bits */ | 64 | overrides the coredump filter bits */ |
65 | #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ | 65 | #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ |
66 | 66 | ||
67 | #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ | ||
68 | #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ | ||
69 | |||
67 | /* compatibility flags */ | 70 | /* compatibility flags */ |
68 | #define MAP_FILE 0 | 71 | #define MAP_FILE 0 |
69 | 72 | ||
70 | /* | ||
71 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
72 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
73 | * spaces. | ||
74 | * | ||
75 | * Assume these are all power of twos. | ||
76 | * When 0 use the default page size. | ||
77 | */ | ||
78 | #define MAP_HUGE_SHIFT 26 | ||
79 | #define MAP_HUGE_MASK 0x3f | ||
80 | |||
81 | #define PKEY_DISABLE_ACCESS 0x1 | 73 | #define PKEY_DISABLE_ACCESS 0x1 |
82 | #define PKEY_DISABLE_WRITE 0x2 | 74 | #define PKEY_DISABLE_WRITE 0x2 |
83 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ | 75 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ |
diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index e95f874ded1b..707c7f7b6bea 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h | |||
@@ -4,7 +4,6 @@ | |||
4 | #ifdef CONFIG_NUMA | 4 | #ifdef CONFIG_NUMA |
5 | 5 | ||
6 | #define cpu_to_node(cpu) ((void)(cpu), 0) | 6 | #define cpu_to_node(cpu) ((void)(cpu), 0) |
7 | #define parent_node(node) ((void)(node), 0) | ||
8 | 7 | ||
9 | #define cpumask_of_node(node) ((void)node, cpu_online_mask) | 8 | #define cpumask_of_node(node) ((void)node, cpu_online_mask) |
10 | 9 | ||
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 655e2fb5395b..da3216007fe0 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h | |||
@@ -91,20 +91,12 @@ | |||
91 | overrides the coredump filter bits */ | 91 | overrides the coredump filter bits */ |
92 | #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ | 92 | #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ |
93 | 93 | ||
94 | #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ | ||
95 | #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ | ||
96 | |||
94 | /* compatibility flags */ | 97 | /* compatibility flags */ |
95 | #define MAP_FILE 0 | 98 | #define MAP_FILE 0 |
96 | 99 | ||
97 | /* | ||
98 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
99 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
100 | * spaces. | ||
101 | * | ||
102 | * Assume these are all power of twos. | ||
103 | * When 0 use the default page size. | ||
104 | */ | ||
105 | #define MAP_HUGE_SHIFT 26 | ||
106 | #define MAP_HUGE_MASK 0x3f | ||
107 | |||
108 | #define PKEY_DISABLE_ACCESS 0x1 | 100 | #define PKEY_DISABLE_ACCESS 0x1 |
109 | #define PKEY_DISABLE_WRITE 0x2 | 101 | #define PKEY_DISABLE_WRITE 0x2 |
110 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ | 102 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ |
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 9a9c2fe4be50..775b5d5e41a1 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h | |||
@@ -57,6 +57,9 @@ | |||
57 | overrides the coredump filter bits */ | 57 | overrides the coredump filter bits */ |
58 | #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ | 58 | #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ |
59 | 59 | ||
60 | #define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ | ||
61 | #define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ | ||
62 | |||
60 | #define MADV_HWPOISON 100 /* poison a page for testing */ | 63 | #define MADV_HWPOISON 100 /* poison a page for testing */ |
61 | #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ | 64 | #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ |
62 | 65 | ||
@@ -64,17 +67,6 @@ | |||
64 | #define MAP_FILE 0 | 67 | #define MAP_FILE 0 |
65 | #define MAP_VARIABLE 0 | 68 | #define MAP_VARIABLE 0 |
66 | 69 | ||
67 | /* | ||
68 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
69 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
70 | * spaces. | ||
71 | * | ||
72 | * Assume these are all power of twos. | ||
73 | * When 0 use the default page size. | ||
74 | */ | ||
75 | #define MAP_HUGE_SHIFT 26 | ||
76 | #define MAP_HUGE_MASK 0x3f | ||
77 | |||
78 | #define PKEY_DISABLE_ACCESS 0x1 | 70 | #define PKEY_DISABLE_ACCESS 0x1 |
79 | #define PKEY_DISABLE_WRITE 0x2 | 71 | #define PKEY_DISABLE_WRITE 0x2 |
80 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ | 72 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ |
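
For context, a hedged userspace sketch (not part of the series) of the new madvise
flags: mark an anonymous mapping wipe-on-fork so a child never inherits its
contents, e.g. for a secret or RNG pool. The value 18 is what most architectures
in this series use; parisc, as shown above, uses 71.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MADV_WIPEONFORK
    #define MADV_WIPEONFORK 18      /* zero the range in the child after fork */
    #endif

    int main(void)
    {
            size_t len = 4096;
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED || madvise(p, len, MADV_WIPEONFORK))
                    perror("mmap/madvise");
            /* after fork(), the child sees this range zero-filled;
             * MADV_KEEPONFORK (19) would undo the marking */
            return 0;
    }
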
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index ab45cc2f3101..03c06ba7464f 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h | |||
@@ -29,20 +29,4 @@ | |||
29 | #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ | 29 | #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ |
30 | #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ | 30 | #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ |
31 | 31 | ||
32 | /* | ||
33 | * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2), | ||
34 | * encode the log2 of the huge page size. A value of zero indicates that the | ||
35 | * default huge page size should be used. To use a non-default huge page size, | ||
36 | * one of these defines can be used, or the size can be encoded by hand. Note | ||
37 | * that on most systems only a subset, or possibly none, of these sizes will be | ||
38 | * available. | ||
39 | */ | ||
40 | #define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */ | ||
41 | #define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */ | ||
42 | #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */ | ||
43 | #define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */ | ||
44 | #define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */ | ||
45 | #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */ | ||
46 | #define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */ | ||
47 | |||
48 | #endif /* _UAPI_ASM_POWERPC_MMAN_H */ | 32 | #endif /* _UAPI_ASM_POWERPC_MMAN_H */ |
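
The MAP_HUGE_* constants removed from the per-arch headers here describe an
encoding that is easy to show from userspace: bits [26:31] of the mmap() flags
carry log2 of the requested huge page size. A brief illustrative sketch (2MB
shown; whether it succeeds depends on the system's hugetlb configuration):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT 26                       /* bits [26:31] of the flags */
    #endif
    #define MAP_HUGE_2MB   (21 << MAP_HUGE_SHIFT)   /* log2(2MB) == 21 */

    int main(void)
    {
            size_t len = 2UL << 20;                 /* one 2MB huge page */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
                           -1, 0);

            if (p == MAP_FAILED)
                    perror("mmap");                 /* no hugetlb pages reserved? */
            else
                    munmap(p, len);
            return 0;
    }
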
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index acb366bf6bc1..4b278a33ccbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -1806,7 +1806,9 @@ config X86_SMAP | |||
1806 | config X86_INTEL_MPX | 1806 | config X86_INTEL_MPX |
1807 | prompt "Intel MPX (Memory Protection Extensions)" | 1807 | prompt "Intel MPX (Memory Protection Extensions)" |
1808 | def_bool n | 1808 | def_bool n |
1809 | depends on CPU_SUP_INTEL | 1809 | # Note: only available in 64-bit mode due to VMA flags shortage |
1810 | depends on CPU_SUP_INTEL && X86_64 | ||
1811 | select ARCH_USES_HIGH_VMA_FLAGS | ||
1810 | ---help--- | 1812 | ---help--- |
1811 | MPX provides hardware features that can be used in | 1813 | MPX provides hardware features that can be used in |
1812 | conjunction with compiler-instrumented code to check | 1814 | conjunction with compiler-instrumented code to check |
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 39bca7fac087..3be08f07695c 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h | |||
@@ -3,9 +3,6 @@ | |||
3 | 3 | ||
4 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ | 4 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ |
5 | 5 | ||
6 | #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) | ||
7 | #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) | ||
8 | |||
9 | #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS | 6 | #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS |
10 | /* | 7 | /* |
11 | * Take the 4 protection key bits out of the vma->vm_flags | 8 | * Take the 4 protection key bits out of the vma->vm_flags |
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 24365b30aae9..b15b278aa314 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h | |||
@@ -103,20 +103,12 @@ | |||
103 | overrides the coredump filter bits */ | 103 | overrides the coredump filter bits */ |
104 | #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ | 104 | #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ |
105 | 105 | ||
106 | #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ | ||
107 | #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ | ||
108 | |||
106 | /* compatibility flags */ | 109 | /* compatibility flags */ |
107 | #define MAP_FILE 0 | 110 | #define MAP_FILE 0 |
108 | 111 | ||
109 | /* | ||
110 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
111 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
112 | * spaces. | ||
113 | * | ||
114 | * Assume these are all power of twos. | ||
115 | * When 0 use the default page size. | ||
116 | */ | ||
117 | #define MAP_HUGE_SHIFT 26 | ||
118 | #define MAP_HUGE_MASK 0x3f | ||
119 | |||
120 | #define PKEY_DISABLE_ACCESS 0x1 | 112 | #define PKEY_DISABLE_ACCESS 0x1 |
121 | #define PKEY_DISABLE_WRITE 0x2 | 113 | #define PKEY_DISABLE_WRITE 0x2 |
122 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ | 114 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c7c4e0325cdb..4e3b61cda520 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -388,6 +388,19 @@ static ssize_t show_phys_device(struct device *dev, | |||
388 | } | 388 | } |
389 | 389 | ||
390 | #ifdef CONFIG_MEMORY_HOTREMOVE | 390 | #ifdef CONFIG_MEMORY_HOTREMOVE |
391 | static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn, | ||
392 | unsigned long nr_pages, int online_type, | ||
393 | struct zone *default_zone) | ||
394 | { | ||
395 | struct zone *zone; | ||
396 | |||
397 | zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); | ||
398 | if (zone != default_zone) { | ||
399 | strcat(buf, " "); | ||
400 | strcat(buf, zone->name); | ||
401 | } | ||
402 | } | ||
403 | |||
391 | static ssize_t show_valid_zones(struct device *dev, | 404 | static ssize_t show_valid_zones(struct device *dev, |
392 | struct device_attribute *attr, char *buf) | 405 | struct device_attribute *attr, char *buf) |
393 | { | 406 | { |
@@ -395,7 +408,7 @@ static ssize_t show_valid_zones(struct device *dev, | |||
395 | unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); | 408 | unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); |
396 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | 409 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; |
397 | unsigned long valid_start_pfn, valid_end_pfn; | 410 | unsigned long valid_start_pfn, valid_end_pfn; |
398 | bool append = false; | 411 | struct zone *default_zone; |
399 | int nid; | 412 | int nid; |
400 | 413 | ||
401 | /* | 414 | /* |
@@ -418,16 +431,13 @@ static ssize_t show_valid_zones(struct device *dev, | |||
418 | } | 431 | } |
419 | 432 | ||
420 | nid = pfn_to_nid(start_pfn); | 433 | nid = pfn_to_nid(start_pfn); |
421 | if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { | 434 | default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages); |
422 | strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); | 435 | strcat(buf, default_zone->name); |
423 | append = true; | ||
424 | } | ||
425 | 436 | ||
426 | if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { | 437 | print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, |
427 | if (append) | 438 | default_zone); |
428 | strcat(buf, " "); | 439 | print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE, |
429 | strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); | 440 | default_zone); |
430 | } | ||
431 | out: | 441 | out: |
432 | strcat(buf, "\n"); | 442 | strcat(buf, "\n"); |
433 | 443 | ||
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 104b71c0490d..5d9ed0616413 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c | |||
@@ -326,7 +326,11 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, | |||
326 | struct page *page, bool is_write) | 326 | struct page *page, bool is_write) |
327 | { | 327 | { |
328 | struct brd_device *brd = bdev->bd_disk->private_data; | 328 | struct brd_device *brd = bdev->bd_disk->private_data; |
329 | int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector); | 329 | int err; |
330 | |||
331 | if (PageTransHuge(page)) | ||
332 | return -ENOTSUPP; | ||
333 | err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector); | ||
330 | page_endio(page, is_write, err); | 334 | page_endio(page, is_write, err); |
331 | return err; | 335 | return err; |
332 | } | 336 | } |
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index b8ecba6dcd3b..7cd4a8ec3c8f 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig | |||
@@ -13,3 +13,15 @@ config ZRAM | |||
13 | disks and maybe many more. | 13 | disks and maybe many more. |
14 | 14 | ||
15 | See zram.txt for more information. | 15 | See zram.txt for more information. |
16 | |||
17 | config ZRAM_WRITEBACK | ||
18 | bool "Write back incompressible page to backing device" | ||
19 | depends on ZRAM | ||
20 | default n | ||
21 | help | ||
22 | With an incompressible page, there is no memory saving in keeping it | ||
23 | in memory. Instead, write it out to the backing device. | ||
24 | For this feature, the admin should set up the backing device via | ||
25 | /sys/block/zramX/backing_dev. | ||
26 | |||
27 | See zram.txt for more information. | ||
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3b1b6340ba13..4a0438c4ef2a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
@@ -270,6 +270,349 @@ static ssize_t mem_used_max_store(struct device *dev, | |||
270 | return len; | 270 | return len; |
271 | } | 271 | } |
272 | 272 | ||
273 | #ifdef CONFIG_ZRAM_WRITEBACK | ||
274 | static bool zram_wb_enabled(struct zram *zram) | ||
275 | { | ||
276 | return zram->backing_dev; | ||
277 | } | ||
278 | |||
279 | static void reset_bdev(struct zram *zram) | ||
280 | { | ||
281 | struct block_device *bdev; | ||
282 | |||
283 | if (!zram_wb_enabled(zram)) | ||
284 | return; | ||
285 | |||
286 | bdev = zram->bdev; | ||
287 | if (zram->old_block_size) | ||
288 | set_blocksize(bdev, zram->old_block_size); | ||
289 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); | ||
290 | /* hope filp_close flush all of IO */ | ||
291 | filp_close(zram->backing_dev, NULL); | ||
292 | zram->backing_dev = NULL; | ||
293 | zram->old_block_size = 0; | ||
294 | zram->bdev = NULL; | ||
295 | |||
296 | kvfree(zram->bitmap); | ||
297 | zram->bitmap = NULL; | ||
298 | } | ||
299 | |||
300 | static ssize_t backing_dev_show(struct device *dev, | ||
301 | struct device_attribute *attr, char *buf) | ||
302 | { | ||
303 | struct zram *zram = dev_to_zram(dev); | ||
304 | struct file *file = zram->backing_dev; | ||
305 | char *p; | ||
306 | ssize_t ret; | ||
307 | |||
308 | down_read(&zram->init_lock); | ||
309 | if (!zram_wb_enabled(zram)) { | ||
310 | memcpy(buf, "none\n", 5); | ||
311 | up_read(&zram->init_lock); | ||
312 | return 5; | ||
313 | } | ||
314 | |||
315 | p = file_path(file, buf, PAGE_SIZE - 1); | ||
316 | if (IS_ERR(p)) { | ||
317 | ret = PTR_ERR(p); | ||
318 | goto out; | ||
319 | } | ||
320 | |||
321 | ret = strlen(p); | ||
322 | memmove(buf, p, ret); | ||
323 | buf[ret++] = '\n'; | ||
324 | out: | ||
325 | up_read(&zram->init_lock); | ||
326 | return ret; | ||
327 | } | ||
328 | |||
329 | static ssize_t backing_dev_store(struct device *dev, | ||
330 | struct device_attribute *attr, const char *buf, size_t len) | ||
331 | { | ||
332 | char *file_name; | ||
333 | struct file *backing_dev = NULL; | ||
334 | struct inode *inode; | ||
335 | struct address_space *mapping; | ||
336 | unsigned int bitmap_sz, old_block_size = 0; | ||
337 | unsigned long nr_pages, *bitmap = NULL; | ||
338 | struct block_device *bdev = NULL; | ||
339 | int err; | ||
340 | struct zram *zram = dev_to_zram(dev); | ||
341 | |||
342 | file_name = kmalloc(PATH_MAX, GFP_KERNEL); | ||
343 | if (!file_name) | ||
344 | return -ENOMEM; | ||
345 | |||
346 | down_write(&zram->init_lock); | ||
347 | if (init_done(zram)) { | ||
348 | pr_info("Can't setup backing device for initialized device\n"); | ||
349 | err = -EBUSY; | ||
350 | goto out; | ||
351 | } | ||
352 | |||
353 | strlcpy(file_name, buf, len); | ||
354 | |||
355 | backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); | ||
356 | if (IS_ERR(backing_dev)) { | ||
357 | err = PTR_ERR(backing_dev); | ||
358 | backing_dev = NULL; | ||
359 | goto out; | ||
360 | } | ||
361 | |||
362 | mapping = backing_dev->f_mapping; | ||
363 | inode = mapping->host; | ||
364 | |||
365 | /* Support only block device in this moment */ | ||
366 | if (!S_ISBLK(inode->i_mode)) { | ||
367 | err = -ENOTBLK; | ||
368 | goto out; | ||
369 | } | ||
370 | |||
371 | bdev = bdgrab(I_BDEV(inode)); | ||
372 | err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram); | ||
373 | if (err < 0) | ||
374 | goto out; | ||
375 | |||
376 | nr_pages = i_size_read(inode) >> PAGE_SHIFT; | ||
377 | bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); | ||
378 | bitmap = kvzalloc(bitmap_sz, GFP_KERNEL); | ||
379 | if (!bitmap) { | ||
380 | err = -ENOMEM; | ||
381 | goto out; | ||
382 | } | ||
383 | |||
384 | old_block_size = block_size(bdev); | ||
385 | err = set_blocksize(bdev, PAGE_SIZE); | ||
386 | if (err) | ||
387 | goto out; | ||
388 | |||
389 | reset_bdev(zram); | ||
390 | spin_lock_init(&zram->bitmap_lock); | ||
391 | |||
392 | zram->old_block_size = old_block_size; | ||
393 | zram->bdev = bdev; | ||
394 | zram->backing_dev = backing_dev; | ||
395 | zram->bitmap = bitmap; | ||
396 | zram->nr_pages = nr_pages; | ||
397 | up_write(&zram->init_lock); | ||
398 | |||
399 | pr_info("setup backing device %s\n", file_name); | ||
400 | kfree(file_name); | ||
401 | |||
402 | return len; | ||
403 | out: | ||
404 | if (bitmap) | ||
405 | kvfree(bitmap); | ||
406 | |||
407 | if (bdev) | ||
408 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | ||
409 | |||
410 | if (backing_dev) | ||
411 | filp_close(backing_dev, NULL); | ||
412 | |||
413 | up_write(&zram->init_lock); | ||
414 | |||
415 | kfree(file_name); | ||
416 | |||
417 | return err; | ||
418 | } | ||
419 | |||
420 | static unsigned long get_entry_bdev(struct zram *zram) | ||
421 | { | ||
422 | unsigned long entry; | ||
423 | |||
424 | spin_lock(&zram->bitmap_lock); | ||
425 | /* skip 0 bit to confuse zram.handle = 0 */ | ||
426 | entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1); | ||
427 | if (entry == zram->nr_pages) { | ||
428 | spin_unlock(&zram->bitmap_lock); | ||
429 | return 0; | ||
430 | } | ||
431 | |||
432 | set_bit(entry, zram->bitmap); | ||
433 | spin_unlock(&zram->bitmap_lock); | ||
434 | |||
435 | return entry; | ||
436 | } | ||
437 | |||
438 | static void put_entry_bdev(struct zram *zram, unsigned long entry) | ||
439 | { | ||
440 | int was_set; | ||
441 | |||
442 | spin_lock(&zram->bitmap_lock); | ||
443 | was_set = test_and_clear_bit(entry, zram->bitmap); | ||
444 | spin_unlock(&zram->bitmap_lock); | ||
445 | WARN_ON_ONCE(!was_set); | ||
446 | } | ||
447 | |||
448 | void zram_page_end_io(struct bio *bio) | ||
449 | { | ||
450 | struct page *page = bio->bi_io_vec[0].bv_page; | ||
451 | |||
452 | page_endio(page, op_is_write(bio_op(bio)), | ||
453 | blk_status_to_errno(bio->bi_status)); | ||
454 | bio_put(bio); | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * Returns 1 if the submission is successful. | ||
459 | */ | ||
460 | static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, | ||
461 | unsigned long entry, struct bio *parent) | ||
462 | { | ||
463 | struct bio *bio; | ||
464 | |||
465 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
466 | if (!bio) | ||
467 | return -ENOMEM; | ||
468 | |||
469 | bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); | ||
470 | bio->bi_bdev = zram->bdev; | ||
471 | if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { | ||
472 | bio_put(bio); | ||
473 | return -EIO; | ||
474 | } | ||
475 | |||
476 | if (!parent) { | ||
477 | bio->bi_opf = REQ_OP_READ; | ||
478 | bio->bi_end_io = zram_page_end_io; | ||
479 | } else { | ||
480 | bio->bi_opf = parent->bi_opf; | ||
481 | bio_chain(bio, parent); | ||
482 | } | ||
483 | |||
484 | submit_bio(bio); | ||
485 | return 1; | ||
486 | } | ||
487 | |||
488 | struct zram_work { | ||
489 | struct work_struct work; | ||
490 | struct zram *zram; | ||
491 | unsigned long entry; | ||
492 | struct bio *bio; | ||
493 | }; | ||
494 | |||
495 | #if PAGE_SIZE != 4096 | ||
496 | static void zram_sync_read(struct work_struct *work) | ||
497 | { | ||
498 | struct bio_vec bvec; | ||
499 | struct zram_work *zw = container_of(work, struct zram_work, work); | ||
500 | struct zram *zram = zw->zram; | ||
501 | unsigned long entry = zw->entry; | ||
502 | struct bio *bio = zw->bio; | ||
503 | |||
504 | read_from_bdev_async(zram, &bvec, entry, bio); | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * The block layer wants one ->make_request_fn to be active at a time, | ||
509 | * so if we use chained IO with the parent IO in the same context, | ||
510 | * it's a deadlock. To avoid it, we use a worker thread context. | ||
511 | */ | ||
512 | static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, | ||
513 | unsigned long entry, struct bio *bio) | ||
514 | { | ||
515 | struct zram_work work; | ||
516 | |||
517 | work.zram = zram; | ||
518 | work.entry = entry; | ||
519 | work.bio = bio; | ||
520 | |||
521 | INIT_WORK_ONSTACK(&work.work, zram_sync_read); | ||
522 | queue_work(system_unbound_wq, &work.work); | ||
523 | flush_work(&work.work); | ||
524 | destroy_work_on_stack(&work.work); | ||
525 | |||
526 | return 1; | ||
527 | } | ||
528 | #else | ||
529 | static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, | ||
530 | unsigned long entry, struct bio *bio) | ||
531 | { | ||
532 | WARN_ON(1); | ||
533 | return -EIO; | ||
534 | } | ||
535 | #endif | ||
536 | |||
537 | static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, | ||
538 | unsigned long entry, struct bio *parent, bool sync) | ||
539 | { | ||
540 | if (sync) | ||
541 | return read_from_bdev_sync(zram, bvec, entry, parent); | ||
542 | else | ||
543 | return read_from_bdev_async(zram, bvec, entry, parent); | ||
544 | } | ||
545 | |||
546 | static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, | ||
547 | u32 index, struct bio *parent, | ||
548 | unsigned long *pentry) | ||
549 | { | ||
550 | struct bio *bio; | ||
551 | unsigned long entry; | ||
552 | |||
553 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
554 | if (!bio) | ||
555 | return -ENOMEM; | ||
556 | |||
557 | entry = get_entry_bdev(zram); | ||
558 | if (!entry) { | ||
559 | bio_put(bio); | ||
560 | return -ENOSPC; | ||
561 | } | ||
562 | |||
563 | bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); | ||
564 | bio->bi_bdev = zram->bdev; | ||
565 | if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, | ||
566 | bvec->bv_offset)) { | ||
567 | bio_put(bio); | ||
568 | put_entry_bdev(zram, entry); | ||
569 | return -EIO; | ||
570 | } | ||
571 | |||
572 | if (!parent) { | ||
573 | bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; | ||
574 | bio->bi_end_io = zram_page_end_io; | ||
575 | } else { | ||
576 | bio->bi_opf = parent->bi_opf; | ||
577 | bio_chain(bio, parent); | ||
578 | } | ||
579 | |||
580 | submit_bio(bio); | ||
581 | *pentry = entry; | ||
582 | |||
583 | return 0; | ||
584 | } | ||
585 | |||
586 | static void zram_wb_clear(struct zram *zram, u32 index) | ||
587 | { | ||
588 | unsigned long entry; | ||
589 | |||
590 | zram_clear_flag(zram, index, ZRAM_WB); | ||
591 | entry = zram_get_element(zram, index); | ||
592 | zram_set_element(zram, index, 0); | ||
593 | put_entry_bdev(zram, entry); | ||
594 | } | ||
595 | |||
596 | #else | ||
597 | static bool zram_wb_enabled(struct zram *zram) { return false; } | ||
598 | static inline void reset_bdev(struct zram *zram) {}; | ||
599 | static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, | ||
600 | u32 index, struct bio *parent, | ||
601 | unsigned long *pentry) | ||
602 | |||
603 | { | ||
604 | return -EIO; | ||
605 | } | ||
606 | |||
607 | static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, | ||
608 | unsigned long entry, struct bio *parent, bool sync) | ||
609 | { | ||
610 | return -EIO; | ||
611 | } | ||
612 | static void zram_wb_clear(struct zram *zram, u32 index) {} | ||
613 | #endif | ||
614 | |||
615 | |||
273 | /* | 616 | /* |
274 | * We switched to per-cpu streams and this attr is not needed anymore. | 617 | * We switched to per-cpu streams and this attr is not needed anymore. |
275 | * However, we will keep it around for some time, because: | 618 | * However, we will keep it around for some time, because: |
@@ -453,30 +796,6 @@ static bool zram_same_page_read(struct zram *zram, u32 index, | |||
453 | return false; | 796 | return false; |
454 | } | 797 | } |
455 | 798 | ||
456 | static bool zram_same_page_write(struct zram *zram, u32 index, | ||
457 | struct page *page) | ||
458 | { | ||
459 | unsigned long element; | ||
460 | void *mem = kmap_atomic(page); | ||
461 | |||
462 | if (page_same_filled(mem, &element)) { | ||
463 | kunmap_atomic(mem); | ||
464 | /* Free memory associated with this sector now. */ | ||
465 | zram_slot_lock(zram, index); | ||
466 | zram_free_page(zram, index); | ||
467 | zram_set_flag(zram, index, ZRAM_SAME); | ||
468 | zram_set_element(zram, index, element); | ||
469 | zram_slot_unlock(zram, index); | ||
470 | |||
471 | atomic64_inc(&zram->stats.same_pages); | ||
472 | atomic64_inc(&zram->stats.pages_stored); | ||
473 | return true; | ||
474 | } | ||
475 | kunmap_atomic(mem); | ||
476 | |||
477 | return false; | ||
478 | } | ||
479 | |||
480 | static void zram_meta_free(struct zram *zram, u64 disksize) | 799 | static void zram_meta_free(struct zram *zram, u64 disksize) |
481 | { | 800 | { |
482 | size_t num_pages = disksize >> PAGE_SHIFT; | 801 | size_t num_pages = disksize >> PAGE_SHIFT; |
@@ -515,7 +834,13 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) | |||
515 | */ | 834 | */ |
516 | static void zram_free_page(struct zram *zram, size_t index) | 835 | static void zram_free_page(struct zram *zram, size_t index) |
517 | { | 836 | { |
518 | unsigned long handle = zram_get_handle(zram, index); | 837 | unsigned long handle; |
838 | |||
839 | if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { | ||
840 | zram_wb_clear(zram, index); | ||
841 | atomic64_dec(&zram->stats.pages_stored); | ||
842 | return; | ||
843 | } | ||
519 | 844 | ||
520 | /* | 845 | /* |
521 | * No memory is allocated for same element filled pages. | 846 | * No memory is allocated for same element filled pages. |
@@ -529,6 +854,7 @@ static void zram_free_page(struct zram *zram, size_t index) | |||
529 | return; | 854 | return; |
530 | } | 855 | } |
531 | 856 | ||
857 | handle = zram_get_handle(zram, index); | ||
532 | if (!handle) | 858 | if (!handle) |
533 | return; | 859 | return; |
534 | 860 | ||
@@ -542,13 +868,31 @@ static void zram_free_page(struct zram *zram, size_t index) | |||
542 | zram_set_obj_size(zram, index, 0); | 868 | zram_set_obj_size(zram, index, 0); |
543 | } | 869 | } |
544 | 870 | ||
545 | static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) | 871 | static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, |
872 | struct bio *bio, bool partial_io) | ||
546 | { | 873 | { |
547 | int ret; | 874 | int ret; |
548 | unsigned long handle; | 875 | unsigned long handle; |
549 | unsigned int size; | 876 | unsigned int size; |
550 | void *src, *dst; | 877 | void *src, *dst; |
551 | 878 | ||
879 | if (zram_wb_enabled(zram)) { | ||
880 | zram_slot_lock(zram, index); | ||
881 | if (zram_test_flag(zram, index, ZRAM_WB)) { | ||
882 | struct bio_vec bvec; | ||
883 | |||
884 | zram_slot_unlock(zram, index); | ||
885 | |||
886 | bvec.bv_page = page; | ||
887 | bvec.bv_len = PAGE_SIZE; | ||
888 | bvec.bv_offset = 0; | ||
889 | return read_from_bdev(zram, &bvec, | ||
890 | zram_get_element(zram, index), | ||
891 | bio, partial_io); | ||
892 | } | ||
893 | zram_slot_unlock(zram, index); | ||
894 | } | ||
895 | |||
552 | if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) | 896 | if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) |
553 | return 0; | 897 | return 0; |
554 | 898 | ||
@@ -581,7 +925,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) | |||
581 | } | 925 | } |
582 | 926 | ||
583 | static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, | 927 | static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, |
584 | u32 index, int offset) | 928 | u32 index, int offset, struct bio *bio) |
585 | { | 929 | { |
586 | int ret; | 930 | int ret; |
587 | struct page *page; | 931 | struct page *page; |
@@ -594,7 +938,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, | |||
594 | return -ENOMEM; | 938 | return -ENOMEM; |
595 | } | 939 | } |
596 | 940 | ||
597 | ret = zram_decompress_page(zram, page, index); | 941 | ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec)); |
598 | if (unlikely(ret)) | 942 | if (unlikely(ret)) |
599 | goto out; | 943 | goto out; |
600 | 944 | ||
@@ -613,30 +957,57 @@ out: | |||
613 | return ret; | 957 | return ret; |
614 | } | 958 | } |
615 | 959 | ||
616 | static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, | 960 | static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, |
617 | struct page *page, | 961 | u32 index, struct bio *bio) |
618 | unsigned long *out_handle, unsigned int *out_comp_len) | ||
619 | { | 962 | { |
620 | int ret; | 963 | int ret = 0; |
621 | unsigned int comp_len; | ||
622 | void *src; | ||
623 | unsigned long alloced_pages; | 964 | unsigned long alloced_pages; |
624 | unsigned long handle = 0; | 965 | unsigned long handle = 0; |
966 | unsigned int comp_len = 0; | ||
967 | void *src, *dst, *mem; | ||
968 | struct zcomp_strm *zstrm; | ||
969 | struct page *page = bvec->bv_page; | ||
970 | unsigned long element = 0; | ||
971 | enum zram_pageflags flags = 0; | ||
972 | bool allow_wb = true; | ||
973 | |||
974 | mem = kmap_atomic(page); | ||
975 | if (page_same_filled(mem, &element)) { | ||
976 | kunmap_atomic(mem); | ||
977 | /* Free memory associated with this sector now. */ | ||
978 | flags = ZRAM_SAME; | ||
979 | atomic64_inc(&zram->stats.same_pages); | ||
980 | goto out; | ||
981 | } | ||
982 | kunmap_atomic(mem); | ||
625 | 983 | ||
626 | compress_again: | 984 | compress_again: |
985 | zstrm = zcomp_stream_get(zram->comp); | ||
627 | src = kmap_atomic(page); | 986 | src = kmap_atomic(page); |
628 | ret = zcomp_compress(*zstrm, src, &comp_len); | 987 | ret = zcomp_compress(zstrm, src, &comp_len); |
629 | kunmap_atomic(src); | 988 | kunmap_atomic(src); |
630 | 989 | ||
631 | if (unlikely(ret)) { | 990 | if (unlikely(ret)) { |
991 | zcomp_stream_put(zram->comp); | ||
632 | pr_err("Compression failed! err=%d\n", ret); | 992 | pr_err("Compression failed! err=%d\n", ret); |
633 | if (handle) | 993 | zs_free(zram->mem_pool, handle); |
634 | zs_free(zram->mem_pool, handle); | ||
635 | return ret; | 994 | return ret; |
636 | } | 995 | } |
637 | 996 | ||
638 | if (unlikely(comp_len > max_zpage_size)) | 997 | if (unlikely(comp_len > max_zpage_size)) { |
998 | if (zram_wb_enabled(zram) && allow_wb) { | ||
999 | zcomp_stream_put(zram->comp); | ||
1000 | ret = write_to_bdev(zram, bvec, index, bio, &element); | ||
1001 | if (!ret) { | ||
1002 | flags = ZRAM_WB; | ||
1003 | ret = 1; | ||
1004 | goto out; | ||
1005 | } | ||
1006 | allow_wb = false; | ||
1007 | goto compress_again; | ||
1008 | } | ||
639 | comp_len = PAGE_SIZE; | 1009 | comp_len = PAGE_SIZE; |
1010 | } | ||
640 | 1011 | ||
641 | /* | 1012 | /* |
642 | * handle allocation has 2 paths: | 1013 | * handle allocation has 2 paths: |
@@ -663,7 +1034,6 @@ compress_again: | |||
663 | handle = zs_malloc(zram->mem_pool, comp_len, | 1034 | handle = zs_malloc(zram->mem_pool, comp_len, |
664 | GFP_NOIO | __GFP_HIGHMEM | | 1035 | GFP_NOIO | __GFP_HIGHMEM | |
665 | __GFP_MOVABLE); | 1036 | __GFP_MOVABLE); |
666 | *zstrm = zcomp_stream_get(zram->comp); | ||
667 | if (handle) | 1037 | if (handle) |
668 | goto compress_again; | 1038 | goto compress_again; |
669 | return -ENOMEM; | 1039 | return -ENOMEM; |
@@ -673,34 +1043,11 @@ compress_again: | |||
673 | update_used_max(zram, alloced_pages); | 1043 | update_used_max(zram, alloced_pages); |
674 | 1044 | ||
675 | if (zram->limit_pages && alloced_pages > zram->limit_pages) { | 1045 | if (zram->limit_pages && alloced_pages > zram->limit_pages) { |
1046 | zcomp_stream_put(zram->comp); | ||
676 | zs_free(zram->mem_pool, handle); | 1047 | zs_free(zram->mem_pool, handle); |
677 | return -ENOMEM; | 1048 | return -ENOMEM; |
678 | } | 1049 | } |
679 | 1050 | ||
680 | *out_handle = handle; | ||
681 | *out_comp_len = comp_len; | ||
682 | return 0; | ||
683 | } | ||
684 | |||
685 | static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) | ||
686 | { | ||
687 | int ret; | ||
688 | unsigned long handle; | ||
689 | unsigned int comp_len; | ||
690 | void *src, *dst; | ||
691 | struct zcomp_strm *zstrm; | ||
692 | struct page *page = bvec->bv_page; | ||
693 | |||
694 | if (zram_same_page_write(zram, index, page)) | ||
695 | return 0; | ||
696 | |||
697 | zstrm = zcomp_stream_get(zram->comp); | ||
698 | ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); | ||
699 | if (ret) { | ||
700 | zcomp_stream_put(zram->comp); | ||
701 | return ret; | ||
702 | } | ||
703 | |||
704 | dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); | 1051 | dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); |
705 | 1052 | ||
706 | src = zstrm->buffer; | 1053 | src = zstrm->buffer; |
@@ -712,25 +1059,31 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) | |||
712 | 1059 | ||
713 | zcomp_stream_put(zram->comp); | 1060 | zcomp_stream_put(zram->comp); |
714 | zs_unmap_object(zram->mem_pool, handle); | 1061 | zs_unmap_object(zram->mem_pool, handle); |
715 | 1062 | atomic64_add(comp_len, &zram->stats.compr_data_size); | |
1063 | out: | ||
716 | /* | 1064 | /* |
717 | * Free memory associated with this sector | 1065 | * Free memory associated with this sector |
718 | * before overwriting unused sectors. | 1066 | * before overwriting unused sectors. |
719 | */ | 1067 | */ |
720 | zram_slot_lock(zram, index); | 1068 | zram_slot_lock(zram, index); |
721 | zram_free_page(zram, index); | 1069 | zram_free_page(zram, index); |
722 | zram_set_handle(zram, index, handle); | 1070 | |
723 | zram_set_obj_size(zram, index, comp_len); | 1071 | if (flags) { |
1072 | zram_set_flag(zram, index, flags); | ||
1073 | zram_set_element(zram, index, element); | ||
1074 | } else { | ||
1075 | zram_set_handle(zram, index, handle); | ||
1076 | zram_set_obj_size(zram, index, comp_len); | ||
1077 | } | ||
724 | zram_slot_unlock(zram, index); | 1078 | zram_slot_unlock(zram, index); |
725 | 1079 | ||
726 | /* Update stats */ | 1080 | /* Update stats */ |
727 | atomic64_add(comp_len, &zram->stats.compr_data_size); | ||
728 | atomic64_inc(&zram->stats.pages_stored); | 1081 | atomic64_inc(&zram->stats.pages_stored); |
729 | return 0; | 1082 | return ret; |
730 | } | 1083 | } |
731 | 1084 | ||
732 | static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, | 1085 | static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, |
733 | u32 index, int offset) | 1086 | u32 index, int offset, struct bio *bio) |
734 | { | 1087 | { |
735 | int ret; | 1088 | int ret; |
736 | struct page *page = NULL; | 1089 | struct page *page = NULL; |
@@ -748,7 +1101,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, | |||
748 | if (!page) | 1101 | if (!page) |
749 | return -ENOMEM; | 1102 | return -ENOMEM; |
750 | 1103 | ||
751 | ret = zram_decompress_page(zram, page, index); | 1104 | ret = __zram_bvec_read(zram, page, index, bio, true); |
752 | if (ret) | 1105 | if (ret) |
753 | goto out; | 1106 | goto out; |
754 | 1107 | ||
@@ -763,7 +1116,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, | |||
763 | vec.bv_offset = 0; | 1116 | vec.bv_offset = 0; |
764 | } | 1117 | } |
765 | 1118 | ||
766 | ret = __zram_bvec_write(zram, &vec, index); | 1119 | ret = __zram_bvec_write(zram, &vec, index, bio); |
767 | out: | 1120 | out: |
768 | if (is_partial_io(bvec)) | 1121 | if (is_partial_io(bvec)) |
769 | __free_page(page); | 1122 | __free_page(page); |
@@ -808,8 +1161,13 @@ static void zram_bio_discard(struct zram *zram, u32 index, | |||
808 | } | 1161 | } |
809 | } | 1162 | } |
810 | 1163 | ||
1164 | /* | ||
1165 | * Returns errno if it has some problem. Otherwise return 0 or 1. | ||
1166 | * Returns 0 if IO request was done synchronously | ||
1167 | * Returns 1 if IO request was successfully submitted. | ||
1168 | */ | ||
811 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, | 1169 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, |
812 | int offset, bool is_write) | 1170 | int offset, bool is_write, struct bio *bio) |
813 | { | 1171 | { |
814 | unsigned long start_time = jiffies; | 1172 | unsigned long start_time = jiffies; |
815 | int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ; | 1173 | int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ; |
@@ -820,16 +1178,16 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, | |||
820 | 1178 | ||
821 | if (!is_write) { | 1179 | if (!is_write) { |
822 | atomic64_inc(&zram->stats.num_reads); | 1180 | atomic64_inc(&zram->stats.num_reads); |
823 | ret = zram_bvec_read(zram, bvec, index, offset); | 1181 | ret = zram_bvec_read(zram, bvec, index, offset, bio); |
824 | flush_dcache_page(bvec->bv_page); | 1182 | flush_dcache_page(bvec->bv_page); |
825 | } else { | 1183 | } else { |
826 | atomic64_inc(&zram->stats.num_writes); | 1184 | atomic64_inc(&zram->stats.num_writes); |
827 | ret = zram_bvec_write(zram, bvec, index, offset); | 1185 | ret = zram_bvec_write(zram, bvec, index, offset, bio); |
828 | } | 1186 | } |
829 | 1187 | ||
830 | generic_end_io_acct(rw_acct, &zram->disk->part0, start_time); | 1188 | generic_end_io_acct(rw_acct, &zram->disk->part0, start_time); |
831 | 1189 | ||
832 | if (unlikely(ret)) { | 1190 | if (unlikely(ret < 0)) { |
833 | if (!is_write) | 1191 | if (!is_write) |
834 | atomic64_inc(&zram->stats.failed_reads); | 1192 | atomic64_inc(&zram->stats.failed_reads); |
835 | else | 1193 | else |
@@ -868,7 +1226,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) | |||
868 | bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, | 1226 | bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, |
869 | unwritten); | 1227 | unwritten); |
870 | if (zram_bvec_rw(zram, &bv, index, offset, | 1228 | if (zram_bvec_rw(zram, &bv, index, offset, |
871 | op_is_write(bio_op(bio))) < 0) | 1229 | op_is_write(bio_op(bio)), bio) < 0) |
872 | goto out; | 1230 | goto out; |
873 | 1231 | ||
874 | bv.bv_offset += bv.bv_len; | 1232 | bv.bv_offset += bv.bv_len; |
@@ -922,16 +1280,18 @@ static void zram_slot_free_notify(struct block_device *bdev, | |||
922 | static int zram_rw_page(struct block_device *bdev, sector_t sector, | 1280 | static int zram_rw_page(struct block_device *bdev, sector_t sector, |
923 | struct page *page, bool is_write) | 1281 | struct page *page, bool is_write) |
924 | { | 1282 | { |
925 | int offset, err = -EIO; | 1283 | int offset, ret; |
926 | u32 index; | 1284 | u32 index; |
927 | struct zram *zram; | 1285 | struct zram *zram; |
928 | struct bio_vec bv; | 1286 | struct bio_vec bv; |
929 | 1287 | ||
1288 | if (PageTransHuge(page)) | ||
1289 | return -ENOTSUPP; | ||
930 | zram = bdev->bd_disk->private_data; | 1290 | zram = bdev->bd_disk->private_data; |
931 | 1291 | ||
932 | if (!valid_io_request(zram, sector, PAGE_SIZE)) { | 1292 | if (!valid_io_request(zram, sector, PAGE_SIZE)) { |
933 | atomic64_inc(&zram->stats.invalid_io); | 1293 | atomic64_inc(&zram->stats.invalid_io); |
934 | err = -EINVAL; | 1294 | ret = -EINVAL; |
935 | goto out; | 1295 | goto out; |
936 | } | 1296 | } |
937 | 1297 | ||
@@ -942,7 +1302,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, | |||
942 | bv.bv_len = PAGE_SIZE; | 1302 | bv.bv_len = PAGE_SIZE; |
943 | bv.bv_offset = 0; | 1303 | bv.bv_offset = 0; |
944 | 1304 | ||
945 | err = zram_bvec_rw(zram, &bv, index, offset, is_write); | 1305 | ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL); |
946 | out: | 1306 | out: |
947 | /* | 1307 | /* |
948 | * If I/O fails, just return an error (i.e., non-zero) without | 1308 | * If I/O fails, just return an error (i.e., non-zero) without |
@@ -952,9 +1312,20 @@ out: | |||
952 | * bio->bi_end_io does things to handle the error | 1312 | * bio->bi_end_io does things to handle the error |
953 | * (e.g., SetPageError, set_page_dirty and other extra work). | 1313 | * (e.g., SetPageError, set_page_dirty and other extra work). |
954 | */ | 1314 | */ |
955 | if (err == 0) | 1315 | if (unlikely(ret < 0)) |
1316 | return ret; | ||
1317 | |||
1318 | switch (ret) { | ||
1319 | case 0: | ||
956 | page_endio(page, is_write, 0); | 1320 | page_endio(page, is_write, 0); |
957 | return err; | 1321 | break; |
1322 | case 1: | ||
1323 | ret = 0; | ||
1324 | break; | ||
1325 | default: | ||
1326 | WARN_ON(1); | ||
1327 | } | ||
1328 | return ret; | ||
958 | } | 1329 | } |
959 | 1330 | ||
960 | static void zram_reset_device(struct zram *zram) | 1331 | static void zram_reset_device(struct zram *zram) |
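The new tail of zram_rw_page() exists only to translate zram_bvec_rw()'s three-way result (negative errno, 0 for a synchronously completed I/O, 1 for an I/O handed off to the writeback backing device) into what an rw_page caller expects. Below is a standalone sketch of that translation; page_endio_stub() is a placeholder for the block layer's page_endio() completion hook, not its real signature.

    #include <errno.h>
    #include <stdio.h>

    /* Placeholder for the block layer's page_endio() completion hook. */
    static void page_endio_stub(int is_write, int error)
    {
            printf("endio: write=%d error=%d\n", is_write, error);
    }

    /*
     * Mirrors the result handling added to zram_rw_page():
     *   ret < 0  -> propagate the error, no completion here
     *   ret == 0 -> the I/O finished synchronously, complete the page now
     *   ret == 1 -> the I/O was submitted to the backing device; its own
     *               bio end_io will complete the page later, so report success
     */
    static int zram_rw_page_result(int ret, int is_write)
    {
            if (ret < 0)
                    return ret;

            switch (ret) {
            case 0:
                    page_endio_stub(is_write, 0);
                    break;
            case 1:
                    ret = 0;
                    break;
            default:
                    ret = -EIO;     /* any other value would be a driver bug */
            }
            return ret;
    }

    int main(void)
    {
            return zram_rw_page_result(1, 1);   /* async submission -> success (0) */
    }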
@@ -983,6 +1354,7 @@ static void zram_reset_device(struct zram *zram) | |||
983 | zram_meta_free(zram, disksize); | 1354 | zram_meta_free(zram, disksize); |
984 | memset(&zram->stats, 0, sizeof(zram->stats)); | 1355 | memset(&zram->stats, 0, sizeof(zram->stats)); |
985 | zcomp_destroy(comp); | 1356 | zcomp_destroy(comp); |
1357 | reset_bdev(zram); | ||
986 | } | 1358 | } |
987 | 1359 | ||
988 | static ssize_t disksize_store(struct device *dev, | 1360 | static ssize_t disksize_store(struct device *dev, |
@@ -1108,6 +1480,9 @@ static DEVICE_ATTR_WO(mem_limit); | |||
1108 | static DEVICE_ATTR_WO(mem_used_max); | 1480 | static DEVICE_ATTR_WO(mem_used_max); |
1109 | static DEVICE_ATTR_RW(max_comp_streams); | 1481 | static DEVICE_ATTR_RW(max_comp_streams); |
1110 | static DEVICE_ATTR_RW(comp_algorithm); | 1482 | static DEVICE_ATTR_RW(comp_algorithm); |
1483 | #ifdef CONFIG_ZRAM_WRITEBACK | ||
1484 | static DEVICE_ATTR_RW(backing_dev); | ||
1485 | #endif | ||
1111 | 1486 | ||
1112 | static struct attribute *zram_disk_attrs[] = { | 1487 | static struct attribute *zram_disk_attrs[] = { |
1113 | &dev_attr_disksize.attr, | 1488 | &dev_attr_disksize.attr, |
@@ -1118,6 +1493,9 @@ static struct attribute *zram_disk_attrs[] = { | |||
1118 | &dev_attr_mem_used_max.attr, | 1493 | &dev_attr_mem_used_max.attr, |
1119 | &dev_attr_max_comp_streams.attr, | 1494 | &dev_attr_max_comp_streams.attr, |
1120 | &dev_attr_comp_algorithm.attr, | 1495 | &dev_attr_comp_algorithm.attr, |
1496 | #ifdef CONFIG_ZRAM_WRITEBACK | ||
1497 | &dev_attr_backing_dev.attr, | ||
1498 | #endif | ||
1121 | &dev_attr_io_stat.attr, | 1499 | &dev_attr_io_stat.attr, |
1122 | &dev_attr_mm_stat.attr, | 1500 | &dev_attr_mm_stat.attr, |
1123 | &dev_attr_debug_stat.attr, | 1501 | &dev_attr_debug_stat.attr, |
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e34e44d02e3e..31762db861e3 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h | |||
@@ -60,9 +60,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; | |||
60 | 60 | ||
61 | /* Flags for zram pages (table[page_no].value) */ | 61 | /* Flags for zram pages (table[page_no].value) */ |
62 | enum zram_pageflags { | 62 | enum zram_pageflags { |
63 | /* Page consists entirely of zeros */ | 63 | /* Page consists of a single repeated element */ |
64 | ZRAM_SAME = ZRAM_FLAG_SHIFT, | 64 | ZRAM_SAME = ZRAM_FLAG_SHIFT, |
65 | ZRAM_ACCESS, /* page is now accessed */ | 65 | ZRAM_ACCESS, /* page is now accessed */ |
66 | ZRAM_WB, /* page is stored on backing_device */ | ||
66 | 67 | ||
67 | __NR_ZRAM_PAGEFLAGS, | 68 | __NR_ZRAM_PAGEFLAGS, |
68 | }; | 69 | }; |
@@ -115,5 +116,13 @@ struct zram { | |||
115 | * zram is claimed so open requests will fail | 116 | * zram is claimed so open requests will fail |
116 | */ | 117 | */ |
117 | bool claim; /* Protected by bdev->bd_mutex */ | 118 | bool claim; /* Protected by bdev->bd_mutex */ |
119 | #ifdef CONFIG_ZRAM_WRITEBACK | ||
120 | struct file *backing_dev; | ||
121 | struct block_device *bdev; | ||
122 | unsigned int old_block_size; | ||
123 | unsigned long *bitmap; | ||
124 | unsigned long nr_pages; | ||
125 | spinlock_t bitmap_lock; | ||
126 | #endif | ||
118 | }; | 127 | }; |
119 | #endif | 128 | #endif |
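The new CONFIG_ZRAM_WRITEBACK fields model the backing device as a bitmap of page-sized slots (bitmap/nr_pages) guarded by bitmap_lock. A hypothetical, self-contained sketch of what claiming a free slot in such a bitmap amounts to; the real driver presumably uses the kernel bitmap helpers under bitmap_lock, and none of the names below are taken from it.

    #include <limits.h>

    /* Hypothetical, simplified mirror of the new writeback bookkeeping fields. */
    struct wb_state {
            unsigned long *bitmap;          /* one bit per page-sized slot on the backing device */
            unsigned long nr_pages;         /* number of slots */
    };

    #define BITS_PER_WORD   (CHAR_BIT * sizeof(unsigned long))

    /* Claim the first free slot, or return -1 when the backing device is full. */
    static long wb_alloc_slot(struct wb_state *wb)
    {
            unsigned long i;

            for (i = 0; i < wb->nr_pages; i++) {
                    unsigned long *word = &wb->bitmap[i / BITS_PER_WORD];
                    unsigned long mask = 1UL << (i % BITS_PER_WORD);

                    if (!(*word & mask)) {
                            *word |= mask;  /* the real driver does this under bitmap_lock */
                            return (long)i;
                    }
            }
            return -1;
    }

    int main(void)
    {
            unsigned long bits[2] = { 0 };
            struct wb_state wb = { bits, 2 * BITS_PER_WORD };

            return wb_alloc_slot(&wb) == 0 ? 0 : 1;     /* first call claims slot 0 */
    }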
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a36216bd2a84..e4d4b6b41e26 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c | |||
@@ -4308,10 +4308,10 @@ i915_drop_caches_set(void *data, u64 val) | |||
4308 | 4308 | ||
4309 | fs_reclaim_acquire(GFP_KERNEL); | 4309 | fs_reclaim_acquire(GFP_KERNEL); |
4310 | if (val & DROP_BOUND) | 4310 | if (val & DROP_BOUND) |
4311 | i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); | 4311 | i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_BOUND); |
4312 | 4312 | ||
4313 | if (val & DROP_UNBOUND) | 4313 | if (val & DROP_UNBOUND) |
4314 | i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_UNBOUND); | 4314 | i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_UNBOUND); |
4315 | 4315 | ||
4316 | if (val & DROP_SHRINK_ALL) | 4316 | if (val & DROP_SHRINK_ALL) |
4317 | i915_gem_shrink_all(dev_priv); | 4317 | i915_gem_shrink_all(dev_priv); |
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 60267e375e88..bd74641ab7f6 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h | |||
@@ -3742,6 +3742,7 @@ i915_gem_object_create_internal(struct drm_i915_private *dev_priv, | |||
3742 | /* i915_gem_shrinker.c */ | 3742 | /* i915_gem_shrinker.c */ |
3743 | unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv, | 3743 | unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv, |
3744 | unsigned long target, | 3744 | unsigned long target, |
3745 | unsigned long *nr_scanned, | ||
3745 | unsigned flags); | 3746 | unsigned flags); |
3746 | #define I915_SHRINK_PURGEABLE 0x1 | 3747 | #define I915_SHRINK_PURGEABLE 0x1 |
3747 | #define I915_SHRINK_UNBOUND 0x2 | 3748 | #define I915_SHRINK_UNBOUND 0x2 |
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index b9e8e0d6e97b..287c6ead95b3 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c | |||
@@ -2354,7 +2354,7 @@ rebuild_st: | |||
2354 | goto err_sg; | 2354 | goto err_sg; |
2355 | } | 2355 | } |
2356 | 2356 | ||
2357 | i915_gem_shrink(dev_priv, 2 * page_count, *s++); | 2357 | i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++); |
2358 | cond_resched(); | 2358 | cond_resched(); |
2359 | 2359 | ||
2360 | /* We've tried hard to allocate the memory by reaping | 2360 | /* We've tried hard to allocate the memory by reaping |
@@ -5015,7 +5015,7 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv) | |||
5015 | * the objects as well, see i915_gem_freeze() | 5015 | * the objects as well, see i915_gem_freeze() |
5016 | */ | 5016 | */ |
5017 | 5017 | ||
5018 | i915_gem_shrink(dev_priv, -1UL, I915_SHRINK_UNBOUND); | 5018 | i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND); |
5019 | i915_gem_drain_freed_objects(dev_priv); | 5019 | i915_gem_drain_freed_objects(dev_priv); |
5020 | 5020 | ||
5021 | mutex_lock(&dev_priv->drm.struct_mutex); | 5021 | mutex_lock(&dev_priv->drm.struct_mutex); |
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index d60f38adc4c4..6c6b8e8592aa 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c | |||
@@ -2062,7 +2062,7 @@ int i915_gem_gtt_prepare_pages(struct drm_i915_gem_object *obj, | |||
2062 | */ | 2062 | */ |
2063 | GEM_BUG_ON(obj->mm.pages == pages); | 2063 | GEM_BUG_ON(obj->mm.pages == pages); |
2064 | } while (i915_gem_shrink(to_i915(obj->base.dev), | 2064 | } while (i915_gem_shrink(to_i915(obj->base.dev), |
2065 | obj->base.size >> PAGE_SHIFT, | 2065 | obj->base.size >> PAGE_SHIFT, NULL, |
2066 | I915_SHRINK_BOUND | | 2066 | I915_SHRINK_BOUND | |
2067 | I915_SHRINK_UNBOUND | | 2067 | I915_SHRINK_UNBOUND | |
2068 | I915_SHRINK_ACTIVE)); | 2068 | I915_SHRINK_ACTIVE)); |
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c index 77fb39808131..74002b2d1b6f 100644 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c | |||
@@ -136,6 +136,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj) | |||
136 | * i915_gem_shrink - Shrink buffer object caches | 136 | * i915_gem_shrink - Shrink buffer object caches |
137 | * @dev_priv: i915 device | 137 | * @dev_priv: i915 device |
138 | * @target: amount of memory to make available, in pages | 138 | * @target: amount of memory to make available, in pages |
139 | * @nr_scanned: optional output for number of pages scanned (incremental) | ||
139 | * @flags: control flags for selecting cache types | 140 | * @flags: control flags for selecting cache types |
140 | * | 141 | * |
141 | * This function is the main interface to the shrinker. It will try to release | 142 | * This function is the main interface to the shrinker. It will try to release |
@@ -158,7 +159,9 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj) | |||
158 | */ | 159 | */ |
159 | unsigned long | 160 | unsigned long |
160 | i915_gem_shrink(struct drm_i915_private *dev_priv, | 161 | i915_gem_shrink(struct drm_i915_private *dev_priv, |
161 | unsigned long target, unsigned flags) | 162 | unsigned long target, |
163 | unsigned long *nr_scanned, | ||
164 | unsigned flags) | ||
162 | { | 165 | { |
163 | const struct { | 166 | const struct { |
164 | struct list_head *list; | 167 | struct list_head *list; |
@@ -169,6 +172,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv, | |||
169 | { NULL, 0 }, | 172 | { NULL, 0 }, |
170 | }, *phase; | 173 | }, *phase; |
171 | unsigned long count = 0; | 174 | unsigned long count = 0; |
175 | unsigned long scanned = 0; | ||
172 | bool unlock; | 176 | bool unlock; |
173 | 177 | ||
174 | if (!shrinker_lock(dev_priv, &unlock)) | 178 | if (!shrinker_lock(dev_priv, &unlock)) |
@@ -249,6 +253,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv, | |||
249 | count += obj->base.size >> PAGE_SHIFT; | 253 | count += obj->base.size >> PAGE_SHIFT; |
250 | } | 254 | } |
251 | mutex_unlock(&obj->mm.lock); | 255 | mutex_unlock(&obj->mm.lock); |
256 | scanned += obj->base.size >> PAGE_SHIFT; | ||
252 | } | 257 | } |
253 | } | 258 | } |
254 | list_splice_tail(&still_in_list, phase->list); | 259 | list_splice_tail(&still_in_list, phase->list); |
@@ -261,6 +266,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv, | |||
261 | 266 | ||
262 | shrinker_unlock(dev_priv, unlock); | 267 | shrinker_unlock(dev_priv, unlock); |
263 | 268 | ||
269 | if (nr_scanned) | ||
270 | *nr_scanned += scanned; | ||
264 | return count; | 271 | return count; |
265 | } | 272 | } |
266 | 273 | ||
@@ -283,7 +290,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv) | |||
283 | unsigned long freed; | 290 | unsigned long freed; |
284 | 291 | ||
285 | intel_runtime_pm_get(dev_priv); | 292 | intel_runtime_pm_get(dev_priv); |
286 | freed = i915_gem_shrink(dev_priv, -1UL, | 293 | freed = i915_gem_shrink(dev_priv, -1UL, NULL, |
287 | I915_SHRINK_BOUND | | 294 | I915_SHRINK_BOUND | |
288 | I915_SHRINK_UNBOUND | | 295 | I915_SHRINK_UNBOUND | |
289 | I915_SHRINK_ACTIVE); | 296 | I915_SHRINK_ACTIVE); |
@@ -329,23 +336,28 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) | |||
329 | unsigned long freed; | 336 | unsigned long freed; |
330 | bool unlock; | 337 | bool unlock; |
331 | 338 | ||
339 | sc->nr_scanned = 0; | ||
340 | |||
332 | if (!shrinker_lock(dev_priv, &unlock)) | 341 | if (!shrinker_lock(dev_priv, &unlock)) |
333 | return SHRINK_STOP; | 342 | return SHRINK_STOP; |
334 | 343 | ||
335 | freed = i915_gem_shrink(dev_priv, | 344 | freed = i915_gem_shrink(dev_priv, |
336 | sc->nr_to_scan, | 345 | sc->nr_to_scan, |
346 | &sc->nr_scanned, | ||
337 | I915_SHRINK_BOUND | | 347 | I915_SHRINK_BOUND | |
338 | I915_SHRINK_UNBOUND | | 348 | I915_SHRINK_UNBOUND | |
339 | I915_SHRINK_PURGEABLE); | 349 | I915_SHRINK_PURGEABLE); |
340 | if (freed < sc->nr_to_scan) | 350 | if (freed < sc->nr_to_scan) |
341 | freed += i915_gem_shrink(dev_priv, | 351 | freed += i915_gem_shrink(dev_priv, |
342 | sc->nr_to_scan - freed, | 352 | sc->nr_to_scan - sc->nr_scanned, |
353 | &sc->nr_scanned, | ||
343 | I915_SHRINK_BOUND | | 354 | I915_SHRINK_BOUND | |
344 | I915_SHRINK_UNBOUND); | 355 | I915_SHRINK_UNBOUND); |
345 | if (freed < sc->nr_to_scan && current_is_kswapd()) { | 356 | if (freed < sc->nr_to_scan && current_is_kswapd()) { |
346 | intel_runtime_pm_get(dev_priv); | 357 | intel_runtime_pm_get(dev_priv); |
347 | freed += i915_gem_shrink(dev_priv, | 358 | freed += i915_gem_shrink(dev_priv, |
348 | sc->nr_to_scan - freed, | 359 | sc->nr_to_scan - sc->nr_scanned, |
360 | &sc->nr_scanned, | ||
349 | I915_SHRINK_ACTIVE | | 361 | I915_SHRINK_ACTIVE | |
350 | I915_SHRINK_BOUND | | 362 | I915_SHRINK_BOUND | |
351 | I915_SHRINK_UNBOUND); | 363 | I915_SHRINK_UNBOUND); |
@@ -354,7 +366,7 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) | |||
354 | 366 | ||
355 | shrinker_unlock(dev_priv, unlock); | 367 | shrinker_unlock(dev_priv, unlock); |
356 | 368 | ||
357 | return freed; | 369 | return sc->nr_scanned ? freed : SHRINK_STOP; |
358 | } | 370 | } |
359 | 371 | ||
360 | static bool | 372 | static bool |
@@ -453,7 +465,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr | |||
453 | goto out; | 465 | goto out; |
454 | 466 | ||
455 | intel_runtime_pm_get(dev_priv); | 467 | intel_runtime_pm_get(dev_priv); |
456 | freed_pages += i915_gem_shrink(dev_priv, -1UL, | 468 | freed_pages += i915_gem_shrink(dev_priv, -1UL, NULL, |
457 | I915_SHRINK_BOUND | | 469 | I915_SHRINK_BOUND | |
458 | I915_SHRINK_UNBOUND | | 470 | I915_SHRINK_UNBOUND | |
459 | I915_SHRINK_ACTIVE | | 471 | I915_SHRINK_ACTIVE | |
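Threading nr_scanned through i915_gem_shrink() lets the core shrinker distinguish "scanned but freed nothing" from "could not scan at all": the scan callback accumulates progress in sc->nr_scanned and only reports SHRINK_STOP when nothing was scanned. A standalone sketch of that contract; the stub struct and names are illustrative, not the kernel's shrink_control.

    #define SHRINK_STOP_STUB        (~0UL)  /* stand-in for the kernel's SHRINK_STOP */

    struct shrink_control_stub {
            unsigned long nr_to_scan;       /* how much the core asked us to scan */
            unsigned long nr_scanned;       /* how much we actually looked at */
    };

    /* One scan pass over an imaginary cache with 'cached' objects, 'freeable' of them reclaimable. */
    static unsigned long scan_cache(struct shrink_control_stub *sc,
                                    unsigned long cached, unsigned long freeable)
    {
            unsigned long scanned = cached < sc->nr_to_scan ? cached : sc->nr_to_scan;
            unsigned long freed = scanned < freeable ? scanned : freeable;

            sc->nr_scanned += scanned;
            /*
             * Same contract as the patched i915_gem_shrinker_scan(): report
             * SHRINK_STOP only when no scanning progress was made at all;
             * otherwise return the number of objects freed, even if zero.
             */
            return sc->nr_scanned ? freed : SHRINK_STOP_STUB;
    }

    int main(void)
    {
            struct shrink_control_stub sc = { .nr_to_scan = 128, .nr_scanned = 0 };

            return scan_cache(&sc, 64, 0) == 0 ? 0 : 1; /* scanned 64, freed 0, not SHRINK_STOP */
    }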
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 14323faf8bd9..60491641a8d6 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c | |||
@@ -1241,8 +1241,10 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector, | |||
1241 | { | 1241 | { |
1242 | struct btt *btt = bdev->bd_disk->private_data; | 1242 | struct btt *btt = bdev->bd_disk->private_data; |
1243 | int rc; | 1243 | int rc; |
1244 | unsigned int len; | ||
1244 | 1245 | ||
1245 | rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector); | 1246 | len = hpage_nr_pages(page) * PAGE_SIZE; |
1247 | rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector); | ||
1246 | if (rc == 0) | 1248 | if (rc == 0) |
1247 | page_endio(page, is_write, 0); | 1249 | page_endio(page, is_write, 0); |
1248 | 1250 | ||
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index f7099adaabc0..e9aa453da50c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c | |||
@@ -80,22 +80,40 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, | |||
80 | static void write_pmem(void *pmem_addr, struct page *page, | 80 | static void write_pmem(void *pmem_addr, struct page *page, |
81 | unsigned int off, unsigned int len) | 81 | unsigned int off, unsigned int len) |
82 | { | 82 | { |
83 | void *mem = kmap_atomic(page); | 83 | unsigned int chunk; |
84 | 84 | void *mem; | |
85 | memcpy_flushcache(pmem_addr, mem + off, len); | 85 | |
86 | kunmap_atomic(mem); | 86 | while (len) { |
87 | mem = kmap_atomic(page); | ||
88 | chunk = min_t(unsigned int, len, PAGE_SIZE); | ||
89 | memcpy_flushcache(pmem_addr, mem + off, chunk); | ||
90 | kunmap_atomic(mem); | ||
91 | len -= chunk; | ||
92 | off = 0; | ||
93 | page++; | ||
94 | pmem_addr += PAGE_SIZE; | ||
95 | } | ||
87 | } | 96 | } |
88 | 97 | ||
89 | static blk_status_t read_pmem(struct page *page, unsigned int off, | 98 | static blk_status_t read_pmem(struct page *page, unsigned int off, |
90 | void *pmem_addr, unsigned int len) | 99 | void *pmem_addr, unsigned int len) |
91 | { | 100 | { |
101 | unsigned int chunk; | ||
92 | int rc; | 102 | int rc; |
93 | void *mem = kmap_atomic(page); | 103 | void *mem; |
94 | 104 | ||
95 | rc = memcpy_mcsafe(mem + off, pmem_addr, len); | 105 | while (len) { |
96 | kunmap_atomic(mem); | 106 | mem = kmap_atomic(page); |
97 | if (rc) | 107 | chunk = min_t(unsigned int, len, PAGE_SIZE); |
98 | return BLK_STS_IOERR; | 108 | rc = memcpy_mcsafe(mem + off, pmem_addr, chunk); |
109 | kunmap_atomic(mem); | ||
110 | if (rc) | ||
111 | return BLK_STS_IOERR; | ||
112 | len -= chunk; | ||
113 | off = 0; | ||
114 | page++; | ||
115 | pmem_addr += PAGE_SIZE; | ||
116 | } | ||
99 | return BLK_STS_OK; | 117 | return BLK_STS_OK; |
100 | } | 118 | } |
101 | 119 | ||
@@ -188,7 +206,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, | |||
188 | struct pmem_device *pmem = bdev->bd_queue->queuedata; | 206 | struct pmem_device *pmem = bdev->bd_queue->queuedata; |
189 | blk_status_t rc; | 207 | blk_status_t rc; |
190 | 208 | ||
191 | rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector); | 209 | rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, |
210 | 0, is_write, sector); | ||
192 | 211 | ||
193 | /* | 212 | /* |
194 | * The ->rw_page interface is subtle and tricky. The core | 213 | * The ->rw_page interface is subtle and tricky. The core |
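Both write_pmem() and read_pmem() now loop in at-most-PAGE_SIZE chunks so that a transparent huge page passed down via ->rw_page is mapped and copied one subpage at a time. A self-contained sketch of the same chunking shape, with memcpy() standing in for memcpy_flushcache()/memcpy_mcsafe() and a flat buffer standing in for the per-subpage kmap_atomic().

    #include <string.h>

    #define SUBPAGE_SIZE    4096u   /* illustrative PAGE_SIZE */

    /*
     * Copy 'len' bytes that may span several contiguous 4K subpages of a
     * compound page, at most one subpage per iteration -- the same shape
     * as the new write_pmem() loop. As in the patch, a non-zero offset is
     * only expected for a copy confined to the first subpage.
     */
    static void copy_in_page_chunks(char *pmem_addr, const char *page_buf,
                                    unsigned int off, unsigned int len)
    {
            while (len) {
                    unsigned int chunk = len < SUBPAGE_SIZE ? len : SUBPAGE_SIZE;

                    memcpy(pmem_addr, page_buf + off, chunk);
                    len -= chunk;
                    off = 0;                        /* only the first subpage may start mid-page */
                    page_buf += SUBPAGE_SIZE;       /* next subpage of the (possibly huge) page */
                    pmem_addr += SUBPAGE_SIZE;
            }
    }

    int main(void)
    {
            static char src[2 * SUBPAGE_SIZE], dst[2 * SUBPAGE_SIZE];

            copy_in_page_chunks(dst, src, 0, sizeof(src));      /* two full subpages */
            return 0;
    }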
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 103ca5e1267b..64c58eb26159 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c | |||
@@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, | |||
151 | return FSCACHE_CHECKAUX_OKAY; | 151 | return FSCACHE_CHECKAUX_OKAY; |
152 | } | 152 | } |
153 | 153 | ||
154 | static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) | ||
155 | { | ||
156 | struct v9fs_inode *v9inode = cookie_netfs_data; | ||
157 | struct pagevec pvec; | ||
158 | pgoff_t first; | ||
159 | int loop, nr_pages; | ||
160 | |||
161 | pagevec_init(&pvec, 0); | ||
162 | first = 0; | ||
163 | |||
164 | for (;;) { | ||
165 | nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, | ||
166 | first, | ||
167 | PAGEVEC_SIZE - pagevec_count(&pvec)); | ||
168 | if (!nr_pages) | ||
169 | break; | ||
170 | |||
171 | for (loop = 0; loop < nr_pages; loop++) | ||
172 | ClearPageFsCache(pvec.pages[loop]); | ||
173 | |||
174 | first = pvec.pages[nr_pages - 1]->index + 1; | ||
175 | |||
176 | pvec.nr = nr_pages; | ||
177 | pagevec_release(&pvec); | ||
178 | cond_resched(); | ||
179 | } | ||
180 | } | ||
181 | |||
182 | const struct fscache_cookie_def v9fs_cache_inode_index_def = { | 154 | const struct fscache_cookie_def v9fs_cache_inode_index_def = { |
183 | .name = "9p.inode", | 155 | .name = "9p.inode", |
184 | .type = FSCACHE_COOKIE_TYPE_DATAFILE, | 156 | .type = FSCACHE_COOKIE_TYPE_DATAFILE, |
@@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { | |||
186 | .get_attr = v9fs_cache_inode_get_attr, | 158 | .get_attr = v9fs_cache_inode_get_attr, |
187 | .get_aux = v9fs_cache_inode_get_aux, | 159 | .get_aux = v9fs_cache_inode_get_aux, |
188 | .check_aux = v9fs_cache_inode_check_aux, | 160 | .check_aux = v9fs_cache_inode_check_aux, |
189 | .now_uncached = v9fs_cache_inode_now_uncached, | ||
190 | }; | 161 | }; |
191 | 162 | ||
192 | void v9fs_cache_inode_get_cookie(struct inode *inode) | 163 | void v9fs_cache_inode_get_cookie(struct inode *inode) |
diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 577763c3d88b..1fe855191261 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c | |||
@@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, | |||
39 | static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, | 39 | static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, |
40 | const void *buffer, | 40 | const void *buffer, |
41 | uint16_t buflen); | 41 | uint16_t buflen); |
42 | static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); | ||
43 | 42 | ||
44 | struct fscache_netfs afs_cache_netfs = { | 43 | struct fscache_netfs afs_cache_netfs = { |
45 | .name = "afs", | 44 | .name = "afs", |
@@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = { | |||
75 | .get_attr = afs_vnode_cache_get_attr, | 74 | .get_attr = afs_vnode_cache_get_attr, |
76 | .get_aux = afs_vnode_cache_get_aux, | 75 | .get_aux = afs_vnode_cache_get_aux, |
77 | .check_aux = afs_vnode_cache_check_aux, | 76 | .check_aux = afs_vnode_cache_check_aux, |
78 | .now_uncached = afs_vnode_cache_now_uncached, | ||
79 | }; | 77 | }; |
80 | 78 | ||
81 | /* | 79 | /* |
@@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, | |||
359 | _leave(" = SUCCESS"); | 357 | _leave(" = SUCCESS"); |
360 | return FSCACHE_CHECKAUX_OKAY; | 358 | return FSCACHE_CHECKAUX_OKAY; |
361 | } | 359 | } |
362 | |||
363 | /* | ||
364 | * indication the cookie is no longer uncached | ||
365 | * - this function is called when the backing store currently caching a cookie | ||
366 | * is removed | ||
367 | * - the netfs should use this to clean up any markers indicating cached pages | ||
368 | * - this is mandatory for any object that may have data | ||
369 | */ | ||
370 | static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) | ||
371 | { | ||
372 | struct afs_vnode *vnode = cookie_netfs_data; | ||
373 | struct pagevec pvec; | ||
374 | pgoff_t first; | ||
375 | int loop, nr_pages; | ||
376 | |||
377 | _enter("{%x,%x,%Lx}", | ||
378 | vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); | ||
379 | |||
380 | pagevec_init(&pvec, 0); | ||
381 | first = 0; | ||
382 | |||
383 | for (;;) { | ||
384 | /* grab a bunch of pages to clean */ | ||
385 | nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, | ||
386 | first, | ||
387 | PAGEVEC_SIZE - pagevec_count(&pvec)); | ||
388 | if (!nr_pages) | ||
389 | break; | ||
390 | |||
391 | for (loop = 0; loop < nr_pages; loop++) | ||
392 | ClearPageFsCache(pvec.pages[loop]); | ||
393 | |||
394 | first = pvec.pages[nr_pages - 1]->index + 1; | ||
395 | |||
396 | pvec.nr = nr_pages; | ||
397 | pagevec_release(&pvec); | ||
398 | cond_resched(); | ||
399 | } | ||
400 | |||
401 | _leave(""); | ||
402 | } | ||
diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..50da0e102ca0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) | |||
1627 | struct pagevec pvec; | 1627 | struct pagevec pvec; |
1628 | pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); | 1628 | pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); |
1629 | pgoff_t end; | 1629 | pgoff_t end; |
1630 | int i; | 1630 | int i, count; |
1631 | struct buffer_head *bh; | 1631 | struct buffer_head *bh; |
1632 | struct buffer_head *head; | 1632 | struct buffer_head *head; |
1633 | 1633 | ||
1634 | end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); | 1634 | end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); |
1635 | pagevec_init(&pvec, 0); | 1635 | pagevec_init(&pvec, 0); |
1636 | while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, | 1636 | while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { |
1637 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 1637 | count = pagevec_count(&pvec); |
1638 | for (i = 0; i < pagevec_count(&pvec); i++) { | 1638 | for (i = 0; i < count; i++) { |
1639 | struct page *page = pvec.pages[i]; | 1639 | struct page *page = pvec.pages[i]; |
1640 | 1640 | ||
1641 | index = page->index; | ||
1642 | if (index > end) | ||
1643 | break; | ||
1644 | if (!page_has_buffers(page)) | 1641 | if (!page_has_buffers(page)) |
1645 | continue; | 1642 | continue; |
1646 | /* | 1643 | /* |
@@ -1670,7 +1667,9 @@ unlock_page: | |||
1670 | } | 1667 | } |
1671 | pagevec_release(&pvec); | 1668 | pagevec_release(&pvec); |
1672 | cond_resched(); | 1669 | cond_resched(); |
1673 | index++; | 1670 | /* End of range already reached? */ |
1671 | if (index > end || !index) | ||
1672 | break; | ||
1674 | } | 1673 | } |
1675 | } | 1674 | } |
1676 | EXPORT_SYMBOL(clean_bdev_aliases); | 1675 | EXPORT_SYMBOL(clean_bdev_aliases); |
@@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, | |||
3549 | pagevec_init(&pvec, 0); | 3548 | pagevec_init(&pvec, 0); |
3550 | 3549 | ||
3551 | do { | 3550 | do { |
3552 | unsigned want, nr_pages, i; | 3551 | unsigned nr_pages, i; |
3553 | 3552 | ||
3554 | want = min_t(unsigned, end - index, PAGEVEC_SIZE); | 3553 | nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, |
3555 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); | 3554 | end - 1); |
3556 | if (nr_pages == 0) | 3555 | if (nr_pages == 0) |
3557 | break; | 3556 | break; |
3558 | 3557 | ||
@@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, | |||
3573 | lastoff < page_offset(page)) | 3572 | lastoff < page_offset(page)) |
3574 | goto check_range; | 3573 | goto check_range; |
3575 | 3574 | ||
3576 | /* Searching done if the page index is out of range. */ | ||
3577 | if (page->index >= end) | ||
3578 | goto not_found; | ||
3579 | |||
3580 | lock_page(page); | 3575 | lock_page(page); |
3581 | if (likely(page->mapping == inode->i_mapping) && | 3576 | if (likely(page->mapping == inode->i_mapping) && |
3582 | page_has_buffers(page)) { | 3577 | page_has_buffers(page)) { |
@@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, | |||
3589 | unlock_page(page); | 3584 | unlock_page(page); |
3590 | lastoff = page_offset(page) + PAGE_SIZE; | 3585 | lastoff = page_offset(page) + PAGE_SIZE; |
3591 | } | 3586 | } |
3592 | |||
3593 | /* Searching done if fewer pages returned than wanted. */ | ||
3594 | if (nr_pages < want) | ||
3595 | break; | ||
3596 | |||
3597 | index = pvec.pages[i - 1]->index + 1; | ||
3598 | pagevec_release(&pvec); | 3587 | pagevec_release(&pvec); |
3599 | } while (index < end); | 3588 | } while (index < end); |
3600 | 3589 | ||
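The clean_bdev_aliases() and page_cache_seek_hole_data() loops are converted to pagevec_lookup_range(), which both caps the lookup at 'end' and advances the caller's index past the last page returned, so the open-coded index bookkeeping and end checks can go. A toy, self-contained model of that calling convention (not the kernel implementation):

    #include <stddef.h>

    #define PVEC_SIZE       14      /* same order of magnitude as PAGEVEC_SIZE; illustrative */

    /*
     * Toy model of the pagevec_lookup_range() contract: 'present' is a sorted
     * array of populated page indices; fill 'out' with up to PVEC_SIZE of them
     * that lie in [*index, end] and advance *index past the last one returned.
     * A return of 0 terminates the caller's loop, just like the rewritten
     * clean_bdev_aliases() and page_cache_seek_hole_data() loops.
     */
    static unsigned int toy_lookup_range(const unsigned long *present, size_t npresent,
                                         unsigned long *index, unsigned long end,
                                         unsigned long *out)
    {
            unsigned int nr = 0;
            size_t i;

            for (i = 0; i < npresent && nr < PVEC_SIZE; i++) {
                    if (present[i] < *index)
                            continue;
                    if (present[i] > end)
                            break;
                    out[nr++] = present[i];
            }
            if (nr)
                    *index = out[nr - 1] + 1;
            return nr;
    }

    int main(void)
    {
            unsigned long present[] = { 3, 7, 20, 41 };
            unsigned long out[PVEC_SIZE];
            unsigned long index = 0;

            /* Finds 3 and 7 within [0, 10] and leaves index at 8 for the next call. */
            return toy_lookup_range(present, 4, &index, 10, out) == 2 ? 0 : 1;
    }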
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 337f88673ed9..174d6e6569a8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c | |||
@@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( | |||
194 | return FSCACHE_CHECKAUX_OKAY; | 194 | return FSCACHE_CHECKAUX_OKAY; |
195 | } | 195 | } |
196 | 196 | ||
197 | static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) | ||
198 | { | ||
199 | struct ceph_inode_info* ci = cookie_netfs_data; | ||
200 | struct pagevec pvec; | ||
201 | pgoff_t first; | ||
202 | int loop, nr_pages; | ||
203 | |||
204 | pagevec_init(&pvec, 0); | ||
205 | first = 0; | ||
206 | |||
207 | dout("ceph inode 0x%p now uncached", ci); | ||
208 | |||
209 | while (1) { | ||
210 | nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, | ||
211 | PAGEVEC_SIZE - pagevec_count(&pvec)); | ||
212 | |||
213 | if (!nr_pages) | ||
214 | break; | ||
215 | |||
216 | for (loop = 0; loop < nr_pages; loop++) | ||
217 | ClearPageFsCache(pvec.pages[loop]); | ||
218 | |||
219 | first = pvec.pages[nr_pages - 1]->index + 1; | ||
220 | |||
221 | pvec.nr = nr_pages; | ||
222 | pagevec_release(&pvec); | ||
223 | cond_resched(); | ||
224 | } | ||
225 | } | ||
226 | |||
227 | static const struct fscache_cookie_def ceph_fscache_inode_object_def = { | 197 | static const struct fscache_cookie_def ceph_fscache_inode_object_def = { |
228 | .name = "CEPH.inode", | 198 | .name = "CEPH.inode", |
229 | .type = FSCACHE_COOKIE_TYPE_DATAFILE, | 199 | .type = FSCACHE_COOKIE_TYPE_DATAFILE, |
@@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { | |||
231 | .get_attr = ceph_fscache_inode_get_attr, | 201 | .get_attr = ceph_fscache_inode_get_attr, |
232 | .get_aux = ceph_fscache_inode_get_aux, | 202 | .get_aux = ceph_fscache_inode_get_aux, |
233 | .check_aux = ceph_fscache_inode_check_aux, | 203 | .check_aux = ceph_fscache_inode_check_aux, |
234 | .now_uncached = ceph_fscache_inode_now_uncached, | ||
235 | }; | 204 | }; |
236 | 205 | ||
237 | void ceph_fscache_register_inode_cookie(struct inode *inode) | 206 | void ceph_fscache_register_inode_cookie(struct inode *inode) |
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 6c665bf4a27c..2c14020e5e1d 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c | |||
@@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, | |||
292 | return FSCACHE_CHECKAUX_OKAY; | 292 | return FSCACHE_CHECKAUX_OKAY; |
293 | } | 293 | } |
294 | 294 | ||
295 | static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) | ||
296 | { | ||
297 | struct cifsInodeInfo *cifsi = cookie_netfs_data; | ||
298 | struct pagevec pvec; | ||
299 | pgoff_t first; | ||
300 | int loop, nr_pages; | ||
301 | |||
302 | pagevec_init(&pvec, 0); | ||
303 | first = 0; | ||
304 | |||
305 | cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); | ||
306 | |||
307 | for (;;) { | ||
308 | nr_pages = pagevec_lookup(&pvec, | ||
309 | cifsi->vfs_inode.i_mapping, first, | ||
310 | PAGEVEC_SIZE - pagevec_count(&pvec)); | ||
311 | if (!nr_pages) | ||
312 | break; | ||
313 | |||
314 | for (loop = 0; loop < nr_pages; loop++) | ||
315 | ClearPageFsCache(pvec.pages[loop]); | ||
316 | |||
317 | first = pvec.pages[nr_pages - 1]->index + 1; | ||
318 | |||
319 | pvec.nr = nr_pages; | ||
320 | pagevec_release(&pvec); | ||
321 | cond_resched(); | ||
322 | } | ||
323 | } | ||
324 | |||
325 | const struct fscache_cookie_def cifs_fscache_inode_object_def = { | 295 | const struct fscache_cookie_def cifs_fscache_inode_object_def = { |
326 | .name = "CIFS.uniqueid", | 296 | .name = "CIFS.uniqueid", |
327 | .type = FSCACHE_COOKIE_TYPE_DATAFILE, | 297 | .type = FSCACHE_COOKIE_TYPE_DATAFILE, |
@@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = { | |||
329 | .get_attr = cifs_fscache_inode_get_attr, | 299 | .get_attr = cifs_fscache_inode_get_attr, |
330 | .get_aux = cifs_fscache_inode_get_aux, | 300 | .get_aux = cifs_fscache_inode_get_aux, |
331 | .check_aux = cifs_fscache_inode_check_aux, | 301 | .check_aux = cifs_fscache_inode_check_aux, |
332 | .now_uncached = cifs_fscache_inode_now_uncached, | ||
333 | }; | 302 | }; |
diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c | |||
@@ -42,6 +42,9 @@ | |||
42 | #define DAX_WAIT_TABLE_BITS 12 | 42 | #define DAX_WAIT_TABLE_BITS 12 |
43 | #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) | 43 | #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) |
44 | 44 | ||
45 | /* The 'colour' (ie low bits) within a PMD of a page offset. */ | ||
46 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) | ||
47 | |||
45 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; | 48 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; |
46 | 49 | ||
47 | static int __init init_dax_wait_table(void) | 50 | static int __init init_dax_wait_table(void) |
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void) | |||
54 | } | 57 | } |
55 | fs_initcall(init_dax_wait_table); | 58 | fs_initcall(init_dax_wait_table); |
56 | 59 | ||
60 | /* | ||
61 | * We use the lowest available bit in an exceptional entry for locking, one bit for | ||
62 | * the entry size (PMD) and two more to tell us if the entry is a zero page or | ||
63 | * an empty entry that is just used for locking. In total four special bits. | ||
64 | * | ||
65 | * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE | ||
66 | * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem | ||
67 | * block allocation. | ||
68 | */ | ||
69 | #define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) | ||
70 | #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) | ||
71 | #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) | ||
72 | #define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) | ||
73 | #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) | ||
74 | |||
75 | static unsigned long dax_radix_sector(void *entry) | ||
76 | { | ||
77 | return (unsigned long)entry >> RADIX_DAX_SHIFT; | ||
78 | } | ||
79 | |||
80 | static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) | ||
81 | { | ||
82 | return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | | ||
83 | ((unsigned long)sector << RADIX_DAX_SHIFT) | | ||
84 | RADIX_DAX_ENTRY_LOCK); | ||
85 | } | ||
86 | |||
87 | static unsigned int dax_radix_order(void *entry) | ||
88 | { | ||
89 | if ((unsigned long)entry & RADIX_DAX_PMD) | ||
90 | return PMD_SHIFT - PAGE_SHIFT; | ||
91 | return 0; | ||
92 | } | ||
93 | |||
57 | static int dax_is_pmd_entry(void *entry) | 94 | static int dax_is_pmd_entry(void *entry) |
58 | { | 95 | { |
59 | return (unsigned long)entry & RADIX_DAX_PMD; | 96 | return (unsigned long)entry & RADIX_DAX_PMD; |
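The block of helpers above packs a sector number and four flag bits (lock, PMD, zero page, empty) into a single exceptional radix tree entry, and PG_PMD_COLOUR masks a page offset down to its PMD-aligned base so every index inside one PMD entry hashes to the same waitqueue. A standalone round-trip sketch of that encoding; the EXCEPTIONAL_* and PMD_ORDER values are assumptions for illustration, not taken from the kernel headers.

    #include <stdio.h>

    /* Illustrative values; the kernel gets these from radix-tree.h and the pgtable headers. */
    #define EXCEPTIONAL_SHIFT       2
    #define EXCEPTIONAL_ENTRY       (1UL << 1)
    #define PMD_ORDER               9                       /* 2MiB / 4KiB on x86-64 */
    #define PG_PMD_COLOUR_STUB      ((1UL << PMD_ORDER) - 1)

    #define DAX_SHIFT               (EXCEPTIONAL_SHIFT + 4)
    #define DAX_ENTRY_LOCK          (1UL << EXCEPTIONAL_SHIFT)
    #define DAX_PMD                 (1UL << (EXCEPTIONAL_SHIFT + 1))
    #define DAX_ZERO_PAGE           (1UL << (EXCEPTIONAL_SHIFT + 2))
    #define DAX_EMPTY               (1UL << (EXCEPTIONAL_SHIFT + 3))

    /* Pack a sector plus flag bits into a locked entry, like dax_radix_locked_entry(). */
    static unsigned long locked_entry(unsigned long sector, unsigned long flags)
    {
            return EXCEPTIONAL_ENTRY | flags | (sector << DAX_SHIFT) | DAX_ENTRY_LOCK;
    }

    /* Recover the sector, like dax_radix_sector(). */
    static unsigned long entry_sector(unsigned long entry)
    {
            return entry >> DAX_SHIFT;
    }

    /* Waitqueue keying: all indices inside one PMD entry map to the same key. */
    static unsigned long wait_key_index(unsigned long index, int is_pmd)
    {
            return is_pmd ? index & ~PG_PMD_COLOUR_STUB : index;
    }

    int main(void)
    {
            unsigned long e = locked_entry(0x1234, DAX_PMD);

            printf("sector=0x%lx pmd=%d zero=%d empty=%d wait_index=%lu\n",
                   entry_sector(e), !!(e & DAX_PMD), !!(e & DAX_ZERO_PAGE),
                   !!(e & DAX_EMPTY), wait_key_index(515, 1));
            return 0;
    }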
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry) | |||
66 | 103 | ||
67 | static int dax_is_zero_entry(void *entry) | 104 | static int dax_is_zero_entry(void *entry) |
68 | { | 105 | { |
69 | return (unsigned long)entry & RADIX_DAX_HZP; | 106 | return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; |
70 | } | 107 | } |
71 | 108 | ||
72 | static int dax_is_empty_entry(void *entry) | 109 | static int dax_is_empty_entry(void *entry) |
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, | |||
98 | * the range covered by the PMD map to the same bit lock. | 135 | * the range covered by the PMD map to the same bit lock. |
99 | */ | 136 | */ |
100 | if (dax_is_pmd_entry(entry)) | 137 | if (dax_is_pmd_entry(entry)) |
101 | index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); | 138 | index &= ~PG_PMD_COLOUR; |
102 | 139 | ||
103 | key->mapping = mapping; | 140 | key->mapping = mapping; |
104 | key->entry_start = index; | 141 | key->entry_start = index; |
@@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo | |||
121 | } | 158 | } |
122 | 159 | ||
123 | /* | 160 | /* |
161 | * We do not necessarily hold the mapping->tree_lock when we call this | ||
162 | * function so it is possible that 'entry' is no longer a valid item in the | ||
163 | * radix tree. This is okay because all we really need to do is to find the | ||
164 | * correct waitqueue where tasks might be waiting for that old 'entry' and | ||
165 | * wake them. | ||
166 | */ | ||
167 | static void dax_wake_mapping_entry_waiter(struct address_space *mapping, | ||
168 | pgoff_t index, void *entry, bool wake_all) | ||
169 | { | ||
170 | struct exceptional_entry_key key; | ||
171 | wait_queue_head_t *wq; | ||
172 | |||
173 | wq = dax_entry_waitqueue(mapping, index, entry, &key); | ||
174 | |||
175 | /* | ||
176 | * Checking for locked entry and prepare_to_wait_exclusive() happens | ||
177 | * under mapping->tree_lock, ditto for entry handling in our callers. | ||
178 | * So at this point all tasks that could have seen our entry locked | ||
179 | * must be in the waitqueue and the following check will see them. | ||
180 | */ | ||
181 | if (waitqueue_active(wq)) | ||
182 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | ||
183 | } | ||
184 | |||
185 | /* | ||
124 | * Check whether the given slot is locked. The function must be called with | 186 | * Check whether the given slot is locked. The function must be called with |
125 | * mapping->tree_lock held | 187 | * mapping->tree_lock held |
126 | */ | 188 | */ |
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, | |||
181 | for (;;) { | 243 | for (;;) { |
182 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, | 244 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, |
183 | &slot); | 245 | &slot); |
184 | if (!entry || !radix_tree_exceptional_entry(entry) || | 246 | if (!entry || |
247 | WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || | ||
185 | !slot_locked(mapping, slot)) { | 248 | !slot_locked(mapping, slot)) { |
186 | if (slotp) | 249 | if (slotp) |
187 | *slotp = slot; | 250 | *slotp = slot; |
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, | |||
216 | } | 279 | } |
217 | 280 | ||
218 | static void put_locked_mapping_entry(struct address_space *mapping, | 281 | static void put_locked_mapping_entry(struct address_space *mapping, |
219 | pgoff_t index, void *entry) | 282 | pgoff_t index) |
220 | { | 283 | { |
221 | if (!radix_tree_exceptional_entry(entry)) { | 284 | dax_unlock_mapping_entry(mapping, index); |
222 | unlock_page(entry); | ||
223 | put_page(entry); | ||
224 | } else { | ||
225 | dax_unlock_mapping_entry(mapping, index); | ||
226 | } | ||
227 | } | 285 | } |
228 | 286 | ||
229 | /* | 287 | /* |
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, | |||
233 | static void put_unlocked_mapping_entry(struct address_space *mapping, | 291 | static void put_unlocked_mapping_entry(struct address_space *mapping, |
234 | pgoff_t index, void *entry) | 292 | pgoff_t index, void *entry) |
235 | { | 293 | { |
236 | if (!radix_tree_exceptional_entry(entry)) | 294 | if (!entry) |
237 | return; | 295 | return; |
238 | 296 | ||
239 | /* We have to wake up next waiter for the radix tree entry lock */ | 297 | /* We have to wake up next waiter for the radix tree entry lock */ |
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, | |||
241 | } | 299 | } |
242 | 300 | ||
243 | /* | 301 | /* |
244 | * Find radix tree entry at given index. If it points to a page, return with | 302 | * Find radix tree entry at given index. If it points to an exceptional entry, |
245 | * the page locked. If it points to the exceptional entry, return with the | 303 | * return it with the radix tree entry locked. If the radix tree doesn't |
246 | * radix tree entry locked. If the radix tree doesn't contain given index, | 304 | * contain given index, create an empty exceptional entry for the index and |
247 | * create empty exceptional entry for the index and return with it locked. | 305 | * return with it locked. |
248 | * | 306 | * |
249 | * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will | 307 | * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will |
250 | * either return that locked entry or will return an error. This error will | 308 | * either return that locked entry or will return an error. This error will |
251 | * happen if there are any 4k entries (either zero pages or DAX entries) | 309 | * happen if there are any 4k entries within the 2MiB range that we are |
252 | * within the 2MiB range that we are requesting. | 310 | * requesting. |
253 | * | 311 | * |
254 | * We always favor 4k entries over 2MiB entries. There isn't a flow where we | 312 | * We always favor 4k entries over 2MiB entries. There isn't a flow where we |
255 | * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB | 313 | * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB |
@@ -276,18 +334,21 @@ restart: | |||
276 | spin_lock_irq(&mapping->tree_lock); | 334 | spin_lock_irq(&mapping->tree_lock); |
277 | entry = get_unlocked_mapping_entry(mapping, index, &slot); | 335 | entry = get_unlocked_mapping_entry(mapping, index, &slot); |
278 | 336 | ||
337 | if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { | ||
338 | entry = ERR_PTR(-EIO); | ||
339 | goto out_unlock; | ||
340 | } | ||
341 | |||
279 | if (entry) { | 342 | if (entry) { |
280 | if (size_flag & RADIX_DAX_PMD) { | 343 | if (size_flag & RADIX_DAX_PMD) { |
281 | if (!radix_tree_exceptional_entry(entry) || | 344 | if (dax_is_pte_entry(entry)) { |
282 | dax_is_pte_entry(entry)) { | ||
283 | put_unlocked_mapping_entry(mapping, index, | 345 | put_unlocked_mapping_entry(mapping, index, |
284 | entry); | 346 | entry); |
285 | entry = ERR_PTR(-EEXIST); | 347 | entry = ERR_PTR(-EEXIST); |
286 | goto out_unlock; | 348 | goto out_unlock; |
287 | } | 349 | } |
288 | } else { /* trying to grab a PTE entry */ | 350 | } else { /* trying to grab a PTE entry */ |
289 | if (radix_tree_exceptional_entry(entry) && | 351 | if (dax_is_pmd_entry(entry) && |
290 | dax_is_pmd_entry(entry) && | ||
291 | (dax_is_zero_entry(entry) || | 352 | (dax_is_zero_entry(entry) || |
292 | dax_is_empty_entry(entry))) { | 353 | dax_is_empty_entry(entry))) { |
293 | pmd_downgrade = true; | 354 | pmd_downgrade = true; |
@@ -321,7 +382,7 @@ restart: | |||
321 | mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); | 382 | mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); |
322 | if (err) { | 383 | if (err) { |
323 | if (pmd_downgrade) | 384 | if (pmd_downgrade) |
324 | put_locked_mapping_entry(mapping, index, entry); | 385 | put_locked_mapping_entry(mapping, index); |
325 | return ERR_PTR(err); | 386 | return ERR_PTR(err); |
326 | } | 387 | } |
327 | spin_lock_irq(&mapping->tree_lock); | 388 | spin_lock_irq(&mapping->tree_lock); |
@@ -371,52 +432,12 @@ restart: | |||
371 | spin_unlock_irq(&mapping->tree_lock); | 432 | spin_unlock_irq(&mapping->tree_lock); |
372 | return entry; | 433 | return entry; |
373 | } | 434 | } |
374 | /* Normal page in radix tree? */ | ||
375 | if (!radix_tree_exceptional_entry(entry)) { | ||
376 | struct page *page = entry; | ||
377 | |||
378 | get_page(page); | ||
379 | spin_unlock_irq(&mapping->tree_lock); | ||
380 | lock_page(page); | ||
381 | /* Page got truncated? Retry... */ | ||
382 | if (unlikely(page->mapping != mapping)) { | ||
383 | unlock_page(page); | ||
384 | put_page(page); | ||
385 | goto restart; | ||
386 | } | ||
387 | return page; | ||
388 | } | ||
389 | entry = lock_slot(mapping, slot); | 435 | entry = lock_slot(mapping, slot); |
390 | out_unlock: | 436 | out_unlock: |
391 | spin_unlock_irq(&mapping->tree_lock); | 437 | spin_unlock_irq(&mapping->tree_lock); |
392 | return entry; | 438 | return entry; |
393 | } | 439 | } |
394 | 440 | ||
395 | /* | ||
396 | * We do not necessarily hold the mapping->tree_lock when we call this | ||
397 | * function so it is possible that 'entry' is no longer a valid item in the | ||
398 | * radix tree. This is okay because all we really need to do is to find the | ||
399 | * correct waitqueue where tasks might be waiting for that old 'entry' and | ||
400 | * wake them. | ||
401 | */ | ||
402 | void dax_wake_mapping_entry_waiter(struct address_space *mapping, | ||
403 | pgoff_t index, void *entry, bool wake_all) | ||
404 | { | ||
405 | struct exceptional_entry_key key; | ||
406 | wait_queue_head_t *wq; | ||
407 | |||
408 | wq = dax_entry_waitqueue(mapping, index, entry, &key); | ||
409 | |||
410 | /* | ||
411 | * Checking for locked entry and prepare_to_wait_exclusive() happens | ||
412 | * under mapping->tree_lock, ditto for entry handling in our callers. | ||
413 | * So at this point all tasks that could have seen our entry locked | ||
414 | * must be in the waitqueue and the following check will see them. | ||
415 | */ | ||
416 | if (waitqueue_active(wq)) | ||
417 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | ||
418 | } | ||
419 | |||
420 | static int __dax_invalidate_mapping_entry(struct address_space *mapping, | 441 | static int __dax_invalidate_mapping_entry(struct address_space *mapping, |
421 | pgoff_t index, bool trunc) | 442 | pgoff_t index, bool trunc) |
422 | { | 443 | { |
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, | |||
426 | 447 | ||
427 | spin_lock_irq(&mapping->tree_lock); | 448 | spin_lock_irq(&mapping->tree_lock); |
428 | entry = get_unlocked_mapping_entry(mapping, index, NULL); | 449 | entry = get_unlocked_mapping_entry(mapping, index, NULL); |
429 | if (!entry || !radix_tree_exceptional_entry(entry)) | 450 | if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) |
430 | goto out; | 451 | goto out; |
431 | if (!trunc && | 452 | if (!trunc && |
432 | (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || | 453 | (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || |
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, | |||
468 | return __dax_invalidate_mapping_entry(mapping, index, false); | 489 | return __dax_invalidate_mapping_entry(mapping, index, false); |
469 | } | 490 | } |
470 | 491 | ||
471 | /* | ||
472 | * The user has performed a load from a hole in the file. Allocating | ||
473 | * a new page in the file would cause excessive storage usage for | ||
474 | * workloads with sparse files. We allocate a page cache page instead. | ||
475 | * We'll kick it out of the page cache if it's ever written to, | ||
476 | * otherwise it will simply fall out of the page cache under memory | ||
477 | * pressure without ever having been dirtied. | ||
478 | */ | ||
479 | static int dax_load_hole(struct address_space *mapping, void **entry, | ||
480 | struct vm_fault *vmf) | ||
481 | { | ||
482 | struct inode *inode = mapping->host; | ||
483 | struct page *page; | ||
484 | int ret; | ||
485 | |||
486 | /* Hole page already exists? Return it... */ | ||
487 | if (!radix_tree_exceptional_entry(*entry)) { | ||
488 | page = *entry; | ||
489 | goto finish_fault; | ||
490 | } | ||
491 | |||
492 | /* This will replace locked radix tree entry with a hole page */ | ||
493 | page = find_or_create_page(mapping, vmf->pgoff, | ||
494 | vmf->gfp_mask | __GFP_ZERO); | ||
495 | if (!page) { | ||
496 | ret = VM_FAULT_OOM; | ||
497 | goto out; | ||
498 | } | ||
499 | |||
500 | finish_fault: | ||
501 | vmf->page = page; | ||
502 | ret = finish_fault(vmf); | ||
503 | vmf->page = NULL; | ||
504 | *entry = page; | ||
505 | if (!ret) { | ||
506 | /* Grab reference for PTE that is now referencing the page */ | ||
507 | get_page(page); | ||
508 | ret = VM_FAULT_NOPAGE; | ||
509 | } | ||
510 | out: | ||
511 | trace_dax_load_hole(inode, vmf, ret); | ||
512 | return ret; | ||
513 | } | ||
514 | |||
515 | static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, | 492 | static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, |
516 | sector_t sector, size_t size, struct page *to, | 493 | sector_t sector, size_t size, struct page *to, |
517 | unsigned long vaddr) | 494 | unsigned long vaddr) |
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, | |||
552 | unsigned long flags) | 529 | unsigned long flags) |
553 | { | 530 | { |
554 | struct radix_tree_root *page_tree = &mapping->page_tree; | 531 | struct radix_tree_root *page_tree = &mapping->page_tree; |
555 | int error = 0; | ||
556 | bool hole_fill = false; | ||
557 | void *new_entry; | 532 | void *new_entry; |
558 | pgoff_t index = vmf->pgoff; | 533 | pgoff_t index = vmf->pgoff; |
559 | 534 | ||
560 | if (vmf->flags & FAULT_FLAG_WRITE) | 535 | if (vmf->flags & FAULT_FLAG_WRITE) |
561 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 536 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
562 | 537 | ||
563 | /* Replacing hole page with block mapping? */ | 538 | if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { |
564 | if (!radix_tree_exceptional_entry(entry)) { | 539 | /* we are replacing a zero page with block mapping */ |
565 | hole_fill = true; | 540 | if (dax_is_pmd_entry(entry)) |
566 | /* | 541 | unmap_mapping_range(mapping, |
567 | * Unmap the page now before we remove it from page cache below. | 542 | (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, |
568 | * The page is locked so it cannot be faulted in again. | 543 | PMD_SIZE, 0); |
569 | */ | 544 | else /* pte entry */ |
570 | unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, | 545 | unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, |
571 | PAGE_SIZE, 0); | 546 | PAGE_SIZE, 0); |
572 | error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); | ||
573 | if (error) | ||
574 | return ERR_PTR(error); | ||
575 | } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { | ||
576 | /* replacing huge zero page with PMD block mapping */ | ||
577 | unmap_mapping_range(mapping, | ||
578 | (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); | ||
579 | } | 547 | } |
580 | 548 | ||
581 | spin_lock_irq(&mapping->tree_lock); | 549 | spin_lock_irq(&mapping->tree_lock); |
582 | new_entry = dax_radix_locked_entry(sector, flags); | 550 | new_entry = dax_radix_locked_entry(sector, flags); |
583 | 551 | ||
584 | if (hole_fill) { | 552 | if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
585 | __delete_from_page_cache(entry, NULL); | ||
586 | /* Drop pagecache reference */ | ||
587 | put_page(entry); | ||
588 | error = __radix_tree_insert(page_tree, index, | ||
589 | dax_radix_order(new_entry), new_entry); | ||
590 | if (error) { | ||
591 | new_entry = ERR_PTR(error); | ||
592 | goto unlock; | ||
593 | } | ||
594 | mapping->nrexceptional++; | ||
595 | } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { | ||
596 | /* | 553 | /* |
597 | * Only swap our new entry into the radix tree if the current | 554 | * Only swap our new entry into the radix tree if the current |
598 | * entry is a zero page or an empty entry. If a normal PTE or | 555 | * entry is a zero page or an empty entry. If a normal PTE or |
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, | |||
609 | WARN_ON_ONCE(ret != entry); | 566 | WARN_ON_ONCE(ret != entry); |
610 | __radix_tree_replace(page_tree, node, slot, | 567 | __radix_tree_replace(page_tree, node, slot, |
611 | new_entry, NULL, NULL); | 568 | new_entry, NULL, NULL); |
569 | entry = new_entry; | ||
612 | } | 570 | } |
571 | |||
613 | if (vmf->flags & FAULT_FLAG_WRITE) | 572 | if (vmf->flags & FAULT_FLAG_WRITE) |
614 | radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); | 573 | radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); |
615 | unlock: | 574 | |
616 | spin_unlock_irq(&mapping->tree_lock); | 575 | spin_unlock_irq(&mapping->tree_lock); |
617 | if (hole_fill) { | 576 | return entry; |
618 | radix_tree_preload_end(); | ||
619 | /* | ||
620 | * We don't need hole page anymore, it has been replaced with | ||
621 | * locked radix tree entry now. | ||
622 | */ | ||
623 | if (mapping->a_ops->freepage) | ||
624 | mapping->a_ops->freepage(entry); | ||
625 | unlock_page(entry); | ||
626 | put_page(entry); | ||
627 | } | ||
628 | return new_entry; | ||
629 | } | 577 | } |
630 | 578 | ||
631 | static inline unsigned long | 579 | static inline unsigned long |
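When dax_insert_mapping_entry() replaces a zero-page entry with a real block mapping it first unmaps the stale mapping: the whole PMD-aligned 2MiB region for a PMD entry, a single page otherwise. A small sketch of just that range computation, with illustrative x86-64 constants.

    #define PAGE_SHIFT_STUB 12
    #define PMD_SHIFT_STUB  21                              /* x86-64 value, illustrative */
    #define PAGE_SIZE_STUB  (1UL << PAGE_SHIFT_STUB)
    #define PMD_SIZE_STUB   (1UL << PMD_SHIFT_STUB)
    #define PMD_MASK_STUB   (~(PMD_SIZE_STUB - 1))

    /*
     * Byte range handed to unmap_mapping_range() when a zero-page entry is
     * replaced by a block mapping: the containing 2MiB block for a PMD
     * entry, one page for a PTE entry.
     */
    static void zero_entry_unmap_range(unsigned long pgoff, int is_pmd,
                                       unsigned long *start, unsigned long *len)
    {
            if (is_pmd) {
                    *start = (pgoff << PAGE_SHIFT_STUB) & PMD_MASK_STUB;
                    *len = PMD_SIZE_STUB;
            } else {
                    *start = pgoff << PAGE_SHIFT_STUB;
                    *len = PAGE_SIZE_STUB;
            }
    }

    int main(void)
    {
            unsigned long start, len;

            /* pgoff 515 lives in the 2MiB block starting at byte offset 0x200000 */
            zero_entry_unmap_range(515, 1, &start, &len);
            return (start == 0x200000 && len == PMD_SIZE_STUB) ? 0 : 1;
    }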
@@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev, | |||
727 | spin_lock_irq(&mapping->tree_lock); | 675 | spin_lock_irq(&mapping->tree_lock); |
728 | entry2 = get_unlocked_mapping_entry(mapping, index, &slot); | 676 | entry2 = get_unlocked_mapping_entry(mapping, index, &slot); |
729 | /* Entry got punched out / reallocated? */ | 677 | /* Entry got punched out / reallocated? */ |
730 | if (!entry2 || !radix_tree_exceptional_entry(entry2)) | 678 | if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) |
731 | goto put_unlocked; | 679 | goto put_unlocked; |
732 | /* | 680 | /* |
733 | * Entry got reallocated elsewhere? No need to writeback. We have to | 681 | * Entry got reallocated elsewhere? No need to writeback. We have to |
@@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev, | |||
799 | trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); | 747 | trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); |
800 | dax_unlock: | 748 | dax_unlock: |
801 | dax_read_unlock(id); | 749 | dax_read_unlock(id); |
802 | put_locked_mapping_entry(mapping, index, entry); | 750 | put_locked_mapping_entry(mapping, index); |
803 | return ret; | 751 | return ret; |
804 | 752 | ||
805 | put_unlocked: | 753 | put_unlocked: |
@@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); | |||
874 | 822 | ||
875 | static int dax_insert_mapping(struct address_space *mapping, | 823 | static int dax_insert_mapping(struct address_space *mapping, |
876 | struct block_device *bdev, struct dax_device *dax_dev, | 824 | struct block_device *bdev, struct dax_device *dax_dev, |
877 | sector_t sector, size_t size, void **entryp, | 825 | sector_t sector, size_t size, void *entry, |
878 | struct vm_area_struct *vma, struct vm_fault *vmf) | 826 | struct vm_area_struct *vma, struct vm_fault *vmf) |
879 | { | 827 | { |
880 | unsigned long vaddr = vmf->address; | 828 | unsigned long vaddr = vmf->address; |
881 | void *entry = *entryp; | ||
882 | void *ret, *kaddr; | 829 | void *ret, *kaddr; |
883 | pgoff_t pgoff; | 830 | pgoff_t pgoff; |
884 | int id, rc; | 831 | int id, rc; |
@@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping, | |||
899 | ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); | 846 | ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); |
900 | if (IS_ERR(ret)) | 847 | if (IS_ERR(ret)) |
901 | return PTR_ERR(ret); | 848 | return PTR_ERR(ret); |
902 | *entryp = ret; | ||
903 | 849 | ||
904 | trace_dax_insert_mapping(mapping->host, vmf, ret); | 850 | trace_dax_insert_mapping(mapping->host, vmf, ret); |
905 | return vm_insert_mixed(vma, vaddr, pfn); | 851 | if (vmf->flags & FAULT_FLAG_WRITE) |
852 | return vm_insert_mixed_mkwrite(vma, vaddr, pfn); | ||
853 | else | ||
854 | return vm_insert_mixed(vma, vaddr, pfn); | ||
906 | } | 855 | } |
907 | 856 | ||
908 | /** | 857 | /* |
909 | * dax_pfn_mkwrite - handle first write to DAX page | 858 | * The user has performed a load from a hole in the file. Allocating a new |
910 | * @vmf: The description of the fault | 859 | * page in the file would cause excessive storage usage for workloads with |
860 | * sparse files. Instead we insert a read-only mapping of the 4k zero page. | ||
861 | * If this page is ever written to we will re-fault and change the mapping to | ||
862 | * point to real DAX storage instead. | ||
911 | */ | 863 | */ |
912 | int dax_pfn_mkwrite(struct vm_fault *vmf) | 864 | static int dax_load_hole(struct address_space *mapping, void *entry, |
865 | struct vm_fault *vmf) | ||
913 | { | 866 | { |
914 | struct file *file = vmf->vma->vm_file; | ||
915 | struct address_space *mapping = file->f_mapping; | ||
916 | struct inode *inode = mapping->host; | 867 | struct inode *inode = mapping->host; |
917 | void *entry, **slot; | 868 | unsigned long vaddr = vmf->address; |
918 | pgoff_t index = vmf->pgoff; | 869 | int ret = VM_FAULT_NOPAGE; |
870 | struct page *zero_page; | ||
871 | void *entry2; | ||
919 | 872 | ||
920 | spin_lock_irq(&mapping->tree_lock); | 873 | zero_page = ZERO_PAGE(0); |
921 | entry = get_unlocked_mapping_entry(mapping, index, &slot); | 874 | if (unlikely(!zero_page)) { |
922 | if (!entry || !radix_tree_exceptional_entry(entry)) { | 875 | ret = VM_FAULT_OOM; |
923 | if (entry) | 876 | goto out; |
924 | put_unlocked_mapping_entry(mapping, index, entry); | ||
925 | spin_unlock_irq(&mapping->tree_lock); | ||
926 | trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); | ||
927 | return VM_FAULT_NOPAGE; | ||
928 | } | 877 | } |
929 | radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); | 878 | |
930 | entry = lock_slot(mapping, slot); | 879 | entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, |
931 | spin_unlock_irq(&mapping->tree_lock); | 880 | RADIX_DAX_ZERO_PAGE); |
932 | /* | 881 | if (IS_ERR(entry2)) { |
933 | * If we race with somebody updating the PTE and finish_mkwrite_fault() | 882 | ret = VM_FAULT_SIGBUS; |
934 | * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry | 883 | goto out; |
935 | * the fault in either case. | 884 | } |
936 | */ | 885 | |
937 | finish_mkwrite_fault(vmf); | 886 | vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); |
938 | put_locked_mapping_entry(mapping, index, entry); | 887 | out: |
939 | trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); | 888 | trace_dax_load_hole(inode, vmf, ret); |
940 | return VM_FAULT_NOPAGE; | 889 | return ret; |
941 | } | 890 | } |
942 | EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); | ||
943 | 891 | ||
944 | static bool dax_range_is_aligned(struct block_device *bdev, | 892 | static bool dax_range_is_aligned(struct block_device *bdev, |
945 | unsigned int offset, unsigned int length) | 893 | unsigned int offset, unsigned int length) |
@@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, | |||
1059 | if (map_len > end - pos) | 1007 | if (map_len > end - pos) |
1060 | map_len = end - pos; | 1008 | map_len = end - pos; |
1061 | 1009 | ||
1010 | /* | ||
1011 | * The userspace address for the memory copy has already been | ||
1012 | * validated via access_ok() in either vfs_read() or | ||
1013 | * vfs_write(), depending on which operation we are doing. | ||
1014 | */ | ||
1062 | if (iov_iter_rw(iter) == WRITE) | 1015 | if (iov_iter_rw(iter) == WRITE) |
1063 | map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, | 1016 | map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, |
1064 | map_len, iter); | 1017 | map_len, iter); |
@@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, | |||
1223 | major = VM_FAULT_MAJOR; | 1176 | major = VM_FAULT_MAJOR; |
1224 | } | 1177 | } |
1225 | error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, | 1178 | error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, |
1226 | sector, PAGE_SIZE, &entry, vmf->vma, vmf); | 1179 | sector, PAGE_SIZE, entry, vmf->vma, vmf); |
1227 | /* -EBUSY is fine, somebody else faulted on the same PTE */ | 1180 | /* -EBUSY is fine, somebody else faulted on the same PTE */ |
1228 | if (error == -EBUSY) | 1181 | if (error == -EBUSY) |
1229 | error = 0; | 1182 | error = 0; |
@@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, | |||
1231 | case IOMAP_UNWRITTEN: | 1184 | case IOMAP_UNWRITTEN: |
1232 | case IOMAP_HOLE: | 1185 | case IOMAP_HOLE: |
1233 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { | 1186 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { |
1234 | vmf_ret = dax_load_hole(mapping, &entry, vmf); | 1187 | vmf_ret = dax_load_hole(mapping, entry, vmf); |
1235 | goto finish_iomap; | 1188 | goto finish_iomap; |
1236 | } | 1189 | } |
1237 | /*FALLTHRU*/ | 1190 | /*FALLTHRU*/ |
@@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, | |||
1258 | ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); | 1211 | ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); |
1259 | } | 1212 | } |
1260 | unlock_entry: | 1213 | unlock_entry: |
1261 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); | 1214 | put_locked_mapping_entry(mapping, vmf->pgoff); |
1262 | out: | 1215 | out: |
1263 | trace_dax_pte_fault_done(inode, vmf, vmf_ret); | 1216 | trace_dax_pte_fault_done(inode, vmf, vmf_ret); |
1264 | return vmf_ret; | 1217 | return vmf_ret; |
1265 | } | 1218 | } |
1266 | 1219 | ||
1267 | #ifdef CONFIG_FS_DAX_PMD | 1220 | #ifdef CONFIG_FS_DAX_PMD |
1268 | /* | ||
1269 | * The 'colour' (ie low bits) within a PMD of a page offset. This comes up | ||
1270 | * more often than one might expect in the below functions. | ||
1271 | */ | ||
1272 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) | ||
1273 | |||
1274 | static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, | 1221 | static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, |
1275 | loff_t pos, void **entryp) | 1222 | loff_t pos, void *entry) |
1276 | { | 1223 | { |
1277 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; | 1224 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
1278 | const sector_t sector = dax_iomap_sector(iomap, pos); | 1225 | const sector_t sector = dax_iomap_sector(iomap, pos); |
@@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, | |||
1283 | void *ret = NULL, *kaddr; | 1230 | void *ret = NULL, *kaddr; |
1284 | long length = 0; | 1231 | long length = 0; |
1285 | pgoff_t pgoff; | 1232 | pgoff_t pgoff; |
1286 | pfn_t pfn; | 1233 | pfn_t pfn = {}; |
1287 | int id; | 1234 | int id; |
1288 | 1235 | ||
1289 | if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) | 1236 | if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) |
@@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, | |||
1303 | goto unlock_fallback; | 1250 | goto unlock_fallback; |
1304 | dax_read_unlock(id); | 1251 | dax_read_unlock(id); |
1305 | 1252 | ||
1306 | ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, | 1253 | ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, |
1307 | RADIX_DAX_PMD); | 1254 | RADIX_DAX_PMD); |
1308 | if (IS_ERR(ret)) | 1255 | if (IS_ERR(ret)) |
1309 | goto fallback; | 1256 | goto fallback; |
1310 | *entryp = ret; | ||
1311 | 1257 | ||
1312 | trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); | 1258 | trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); |
1313 | return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, | 1259 | return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, |
@@ -1321,7 +1267,7 @@ fallback: | |||
1321 | } | 1267 | } |
1322 | 1268 | ||
1323 | static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | 1269 | static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, |
1324 | void **entryp) | 1270 | void *entry) |
1325 | { | 1271 | { |
1326 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; | 1272 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
1327 | unsigned long pmd_addr = vmf->address & PMD_MASK; | 1273 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
@@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | |||
1336 | if (unlikely(!zero_page)) | 1282 | if (unlikely(!zero_page)) |
1337 | goto fallback; | 1283 | goto fallback; |
1338 | 1284 | ||
1339 | ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, | 1285 | ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, |
1340 | RADIX_DAX_PMD | RADIX_DAX_HZP); | 1286 | RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); |
1341 | if (IS_ERR(ret)) | 1287 | if (IS_ERR(ret)) |
1342 | goto fallback; | 1288 | goto fallback; |
1343 | *entryp = ret; | ||
1344 | 1289 | ||
1345 | ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); | 1290 | ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); |
1346 | if (!pmd_none(*(vmf->pmd))) { | 1291 | if (!pmd_none(*(vmf->pmd))) { |
@@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, | |||
1416 | goto fallback; | 1361 | goto fallback; |
1417 | 1362 | ||
1418 | /* | 1363 | /* |
1419 | * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX | 1364 | * grab_mapping_entry() will make sure we get a 2MiB empty entry, a |
1420 | * PMD or a HZP entry. If it can't (because a 4k page is already in | 1365 | * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page |
1421 | * the tree, for instance), it will return -EEXIST and we just fall | 1366 | * is already in the tree, for instance), it will return -EEXIST and |
1422 | * back to 4k entries. | 1367 | * we just fall back to 4k entries. |
1423 | */ | 1368 | */ |
1424 | entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); | 1369 | entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); |
1425 | if (IS_ERR(entry)) | 1370 | if (IS_ERR(entry)) |
@@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, | |||
1452 | 1397 | ||
1453 | switch (iomap.type) { | 1398 | switch (iomap.type) { |
1454 | case IOMAP_MAPPED: | 1399 | case IOMAP_MAPPED: |
1455 | result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); | 1400 | result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); |
1456 | break; | 1401 | break; |
1457 | case IOMAP_UNWRITTEN: | 1402 | case IOMAP_UNWRITTEN: |
1458 | case IOMAP_HOLE: | 1403 | case IOMAP_HOLE: |
1459 | if (WARN_ON_ONCE(write)) | 1404 | if (WARN_ON_ONCE(write)) |
1460 | break; | 1405 | break; |
1461 | result = dax_pmd_load_hole(vmf, &iomap, &entry); | 1406 | result = dax_pmd_load_hole(vmf, &iomap, entry); |
1462 | break; | 1407 | break; |
1463 | default: | 1408 | default: |
1464 | WARN_ON_ONCE(1); | 1409 | WARN_ON_ONCE(1); |
@@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, | |||
1481 | &iomap); | 1426 | &iomap); |
1482 | } | 1427 | } |
1483 | unlock_entry: | 1428 | unlock_entry: |
1484 | put_locked_mapping_entry(mapping, pgoff, entry); | 1429 | put_locked_mapping_entry(mapping, pgoff); |
1485 | fallback: | 1430 | fallback: |
1486 | if (result == VM_FAULT_FALLBACK) { | 1431 | if (result == VM_FAULT_FALLBACK) { |
1487 | split_huge_pmd(vma, vmf->pmd, vmf->address); | 1432 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
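A minimal sketch of the PTE insertion choice introduced in dax_insert_mapping() above; the helper name dax_insert_pfn() is hypothetical, but the two calls are exactly those in the hunk. Installing the PTE writable on a write fault is what lets the dedicated dax_pfn_mkwrite() handler be removed below.

#include <linux/mm.h>

/* Illustrative wrapper, assuming the same arguments as in the hunk above. */
static int dax_insert_pfn(struct vm_area_struct *vma, struct vm_fault *vmf,
			  unsigned long vaddr, pfn_t pfn)
{
	/* Write faults install a writable PTE up front ... */
	if (vmf->flags & FAULT_FLAG_WRITE)
		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
	/* ... read faults keep the read-only insertion as before. */
	return vm_insert_mixed(vma, vaddr, pfn);
}

With the PTE already writable, a later store to the same page no longer needs a separate pfn_mkwrite round trip just to dirty the radix tree entry.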
diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d34d32bdc944..ff3a3636a5ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c | |||
@@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf) | |||
107 | return ret; | 107 | return ret; |
108 | } | 108 | } |
109 | 109 | ||
110 | static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) | ||
111 | { | ||
112 | struct inode *inode = file_inode(vmf->vma->vm_file); | ||
113 | struct ext2_inode_info *ei = EXT2_I(inode); | ||
114 | loff_t size; | ||
115 | int ret; | ||
116 | |||
117 | sb_start_pagefault(inode->i_sb); | ||
118 | file_update_time(vmf->vma->vm_file); | ||
119 | down_read(&ei->dax_sem); | ||
120 | |||
121 | /* check that the faulting page hasn't raced with truncate */ | ||
122 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
123 | if (vmf->pgoff >= size) | ||
124 | ret = VM_FAULT_SIGBUS; | ||
125 | else | ||
126 | ret = dax_pfn_mkwrite(vmf); | ||
127 | |||
128 | up_read(&ei->dax_sem); | ||
129 | sb_end_pagefault(inode->i_sb); | ||
130 | return ret; | ||
131 | } | ||
132 | |||
133 | static const struct vm_operations_struct ext2_dax_vm_ops = { | 110 | static const struct vm_operations_struct ext2_dax_vm_ops = { |
134 | .fault = ext2_dax_fault, | 111 | .fault = ext2_dax_fault, |
135 | /* | 112 | /* |
@@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = { | |||
138 | * will always fail and fall back to regular faults. */ | 115 |
139 | */ | 116 | */ |
140 | .page_mkwrite = ext2_dax_fault, | 117 | .page_mkwrite = ext2_dax_fault, |
141 | .pfn_mkwrite = ext2_dax_pfn_mkwrite, | 118 | .pfn_mkwrite = ext2_dax_fault, |
142 | }; | 119 | }; |
143 | 120 | ||
144 | static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) | 121 | static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) |
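For context, a hedged sketch of what a DAX filesystem's vm_operations_struct looks like after this series; the myfs_* names are hypothetical, and the single handler would be expected to call dax_iomap_fault() with the filesystem's iomap ops, as the ext2 and ext4 hunks here do.

/* All three entry points share one DAX fault routine (illustrative only). */
static const struct vm_operations_struct myfs_dax_vm_ops = {
	.fault		= myfs_dax_fault,	/* hypothetical handler */
	.page_mkwrite	= myfs_dax_fault,
	.pfn_mkwrite	= myfs_dax_fault,	/* replaces dax_pfn_mkwrite() */
};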
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 197653ea6041..57dcaea762c3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -324,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) | |||
324 | return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); | 324 | return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); |
325 | } | 325 | } |
326 | 326 | ||
327 | /* | ||
328 | * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() | ||
329 | * handler we check for races against truncate. Note that since we cycle through | ||
330 | * i_mmap_sem, we are sure that also any hole punching that began before we | ||
331 | * were called is finished by now and so if it included part of the file we | ||
332 | * are working on, our pte will get unmapped and the check for pte_same() in | ||
333 | * wp_pfn_shared() fails. Thus fault gets retried and things work out as | ||
334 | * desired. | ||
335 | */ | ||
336 | static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) | ||
337 | { | ||
338 | struct inode *inode = file_inode(vmf->vma->vm_file); | ||
339 | struct super_block *sb = inode->i_sb; | ||
340 | loff_t size; | ||
341 | int ret; | ||
342 | |||
343 | sb_start_pagefault(sb); | ||
344 | file_update_time(vmf->vma->vm_file); | ||
345 | down_read(&EXT4_I(inode)->i_mmap_sem); | ||
346 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
347 | if (vmf->pgoff >= size) | ||
348 | ret = VM_FAULT_SIGBUS; | ||
349 | else | ||
350 | ret = dax_pfn_mkwrite(vmf); | ||
351 | up_read(&EXT4_I(inode)->i_mmap_sem); | ||
352 | sb_end_pagefault(sb); | ||
353 | |||
354 | return ret; | ||
355 | } | ||
356 | |||
357 | static const struct vm_operations_struct ext4_dax_vm_ops = { | 327 | static const struct vm_operations_struct ext4_dax_vm_ops = { |
358 | .fault = ext4_dax_fault, | 328 | .fault = ext4_dax_fault, |
359 | .huge_fault = ext4_dax_huge_fault, | 329 | .huge_fault = ext4_dax_huge_fault, |
360 | .page_mkwrite = ext4_dax_fault, | 330 | .page_mkwrite = ext4_dax_fault, |
361 | .pfn_mkwrite = ext4_dax_pfn_mkwrite, | 331 | .pfn_mkwrite = ext4_dax_fault, |
362 | }; | 332 | }; |
363 | #else | 333 | #else |
364 | #define ext4_dax_vm_ops ext4_file_vm_ops | 334 | #define ext4_dax_vm_ops ext4_file_vm_ops |
@@ -507,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, | |||
507 | 477 | ||
508 | pagevec_init(&pvec, 0); | 478 | pagevec_init(&pvec, 0); |
509 | do { | 479 | do { |
510 | int i, num; | 480 | int i; |
511 | unsigned long nr_pages; | 481 | unsigned long nr_pages; |
512 | 482 | ||
513 | num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; | 483 | nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, |
514 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, | 484 | &index, end); |
515 | (pgoff_t)num); | ||
516 | if (nr_pages == 0) | 485 | if (nr_pages == 0) |
517 | break; | 486 | break; |
518 | 487 | ||
@@ -531,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, | |||
531 | goto out; | 500 | goto out; |
532 | } | 501 | } |
533 | 502 | ||
534 | if (page->index > end) | ||
535 | goto out; | ||
536 | |||
537 | lock_page(page); | 503 | lock_page(page); |
538 | 504 | ||
539 | if (unlikely(page->mapping != inode->i_mapping)) { | 505 | if (unlikely(page->mapping != inode->i_mapping)) { |
@@ -576,14 +542,10 @@ next: | |||
576 | unlock_page(page); | 542 | unlock_page(page); |
577 | } | 543 | } |
578 | 544 | ||
579 | /* The number of pages found is less than desired; we are done. */ | ||
580 | if (nr_pages < num) | ||
581 | break; | ||
582 | |||
583 | index = pvec.pages[i - 1]->index + 1; | ||
584 | pagevec_release(&pvec); | 545 | pagevec_release(&pvec); |
585 | } while (index <= end); | 546 | } while (index <= end); |
586 | 547 | ||
548 | /* There are no pages up to endoff - that range must be a hole. */ | ||
587 | if (whence == SEEK_HOLE && lastoff < endoff) { | 549 | if (whence == SEEK_HOLE && lastoff < endoff) { |
588 | found = 1; | 550 | found = 1; |
589 | *offset = lastoff; | 551 | *offset = lastoff; |
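A hedged sketch of the pagevec conversion repeated across the ext4, fscache, hugetlbfs and nilfs2 hunks below: pagevec_lookup_range() takes the starting index by reference and an inclusive end, so callers can drop the manual PAGEVEC_SIZE clamping, the page->index > end checks and the hand-rolled index advance. The walk_range() wrapper is illustrative only.

#include <linux/pagemap.h>
#include <linux/pagevec.h>

static void walk_range(struct address_space *mapping, pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	unsigned int i, nr;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		/* Advances 'index' and never returns pages past 'end'. */
		nr = pagevec_lookup_range(&pvec, mapping, &index, end);
		if (!nr)
			break;
		for (i = 0; i < nr; i++) {
			struct page *page = pvec.pages[i];

			/* process page; page->index is guaranteed <= end here */
			(void)page;
		}
		pagevec_release(&pvec);
	}
}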
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 714396760616..e963508ea35f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, | |||
1720 | 1720 | ||
1721 | pagevec_init(&pvec, 0); | 1721 | pagevec_init(&pvec, 0); |
1722 | while (index <= end) { | 1722 | while (index <= end) { |
1723 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | 1723 | nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); |
1724 | if (nr_pages == 0) | 1724 | if (nr_pages == 0) |
1725 | break; | 1725 | break; |
1726 | for (i = 0; i < nr_pages; i++) { | 1726 | for (i = 0; i < nr_pages; i++) { |
1727 | struct page *page = pvec.pages[i]; | 1727 | struct page *page = pvec.pages[i]; |
1728 | if (page->index > end) | 1728 | |
1729 | break; | ||
1730 | BUG_ON(!PageLocked(page)); | 1729 | BUG_ON(!PageLocked(page)); |
1731 | BUG_ON(PageWriteback(page)); | 1730 | BUG_ON(PageWriteback(page)); |
1732 | if (invalidate) { | 1731 | if (invalidate) { |
@@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, | |||
1737 | } | 1736 | } |
1738 | unlock_page(page); | 1737 | unlock_page(page); |
1739 | } | 1738 | } |
1740 | index = pvec.pages[nr_pages - 1]->index + 1; | ||
1741 | pagevec_release(&pvec); | 1739 | pagevec_release(&pvec); |
1742 | } | 1740 | } |
1743 | } | 1741 | } |
@@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) | |||
2348 | 2346 | ||
2349 | pagevec_init(&pvec, 0); | 2347 | pagevec_init(&pvec, 0); |
2350 | while (start <= end) { | 2348 | while (start <= end) { |
2351 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, | 2349 | nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, |
2352 | PAGEVEC_SIZE); | 2350 | &start, end); |
2353 | if (nr_pages == 0) | 2351 | if (nr_pages == 0) |
2354 | break; | 2352 | break; |
2355 | for (i = 0; i < nr_pages; i++) { | 2353 | for (i = 0; i < nr_pages; i++) { |
2356 | struct page *page = pvec.pages[i]; | 2354 | struct page *page = pvec.pages[i]; |
2357 | 2355 | ||
2358 | if (page->index > end) | ||
2359 | break; | ||
2360 | /* Up to 'end' pages must be contiguous */ | ||
2361 | BUG_ON(page->index != start); | ||
2362 | bh = head = page_buffers(page); | 2356 | bh = head = page_buffers(page); |
2363 | do { | 2357 | do { |
2364 | if (lblk < mpd->map.m_lblk) | 2358 | if (lblk < mpd->map.m_lblk) |
@@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) | |||
2403 | pagevec_release(&pvec); | 2397 | pagevec_release(&pvec); |
2404 | return err; | 2398 | return err; |
2405 | } | 2399 | } |
2406 | start++; | ||
2407 | } | 2400 | } |
2408 | pagevec_release(&pvec); | 2401 | pagevec_release(&pvec); |
2409 | } | 2402 | } |
diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c8c4f79c7ce1..0ad3fd3ad0b4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c | |||
@@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, | |||
1178 | pagevec_init(&pvec, 0); | 1178 | pagevec_init(&pvec, 0); |
1179 | next = 0; | 1179 | next = 0; |
1180 | do { | 1180 | do { |
1181 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) | 1181 | if (!pagevec_lookup(&pvec, mapping, &next)) |
1182 | break; | 1182 | break; |
1183 | for (i = 0; i < pagevec_count(&pvec); i++) { | 1183 | for (i = 0; i < pagevec_count(&pvec); i++) { |
1184 | struct page *page = pvec.pages[i]; | 1184 | struct page *page = pvec.pages[i]; |
1185 | next = page->index; | ||
1186 | if (PageFsCache(page)) { | 1185 | if (PageFsCache(page)) { |
1187 | __fscache_wait_on_page_write(cookie, page); | 1186 | __fscache_wait_on_page_write(cookie, page); |
1188 | __fscache_uncache_page(cookie, page); | 1187 | __fscache_uncache_page(cookie, page); |
@@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, | |||
1190 | } | 1189 | } |
1191 | pagevec_release(&pvec); | 1190 | pagevec_release(&pvec); |
1192 | cond_resched(); | 1191 | cond_resched(); |
1193 | } while (++next); | 1192 | } while (next); |
1194 | 1193 | ||
1195 | _leave(""); | 1194 | _leave(""); |
1196 | } | 1195 | } |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 28d2753be094..7c02b3f738e1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
401 | const pgoff_t end = lend >> huge_page_shift(h); | 401 | const pgoff_t end = lend >> huge_page_shift(h); |
402 | struct vm_area_struct pseudo_vma; | 402 | struct vm_area_struct pseudo_vma; |
403 | struct pagevec pvec; | 403 | struct pagevec pvec; |
404 | pgoff_t next; | 404 | pgoff_t next, index; |
405 | int i, freed = 0; | 405 | int i, freed = 0; |
406 | long lookup_nr = PAGEVEC_SIZE; | ||
407 | bool truncate_op = (lend == LLONG_MAX); | 406 | bool truncate_op = (lend == LLONG_MAX); |
408 | 407 | ||
409 | memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); | 408 | memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); |
@@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
412 | next = start; | 411 | next = start; |
413 | while (next < end) { | 412 | while (next < end) { |
414 | /* | 413 | /* |
415 | * Don't grab more pages than the number left in the range. | ||
416 | */ | ||
417 | if (end - next < lookup_nr) | ||
418 | lookup_nr = end - next; | ||
419 | |||
420 | /* | ||
421 | * When no more pages are found, we are done. | 414 | * When no more pages are found, we are done. |
422 | */ | 415 | */ |
423 | if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) | 416 | if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) |
424 | break; | 417 | break; |
425 | 418 | ||
426 | for (i = 0; i < pagevec_count(&pvec); ++i) { | 419 | for (i = 0; i < pagevec_count(&pvec); ++i) { |
427 | struct page *page = pvec.pages[i]; | 420 | struct page *page = pvec.pages[i]; |
428 | u32 hash; | 421 | u32 hash; |
429 | 422 | ||
430 | /* | 423 | index = page->index; |
431 | * The page (index) could be beyond end. This is | ||
432 | * only possible in the punch hole case as end is | ||
433 | * max page offset in the truncate case. | ||
434 | */ | ||
435 | next = page->index; | ||
436 | if (next >= end) | ||
437 | break; | ||
438 | |||
439 | hash = hugetlb_fault_mutex_hash(h, current->mm, | 424 | hash = hugetlb_fault_mutex_hash(h, current->mm, |
440 | &pseudo_vma, | 425 | &pseudo_vma, |
441 | mapping, next, 0); | 426 | mapping, index, 0); |
442 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 427 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
443 | 428 | ||
444 | /* | 429 | /* |
@@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
455 | 440 | ||
456 | i_mmap_lock_write(mapping); | 441 | i_mmap_lock_write(mapping); |
457 | hugetlb_vmdelete_list(&mapping->i_mmap, | 442 | hugetlb_vmdelete_list(&mapping->i_mmap, |
458 | next * pages_per_huge_page(h), | 443 | index * pages_per_huge_page(h), |
459 | (next + 1) * pages_per_huge_page(h)); | 444 | (index + 1) * pages_per_huge_page(h)); |
460 | i_mmap_unlock_write(mapping); | 445 | i_mmap_unlock_write(mapping); |
461 | } | 446 | } |
462 | 447 | ||
@@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
475 | freed++; | 460 | freed++; |
476 | if (!truncate_op) { | 461 | if (!truncate_op) { |
477 | if (unlikely(hugetlb_unreserve_pages(inode, | 462 | if (unlikely(hugetlb_unreserve_pages(inode, |
478 | next, next + 1, 1))) | 463 | index, index + 1, 1))) |
479 | hugetlb_fix_reserve_counts(inode); | 464 | hugetlb_fix_reserve_counts(inode); |
480 | } | 465 | } |
481 | 466 | ||
482 | unlock_page(page); | 467 | unlock_page(page); |
483 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 468 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
484 | } | 469 | } |
485 | ++next; | ||
486 | huge_pagevec_release(&pvec); | 470 | huge_pagevec_release(&pvec); |
487 | cond_resched(); | 471 | cond_resched(); |
488 | } | 472 | } |
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 777b055063f6..3025fe8584a0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c | |||
@@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, | |||
252 | } | 252 | } |
253 | 253 | ||
254 | /* | 254 | /* |
255 | * Indication from FS-Cache that the cookie is no longer cached | ||
256 | * - This function is called when the backing store currently caching a cookie | ||
257 | * is removed | ||
258 | * - The netfs should use this to clean up any markers indicating cached pages | ||
259 | * - This is mandatory for any object that may have data | ||
260 | */ | ||
261 | static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) | ||
262 | { | ||
263 | struct nfs_inode *nfsi = cookie_netfs_data; | ||
264 | struct pagevec pvec; | ||
265 | pgoff_t first; | ||
266 | int loop, nr_pages; | ||
267 | |||
268 | pagevec_init(&pvec, 0); | ||
269 | first = 0; | ||
270 | |||
271 | dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); | ||
272 | |||
273 | for (;;) { | ||
274 | /* grab a bunch of pages to unmark */ | ||
275 | nr_pages = pagevec_lookup(&pvec, | ||
276 | nfsi->vfs_inode.i_mapping, | ||
277 | first, | ||
278 | PAGEVEC_SIZE - pagevec_count(&pvec)); | ||
279 | if (!nr_pages) | ||
280 | break; | ||
281 | |||
282 | for (loop = 0; loop < nr_pages; loop++) | ||
283 | ClearPageFsCache(pvec.pages[loop]); | ||
284 | |||
285 | first = pvec.pages[nr_pages - 1]->index + 1; | ||
286 | |||
287 | pvec.nr = nr_pages; | ||
288 | pagevec_release(&pvec); | ||
289 | cond_resched(); | ||
290 | } | ||
291 | } | ||
292 | |||
293 | /* | ||
294 | * Get an extra reference on a read context. | 255 | * Get an extra reference on a read context. |
295 | * - This function can be absent if the completion function doesn't require a | 256 | * - This function can be absent if the completion function doesn't require a |
296 | * context. | 257 | * context. |
@@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = { | |||
330 | .get_attr = nfs_fscache_inode_get_attr, | 291 | .get_attr = nfs_fscache_inode_get_attr, |
331 | .get_aux = nfs_fscache_inode_get_aux, | 292 | .get_aux = nfs_fscache_inode_get_aux, |
332 | .check_aux = nfs_fscache_inode_check_aux, | 293 | .check_aux = nfs_fscache_inode_check_aux, |
333 | .now_uncached = nfs_fscache_inode_now_uncached, | ||
334 | .get_context = nfs_fh_get_context, | 294 | .get_context = nfs_fh_get_context, |
335 | .put_context = nfs_fh_put_context, | 295 | .put_context = nfs_fh_put_context, |
336 | }; | 296 | }; |
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index f11a3ad2df0c..8616c46d33da 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c | |||
@@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap, | |||
312 | 312 | ||
313 | pagevec_init(&pvec, 0); | 313 | pagevec_init(&pvec, 0); |
314 | repeat: | 314 | repeat: |
315 | n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); | 315 | n = pagevec_lookup(&pvec, smap, &index); |
316 | if (!n) | 316 | if (!n) |
317 | return; | 317 | return; |
318 | index = pvec.pages[n - 1]->index + 1; | ||
319 | 318 | ||
320 | for (i = 0; i < pagevec_count(&pvec); i++) { | 319 | for (i = 0; i < pagevec_count(&pvec); i++) { |
321 | struct page *page = pvec.pages[i], *dpage; | 320 | struct page *page = pvec.pages[i], *dpage; |
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e50a387959bf..40b5cc97f7b0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c | |||
@@ -221,7 +221,7 @@ out: | |||
221 | /* | 221 | /* |
222 | * Set the access or default ACL of an inode. | 222 | * Set the access or default ACL of an inode. |
223 | */ | 223 | */ |
224 | int ocfs2_set_acl(handle_t *handle, | 224 | static int ocfs2_set_acl(handle_t *handle, |
225 | struct inode *inode, | 225 | struct inode *inode, |
226 | struct buffer_head *di_bh, | 226 | struct buffer_head *di_bh, |
227 | int type, | 227 | int type, |
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 2783a75b3999..7be0bb756286 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h | |||
@@ -28,13 +28,6 @@ struct ocfs2_acl_entry { | |||
28 | 28 | ||
29 | struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); | 29 | struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); |
30 | int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); | 30 | int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); |
31 | int ocfs2_set_acl(handle_t *handle, | ||
32 | struct inode *inode, | ||
33 | struct buffer_head *di_bh, | ||
34 | int type, | ||
35 | struct posix_acl *acl, | ||
36 | struct ocfs2_alloc_context *meta_ac, | ||
37 | struct ocfs2_alloc_context *data_ac); | ||
38 | extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); | 31 | extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); |
39 | extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, | 32 | extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, |
40 | struct buffer_head *, struct buffer_head *, | 33 | struct buffer_head *, struct buffer_head *, |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fb15a96df0b6..a177eae3aa1a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, | |||
955 | /* | 955 | /* |
956 | * How many free extents have we got before we need more meta data? | 956 | * How many free extents have we got before we need more meta data? |
957 | */ | 957 | */ |
958 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | 958 | int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) |
959 | struct ocfs2_extent_tree *et) | ||
960 | { | 959 | { |
961 | int retval; | 960 | int retval; |
962 | struct ocfs2_extent_list *el = NULL; | 961 | struct ocfs2_extent_list *el = NULL; |
@@ -1933,14 +1932,12 @@ out: | |||
1933 | * the new changes. | 1932 | * the new changes. |
1934 | * | 1933 | * |
1935 | * left_rec: the record on the left. | 1934 | * left_rec: the record on the left. |
1936 | * left_child_el: is the child list pointed to by left_rec | ||
1937 | * right_rec: the record to the right of left_rec | 1935 | * right_rec: the record to the right of left_rec |
1938 | * right_child_el: is the child list pointed to by right_rec | 1936 | * right_child_el: is the child list pointed to by right_rec |
1939 | * | 1937 | * |
1940 | * By definition, this only works on interior nodes. | 1938 | * By definition, this only works on interior nodes. |
1941 | */ | 1939 | */ |
1942 | static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, | 1940 | static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, |
1943 | struct ocfs2_extent_list *left_child_el, | ||
1944 | struct ocfs2_extent_rec *right_rec, | 1941 | struct ocfs2_extent_rec *right_rec, |
1945 | struct ocfs2_extent_list *right_child_el) | 1942 | struct ocfs2_extent_list *right_child_el) |
1946 | { | 1943 | { |
@@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, | |||
2003 | */ | 2000 | */ |
2004 | BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); | 2001 | BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); |
2005 | 2002 | ||
2006 | ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, | 2003 | ocfs2_adjust_adjacent_records(&root_el->l_recs[i], |
2007 | &root_el->l_recs[i + 1], right_el); | 2004 | &root_el->l_recs[i + 1], right_el); |
2008 | } | 2005 | } |
2009 | 2006 | ||
@@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, | |||
2060 | el = right_path->p_node[i].el; | 2057 | el = right_path->p_node[i].el; |
2061 | right_rec = &el->l_recs[0]; | 2058 | right_rec = &el->l_recs[0]; |
2062 | 2059 | ||
2063 | ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, | 2060 | ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el); |
2064 | right_el); | ||
2065 | 2061 | ||
2066 | ocfs2_journal_dirty(handle, left_path->p_node[i].bh); | 2062 | ocfs2_journal_dirty(handle, left_path->p_node[i].bh); |
2067 | ocfs2_journal_dirty(handle, right_path->p_node[i].bh); | 2063 | ocfs2_journal_dirty(handle, right_path->p_node[i].bh); |
@@ -2509,7 +2505,7 @@ out_ret_path: | |||
2509 | 2505 | ||
2510 | static int ocfs2_update_edge_lengths(handle_t *handle, | 2506 | static int ocfs2_update_edge_lengths(handle_t *handle, |
2511 | struct ocfs2_extent_tree *et, | 2507 | struct ocfs2_extent_tree *et, |
2512 | int subtree_index, struct ocfs2_path *path) | 2508 | struct ocfs2_path *path) |
2513 | { | 2509 | { |
2514 | int i, idx, ret; | 2510 | int i, idx, ret; |
2515 | struct ocfs2_extent_rec *rec; | 2511 | struct ocfs2_extent_rec *rec; |
@@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, | |||
2755 | if (del_right_subtree) { | 2751 | if (del_right_subtree) { |
2756 | ocfs2_unlink_subtree(handle, et, left_path, right_path, | 2752 | ocfs2_unlink_subtree(handle, et, left_path, right_path, |
2757 | subtree_index, dealloc); | 2753 | subtree_index, dealloc); |
2758 | ret = ocfs2_update_edge_lengths(handle, et, subtree_index, | 2754 | ret = ocfs2_update_edge_lengths(handle, et, left_path); |
2759 | left_path); | ||
2760 | if (ret) { | 2755 | if (ret) { |
2761 | mlog_errno(ret); | 2756 | mlog_errno(ret); |
2762 | goto out; | 2757 | goto out; |
@@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, | |||
3060 | 3055 | ||
3061 | ocfs2_unlink_subtree(handle, et, left_path, path, | 3056 | ocfs2_unlink_subtree(handle, et, left_path, path, |
3062 | subtree_index, dealloc); | 3057 | subtree_index, dealloc); |
3063 | ret = ocfs2_update_edge_lengths(handle, et, subtree_index, | 3058 | ret = ocfs2_update_edge_lengths(handle, et, left_path); |
3064 | left_path); | ||
3065 | if (ret) { | 3059 | if (ret) { |
3066 | mlog_errno(ret); | 3060 | mlog_errno(ret); |
3067 | goto out; | 3061 | goto out; |
@@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, | |||
4790 | if (mark_unwritten) | 4784 | if (mark_unwritten) |
4791 | flags = OCFS2_EXT_UNWRITTEN; | 4785 | flags = OCFS2_EXT_UNWRITTEN; |
4792 | 4786 | ||
4793 | free_extents = ocfs2_num_free_extents(osb, et); | 4787 | free_extents = ocfs2_num_free_extents(et); |
4794 | if (free_extents < 0) { | 4788 | if (free_extents < 0) { |
4795 | status = free_extents; | 4789 | status = free_extents; |
4796 | mlog_errno(status); | 4790 | mlog_errno(status); |
@@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, | |||
5668 | 5662 | ||
5669 | *ac = NULL; | 5663 | *ac = NULL; |
5670 | 5664 | ||
5671 | num_free_extents = ocfs2_num_free_extents(osb, et); | 5665 | num_free_extents = ocfs2_num_free_extents(et); |
5672 | if (num_free_extents < 0) { | 5666 | if (num_free_extents < 0) { |
5673 | ret = num_free_extents; | 5667 | ret = num_free_extents; |
5674 | mlog_errno(ret); | 5668 | mlog_errno(ret); |
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4a5152ec88a3..27b75cf32cfa 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h | |||
@@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode, | |||
144 | struct ocfs2_cached_dealloc_ctxt *dealloc, | 144 | struct ocfs2_cached_dealloc_ctxt *dealloc, |
145 | u64 refcount_loc, bool refcount_tree_locked); | 145 | u64 refcount_loc, bool refcount_tree_locked); |
146 | 146 | ||
147 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | 147 | int ocfs2_num_free_extents(struct ocfs2_extent_tree *et); |
148 | struct ocfs2_extent_tree *et); | ||
149 | 148 | ||
150 | /* | 149 | /* |
151 | * how many new metadata chunks would an allocation need at maximum? | 150 | * how many new metadata chunks would an allocation need at maximum? |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ffe003982d95..56ac07cd35f6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, | |||
505 | } | 505 | } |
506 | } | 506 | } |
507 | 507 | ||
508 | static void o2hb_wait_on_io(struct o2hb_region *reg, | 508 | static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) |
509 | struct o2hb_bio_wait_ctxt *wc) | ||
510 | { | 509 | { |
511 | o2hb_bio_wait_dec(wc, 1); | 510 | o2hb_bio_wait_dec(wc, 1); |
512 | wait_for_completion(&wc->wc_io_complete); | 511 | wait_for_completion(&wc->wc_io_complete); |
@@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, | |||
608 | status = 0; | 607 | status = 0; |
609 | 608 | ||
610 | bail_and_wait: | 609 | bail_and_wait: |
611 | o2hb_wait_on_io(reg, &wc); | 610 | o2hb_wait_on_io(&wc); |
612 | if (wc.wc_error && !status) | 611 | if (wc.wc_error && !status) |
613 | status = wc.wc_error; | 612 | status = wc.wc_error; |
614 | 613 | ||
@@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
1162 | * before we can go to steady state. This ensures that | 1161 | * before we can go to steady state. This ensures that |
1163 | * people we find in our steady state have seen us. | 1162 | * people we find in our steady state have seen us. |
1164 | */ | 1163 | */ |
1165 | o2hb_wait_on_io(reg, &write_wc); | 1164 | o2hb_wait_on_io(&write_wc); |
1166 | if (write_wc.wc_error) { | 1165 | if (write_wc.wc_error) { |
1167 | /* Do not re-arm the write timeout on I/O error - we | 1166 | /* Do not re-arm the write timeout on I/O error - we |
1168 | * can't be sure that the new block ever made it to | 1167 | * can't be sure that the new block ever made it to |
@@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data) | |||
1275 | o2hb_prepare_block(reg, 0); | 1274 | o2hb_prepare_block(reg, 0); |
1276 | ret = o2hb_issue_node_write(reg, &write_wc); | 1275 | ret = o2hb_issue_node_write(reg, &write_wc); |
1277 | if (ret == 0) | 1276 | if (ret == 0) |
1278 | o2hb_wait_on_io(reg, &write_wc); | 1277 | o2hb_wait_on_io(&write_wc); |
1279 | else | 1278 | else |
1280 | mlog_errno(ret); | 1279 | mlog_errno(ret); |
1281 | } | 1280 | } |
@@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid, | |||
2576 | } | 2575 | } |
2577 | EXPORT_SYMBOL_GPL(o2hb_unregister_callback); | 2576 | EXPORT_SYMBOL_GPL(o2hb_unregister_callback); |
2578 | 2577 | ||
2579 | int o2hb_check_node_heartbeating(u8 node_num) | ||
2580 | { | ||
2581 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
2582 | |||
2583 | o2hb_fill_node_map(testing_map, sizeof(testing_map)); | ||
2584 | if (!test_bit(node_num, testing_map)) { | ||
2585 | mlog(ML_HEARTBEAT, | ||
2586 | "node (%u) does not have heartbeating enabled.\n", | ||
2587 | node_num); | ||
2588 | return 0; | ||
2589 | } | ||
2590 | |||
2591 | return 1; | ||
2592 | } | ||
2593 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); | ||
2594 | |||
2595 | int o2hb_check_node_heartbeating_no_sem(u8 node_num) | 2578 | int o2hb_check_node_heartbeating_no_sem(u8 node_num) |
2596 | { | 2579 | { |
2597 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 2580 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
@@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) | |||
2626 | } | 2609 | } |
2627 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); | 2610 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); |
2628 | 2611 | ||
2629 | /* Makes sure our local node is configured with a node number, and is | ||
2630 | * heartbeating. */ | ||
2631 | int o2hb_check_local_node_heartbeating(void) | ||
2632 | { | ||
2633 | u8 node_num; | ||
2634 | |||
2635 | /* if this node was set then we have networking */ | ||
2636 | node_num = o2nm_this_node(); | ||
2637 | if (node_num == O2NM_MAX_NODES) { | ||
2638 | mlog(ML_HEARTBEAT, "this node has not been configured.\n"); | ||
2639 | return 0; | ||
2640 | } | ||
2641 | |||
2642 | return o2hb_check_node_heartbeating(node_num); | ||
2643 | } | ||
2644 | EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); | ||
2645 | |||
2646 | /* | 2612 | /* |
2647 | * this is just a hack until we get the plumbing which flips file systems | 2613 | * this is just a hack until we get the plumbing which flips file systems |
2648 | * read only and drops the hb ref instead of killing the node dead. | 2614 | * read only and drops the hb ref instead of killing the node dead. |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 3ecb9f337b7d..febe6312ceff 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, | |||
3249 | spin_unlock(&OCFS2_I(dir)->ip_lock); | 3249 | spin_unlock(&OCFS2_I(dir)->ip_lock); |
3250 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), | 3250 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), |
3251 | parent_fe_bh); | 3251 | parent_fe_bh); |
3252 | num_free_extents = ocfs2_num_free_extents(osb, &et); | 3252 | num_free_extents = ocfs2_num_free_extents(&et); |
3253 | if (num_free_extents < 0) { | 3253 | if (num_free_extents < 0) { |
3254 | status = num_free_extents; | 3254 | status = num_free_extents; |
3255 | mlog_errno(status); | 3255 | mlog_errno(status); |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 66e59d3163ea..6e41fc8fabbe 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -713,13 +713,6 @@ leave: | |||
713 | return status; | 713 | return status; |
714 | } | 714 | } |
715 | 715 | ||
716 | int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, | ||
717 | u32 clusters_to_add, int mark_unwritten) | ||
718 | { | ||
719 | return __ocfs2_extend_allocation(inode, logical_start, | ||
720 | clusters_to_add, mark_unwritten); | ||
721 | } | ||
722 | |||
723 | /* | 716 | /* |
724 | * While a write will already be ordering the data, a truncate will not. | 717 | * While a write will already be ordering the data, a truncate will not. |
725 | * Thus, we need to explicitly order the zeroed pages. | 718 | * Thus, we need to explicitly order the zeroed pages. |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d5e5fa7f0743..36304434eacf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) | |||
1348 | ocfs2_schedule_truncate_log_flush(osb, 0); | 1348 | ocfs2_schedule_truncate_log_flush(osb, 0); |
1349 | 1349 | ||
1350 | osb->local_alloc_copy = NULL; | 1350 | osb->local_alloc_copy = NULL; |
1351 | osb->dirty = 0; | ||
1352 | 1351 | ||
1353 | /* queue to recover orphan slots for all offline slots */ | 1352 | /* queue to recover orphan slots for all offline slots */ |
1354 | ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); | 1353 | ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); |
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index e52a2852d50d..7eb3b0a6347e 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, | |||
175 | unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; | 175 | unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; |
176 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 176 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
177 | 177 | ||
178 | num_free_extents = ocfs2_num_free_extents(osb, et); | 178 | num_free_extents = ocfs2_num_free_extents(et); |
179 | if (num_free_extents < 0) { | 179 | if (num_free_extents < 0) { |
180 | ret = num_free_extents; | 180 | ret = num_free_extents; |
181 | mlog_errno(ret); | 181 | mlog_errno(ret); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 0c39d71c67a1..9a50f222ac97 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -320,7 +320,6 @@ struct ocfs2_super | |||
320 | u64 system_dir_blkno; | 320 | u64 system_dir_blkno; |
321 | u64 bitmap_blkno; | 321 | u64 bitmap_blkno; |
322 | u32 bitmap_cpg; | 322 | u32 bitmap_cpg; |
323 | u8 *uuid; | ||
324 | char *uuid_str; | 323 | char *uuid_str; |
325 | u32 uuid_hash; | 324 | u32 uuid_hash; |
326 | u8 *vol_label; | 325 | u8 *vol_label; |
@@ -388,9 +387,8 @@ struct ocfs2_super | |||
388 | unsigned int osb_resv_level; | 387 | unsigned int osb_resv_level; |
389 | unsigned int osb_dir_resv_level; | 388 | unsigned int osb_dir_resv_level; |
390 | 389 | ||
391 | /* Next three fields are for local node slot recovery during | 390 | /* Next two fields are for local node slot recovery during |
392 | * mount. */ | 391 | * mount. */ |
393 | int dirty; | ||
394 | struct ocfs2_dinode *local_alloc_copy; | 392 | struct ocfs2_dinode *local_alloc_copy; |
395 | struct ocfs2_quota_recovery *quota_rec; | 393 | struct ocfs2_quota_recovery *quota_rec; |
396 | 394 | ||
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index f8933cb53d68..ab156e35ec00 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, | |||
2851 | int *credits) | 2851 | int *credits) |
2852 | { | 2852 | { |
2853 | int ret = 0, meta_add = 0; | 2853 | int ret = 0, meta_add = 0; |
2854 | int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); | 2854 | int num_free_extents = ocfs2_num_free_extents(et); |
2855 | 2855 | ||
2856 | if (num_free_extents < 0) { | 2856 | if (num_free_extents < 0) { |
2857 | ret = num_free_extents; | 2857 | ret = num_free_extents; |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6ad3533940ba..71f22c8fbffd 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode, | |||
2700 | 2700 | ||
2701 | BUG_ON(clusters_to_add != 0 && data_ac == NULL); | 2701 | BUG_ON(clusters_to_add != 0 && data_ac == NULL); |
2702 | 2702 | ||
2703 | num_free_extents = ocfs2_num_free_extents(osb, et); | 2703 | num_free_extents = ocfs2_num_free_extents(et); |
2704 | if (num_free_extents < 0) { | 2704 | if (num_free_extents < 0) { |
2705 | ret = num_free_extents; | 2705 | ret = num_free_extents; |
2706 | mlog_errno(ret); | 2706 | mlog_errno(ret); |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 83005f486451..3f936be379a9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) | |||
2486 | if (dirty) { | 2486 | if (dirty) { |
2487 | /* Recovery will be completed after we've mounted the | 2487 | /* Recovery will be completed after we've mounted the |
2488 | * rest of the volume. */ | 2488 | * rest of the volume. */ |
2489 | osb->dirty = 1; | ||
2490 | osb->local_alloc_copy = local_alloc; | 2489 | osb->local_alloc_copy = local_alloc; |
2491 | local_alloc = NULL; | 2490 | local_alloc = NULL; |
2492 | } | 2491 | } |
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f70c3778d600..5fdf269ba82e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators( | |||
6800 | *credits += 1; | 6800 | *credits += 1; |
6801 | 6801 | ||
6802 | /* count in the xattr tree change. */ | 6802 | /* count in the xattr tree change. */ |
6803 | num_free_extents = ocfs2_num_free_extents(osb, xt_et); | 6803 | num_free_extents = ocfs2_num_free_extents(xt_et); |
6804 | if (num_free_extents < 0) { | 6804 | if (num_free_extents < 0) { |
6805 | ret = num_free_extents; | 6805 | ret = num_free_extents; |
6806 | mlog_errno(ret); | 6806 | mlog_errno(ret); |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 98fd8f6df851..e5d89a0d0b8a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
2931 | #ifdef CONFIG_PROC_PAGE_MONITOR | 2931 | #ifdef CONFIG_PROC_PAGE_MONITOR |
2932 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), | 2932 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
2933 | REG("smaps", S_IRUGO, proc_pid_smaps_operations), | 2933 | REG("smaps", S_IRUGO, proc_pid_smaps_operations), |
2934 | REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), | ||
2934 | REG("pagemap", S_IRUSR, proc_pagemap_operations), | 2935 | REG("pagemap", S_IRUSR, proc_pagemap_operations), |
2935 | #endif | 2936 | #endif |
2936 | #ifdef CONFIG_SECURITY | 2937 | #ifdef CONFIG_SECURITY |
@@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = { | |||
3324 | #ifdef CONFIG_PROC_PAGE_MONITOR | 3325 | #ifdef CONFIG_PROC_PAGE_MONITOR |
3325 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), | 3326 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
3326 | REG("smaps", S_IRUGO, proc_tid_smaps_operations), | 3327 | REG("smaps", S_IRUGO, proc_tid_smaps_operations), |
3328 | REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), | ||
3327 | REG("pagemap", S_IRUSR, proc_pagemap_operations), | 3329 | REG("pagemap", S_IRUSR, proc_pagemap_operations), |
3328 | #endif | 3330 | #endif |
3329 | #ifdef CONFIG_SECURITY | 3331 | #ifdef CONFIG_SECURITY |
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2b89071630..2cbfcd32e884 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
@@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *); | |||
269 | /* | 269 | /* |
270 | * task_[no]mmu.c | 270 | * task_[no]mmu.c |
271 | */ | 271 | */ |
272 | struct mem_size_stats; | ||
272 | struct proc_maps_private { | 273 | struct proc_maps_private { |
273 | struct inode *inode; | 274 | struct inode *inode; |
274 | struct task_struct *task; | 275 | struct task_struct *task; |
275 | struct mm_struct *mm; | 276 | struct mm_struct *mm; |
277 | struct mem_size_stats *rollup; | ||
276 | #ifdef CONFIG_MMU | 278 | #ifdef CONFIG_MMU |
277 | struct vm_area_struct *tail_vma; | 279 | struct vm_area_struct *tail_vma; |
278 | #endif | 280 | #endif |
@@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations; | |||
288 | extern const struct file_operations proc_pid_numa_maps_operations; | 290 | extern const struct file_operations proc_pid_numa_maps_operations; |
289 | extern const struct file_operations proc_tid_numa_maps_operations; | 291 | extern const struct file_operations proc_tid_numa_maps_operations; |
290 | extern const struct file_operations proc_pid_smaps_operations; | 292 | extern const struct file_operations proc_pid_smaps_operations; |
293 | extern const struct file_operations proc_pid_smaps_rollup_operations; | ||
291 | extern const struct file_operations proc_tid_smaps_operations; | 294 | extern const struct file_operations proc_tid_smaps_operations; |
292 | extern const struct file_operations proc_clear_refs_operations; | 295 | extern const struct file_operations proc_clear_refs_operations; |
293 | extern const struct file_operations proc_pagemap_operations; | 296 | extern const struct file_operations proc_pagemap_operations; |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 509a61668d90..cdd979724c74 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
80 | show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); | 80 | show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); |
81 | show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); | 81 | show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); |
82 | show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); | 82 | show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); |
83 | show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); | 83 | show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); |
84 | 84 | ||
85 | #ifdef CONFIG_HIGHMEM | 85 | #ifdef CONFIG_HIGHMEM |
86 | show_val_kb(m, "HighTotal: ", i.totalhigh); | 86 | show_val_kb(m, "HighTotal: ", i.totalhigh); |
@@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
114 | show_val_kb(m, "SUnreclaim: ", | 114 | show_val_kb(m, "SUnreclaim: ", |
115 | global_node_page_state(NR_SLAB_UNRECLAIMABLE)); | 115 | global_node_page_state(NR_SLAB_UNRECLAIMABLE)); |
116 | seq_printf(m, "KernelStack: %8lu kB\n", | 116 | seq_printf(m, "KernelStack: %8lu kB\n", |
117 | global_page_state(NR_KERNEL_STACK_KB)); | 117 | global_zone_page_state(NR_KERNEL_STACK_KB)); |
118 | show_val_kb(m, "PageTables: ", | 118 | show_val_kb(m, "PageTables: ", |
119 | global_page_state(NR_PAGETABLE)); | 119 | global_zone_page_state(NR_PAGETABLE)); |
120 | #ifdef CONFIG_QUICKLIST | 120 | #ifdef CONFIG_QUICKLIST |
121 | show_val_kb(m, "Quicklists: ", quicklist_total_size()); | 121 | show_val_kb(m, "Quicklists: ", quicklist_total_size()); |
122 | #endif | 122 | #endif |
@@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
124 | show_val_kb(m, "NFS_Unstable: ", | 124 | show_val_kb(m, "NFS_Unstable: ", |
125 | global_node_page_state(NR_UNSTABLE_NFS)); | 125 | global_node_page_state(NR_UNSTABLE_NFS)); |
126 | show_val_kb(m, "Bounce: ", | 126 | show_val_kb(m, "Bounce: ", |
127 | global_page_state(NR_BOUNCE)); | 127 | global_zone_page_state(NR_BOUNCE)); |
128 | show_val_kb(m, "WritebackTmp: ", | 128 | show_val_kb(m, "WritebackTmp: ", |
129 | global_node_page_state(NR_WRITEBACK_TEMP)); | 129 | global_node_page_state(NR_WRITEBACK_TEMP)); |
130 | show_val_kb(m, "CommitLimit: ", vm_commit_limit()); | 130 | show_val_kb(m, "CommitLimit: ", vm_commit_limit()); |
@@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
151 | #ifdef CONFIG_CMA | 151 | #ifdef CONFIG_CMA |
152 | show_val_kb(m, "CmaTotal: ", totalcma_pages); | 152 | show_val_kb(m, "CmaTotal: ", totalcma_pages); |
153 | show_val_kb(m, "CmaFree: ", | 153 | show_val_kb(m, "CmaFree: ", |
154 | global_page_state(NR_FREE_CMA_PAGES)); | 154 | global_zone_page_state(NR_FREE_CMA_PAGES)); |
155 | #endif | 155 | #endif |
156 | 156 | ||
157 | hugetlb_report_meminfo(m); | 157 | hugetlb_report_meminfo(m); |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fe8f3265e877..a290966f91ec 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file) | |||
253 | if (priv->mm) | 253 | if (priv->mm) |
254 | mmdrop(priv->mm); | 254 | mmdrop(priv->mm); |
255 | 255 | ||
256 | kfree(priv->rollup); | ||
256 | return seq_release_private(inode, file); | 257 | return seq_release_private(inode, file); |
257 | } | 258 | } |
258 | 259 | ||
@@ -279,6 +280,23 @@ static int is_stack(struct proc_maps_private *priv, | |||
279 | vma->vm_end >= vma->vm_mm->start_stack; | 280 | vma->vm_end >= vma->vm_mm->start_stack; |
280 | } | 281 | } |
281 | 282 | ||
283 | static void show_vma_header_prefix(struct seq_file *m, | ||
284 | unsigned long start, unsigned long end, | ||
285 | vm_flags_t flags, unsigned long long pgoff, | ||
286 | dev_t dev, unsigned long ino) | ||
287 | { | ||
288 | seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); | ||
289 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", | ||
290 | start, | ||
291 | end, | ||
292 | flags & VM_READ ? 'r' : '-', | ||
293 | flags & VM_WRITE ? 'w' : '-', | ||
294 | flags & VM_EXEC ? 'x' : '-', | ||
295 | flags & VM_MAYSHARE ? 's' : 'p', | ||
296 | pgoff, | ||
297 | MAJOR(dev), MINOR(dev), ino); | ||
298 | } | ||
299 | |||
282 | static void | 300 | static void |
283 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) | 301 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) |
284 | { | 302 | { |
@@ -301,17 +319,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) | |||
301 | 319 | ||
302 | start = vma->vm_start; | 320 | start = vma->vm_start; |
303 | end = vma->vm_end; | 321 | end = vma->vm_end; |
304 | 322 | show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); | |
305 | seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); | ||
306 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", | ||
307 | start, | ||
308 | end, | ||
309 | flags & VM_READ ? 'r' : '-', | ||
310 | flags & VM_WRITE ? 'w' : '-', | ||
311 | flags & VM_EXEC ? 'x' : '-', | ||
312 | flags & VM_MAYSHARE ? 's' : 'p', | ||
313 | pgoff, | ||
314 | MAJOR(dev), MINOR(dev), ino); | ||
315 | 323 | ||
316 | /* | 324 | /* |
317 | * Print the dentry name for named mappings, and a | 325 | * Print the dentry name for named mappings, and a |
@@ -430,6 +438,7 @@ const struct file_operations proc_tid_maps_operations = { | |||
430 | 438 | ||
431 | #ifdef CONFIG_PROC_PAGE_MONITOR | 439 | #ifdef CONFIG_PROC_PAGE_MONITOR |
432 | struct mem_size_stats { | 440 | struct mem_size_stats { |
441 | bool first; | ||
433 | unsigned long resident; | 442 | unsigned long resident; |
434 | unsigned long shared_clean; | 443 | unsigned long shared_clean; |
435 | unsigned long shared_dirty; | 444 | unsigned long shared_dirty; |
@@ -443,7 +452,9 @@ struct mem_size_stats { | |||
443 | unsigned long swap; | 452 | unsigned long swap; |
444 | unsigned long shared_hugetlb; | 453 | unsigned long shared_hugetlb; |
445 | unsigned long private_hugetlb; | 454 | unsigned long private_hugetlb; |
455 | unsigned long first_vma_start; | ||
446 | u64 pss; | 456 | u64 pss; |
457 | u64 pss_locked; | ||
447 | u64 swap_pss; | 458 | u64 swap_pss; |
448 | bool check_shmem_swap; | 459 | bool check_shmem_swap; |
449 | }; | 460 | }; |
@@ -652,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | |||
652 | [ilog2(VM_NORESERVE)] = "nr", | 663 | [ilog2(VM_NORESERVE)] = "nr", |
653 | [ilog2(VM_HUGETLB)] = "ht", | 664 | [ilog2(VM_HUGETLB)] = "ht", |
654 | [ilog2(VM_ARCH_1)] = "ar", | 665 | [ilog2(VM_ARCH_1)] = "ar", |
666 | [ilog2(VM_WIPEONFORK)] = "wf", | ||
655 | [ilog2(VM_DONTDUMP)] = "dd", | 667 | [ilog2(VM_DONTDUMP)] = "dd", |
656 | #ifdef CONFIG_MEM_SOFT_DIRTY | 668 | #ifdef CONFIG_MEM_SOFT_DIRTY |
657 | [ilog2(VM_SOFTDIRTY)] = "sd", | 669 | [ilog2(VM_SOFTDIRTY)] = "sd", |
@@ -719,18 +731,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) | |||
719 | 731 | ||
720 | static int show_smap(struct seq_file *m, void *v, int is_pid) | 732 | static int show_smap(struct seq_file *m, void *v, int is_pid) |
721 | { | 733 | { |
734 | struct proc_maps_private *priv = m->private; | ||
722 | struct vm_area_struct *vma = v; | 735 | struct vm_area_struct *vma = v; |
723 | struct mem_size_stats mss; | 736 | struct mem_size_stats mss_stack; |
737 | struct mem_size_stats *mss; | ||
724 | struct mm_walk smaps_walk = { | 738 | struct mm_walk smaps_walk = { |
725 | .pmd_entry = smaps_pte_range, | 739 | .pmd_entry = smaps_pte_range, |
726 | #ifdef CONFIG_HUGETLB_PAGE | 740 | #ifdef CONFIG_HUGETLB_PAGE |
727 | .hugetlb_entry = smaps_hugetlb_range, | 741 | .hugetlb_entry = smaps_hugetlb_range, |
728 | #endif | 742 | #endif |
729 | .mm = vma->vm_mm, | 743 | .mm = vma->vm_mm, |
730 | .private = &mss, | ||
731 | }; | 744 | }; |
745 | int ret = 0; | ||
746 | bool rollup_mode; | ||
747 | bool last_vma; | ||
748 | |||
749 | if (priv->rollup) { | ||
750 | rollup_mode = true; | ||
751 | mss = priv->rollup; | ||
752 | if (mss->first) { | ||
753 | mss->first_vma_start = vma->vm_start; | ||
754 | mss->first = false; | ||
755 | } | ||
756 | last_vma = !m_next_vma(priv, vma); | ||
757 | } else { | ||
758 | rollup_mode = false; | ||
759 | memset(&mss_stack, 0, sizeof(mss_stack)); | ||
760 | mss = &mss_stack; | ||
761 | } | ||
732 | 762 | ||
733 | memset(&mss, 0, sizeof mss); | 763 | smaps_walk.private = mss; |
734 | 764 | ||
735 | #ifdef CONFIG_SHMEM | 765 | #ifdef CONFIG_SHMEM |
736 | if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { | 766 | if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { |
@@ -748,9 +778,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
748 | 778 | ||
749 | if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || | 779 | if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || |
750 | !(vma->vm_flags & VM_WRITE)) { | 780 | !(vma->vm_flags & VM_WRITE)) { |
751 | mss.swap = shmem_swapped; | 781 | mss->swap = shmem_swapped; |
752 | } else { | 782 | } else { |
753 | mss.check_shmem_swap = true; | 783 | mss->check_shmem_swap = true; |
754 | smaps_walk.pte_hole = smaps_pte_hole; | 784 | smaps_walk.pte_hole = smaps_pte_hole; |
755 | } | 785 | } |
756 | } | 786 | } |
@@ -758,54 +788,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
758 | 788 | ||
759 | /* mmap_sem is held in m_start */ | 789 | /* mmap_sem is held in m_start */ |
760 | walk_page_vma(vma, &smaps_walk); | 790 | walk_page_vma(vma, &smaps_walk); |
791 | if (vma->vm_flags & VM_LOCKED) | ||
792 | mss->pss_locked += mss->pss; | ||
793 | |||
794 | if (!rollup_mode) { | ||
795 | show_map_vma(m, vma, is_pid); | ||
796 | } else if (last_vma) { | ||
797 | show_vma_header_prefix( | ||
798 | m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0); | ||
799 | seq_pad(m, ' '); | ||
800 | seq_puts(m, "[rollup]\n"); | ||
801 | } else { | ||
802 | ret = SEQ_SKIP; | ||
803 | } | ||
761 | 804 | ||
762 | show_map_vma(m, vma, is_pid); | 805 | if (!rollup_mode) |
763 | 806 | seq_printf(m, | |
764 | seq_printf(m, | 807 | "Size: %8lu kB\n" |
765 | "Size: %8lu kB\n" | 808 | "KernelPageSize: %8lu kB\n" |
766 | "Rss: %8lu kB\n" | 809 | "MMUPageSize: %8lu kB\n", |
767 | "Pss: %8lu kB\n" | 810 | (vma->vm_end - vma->vm_start) >> 10, |
768 | "Shared_Clean: %8lu kB\n" | 811 | vma_kernel_pagesize(vma) >> 10, |
769 | "Shared_Dirty: %8lu kB\n" | 812 | vma_mmu_pagesize(vma) >> 10); |
770 | "Private_Clean: %8lu kB\n" | 813 | |
771 | "Private_Dirty: %8lu kB\n" | 814 | |
772 | "Referenced: %8lu kB\n" | 815 | if (!rollup_mode || last_vma) |
773 | "Anonymous: %8lu kB\n" | 816 | seq_printf(m, |
774 | "LazyFree: %8lu kB\n" | 817 | "Rss: %8lu kB\n" |
775 | "AnonHugePages: %8lu kB\n" | 818 | "Pss: %8lu kB\n" |
776 | "ShmemPmdMapped: %8lu kB\n" | 819 | "Shared_Clean: %8lu kB\n" |
777 | "Shared_Hugetlb: %8lu kB\n" | 820 | "Shared_Dirty: %8lu kB\n" |
778 | "Private_Hugetlb: %7lu kB\n" | 821 | "Private_Clean: %8lu kB\n" |
779 | "Swap: %8lu kB\n" | 822 | "Private_Dirty: %8lu kB\n" |
780 | "SwapPss: %8lu kB\n" | 823 | "Referenced: %8lu kB\n" |
781 | "KernelPageSize: %8lu kB\n" | 824 | "Anonymous: %8lu kB\n" |
782 | "MMUPageSize: %8lu kB\n" | 825 | "LazyFree: %8lu kB\n" |
783 | "Locked: %8lu kB\n", | 826 | "AnonHugePages: %8lu kB\n" |
784 | (vma->vm_end - vma->vm_start) >> 10, | 827 | "ShmemPmdMapped: %8lu kB\n" |
785 | mss.resident >> 10, | 828 | "Shared_Hugetlb: %8lu kB\n" |
786 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), | 829 | "Private_Hugetlb: %7lu kB\n" |
787 | mss.shared_clean >> 10, | 830 | "Swap: %8lu kB\n" |
788 | mss.shared_dirty >> 10, | 831 | "SwapPss: %8lu kB\n" |
789 | mss.private_clean >> 10, | 832 | "Locked: %8lu kB\n", |
790 | mss.private_dirty >> 10, | 833 | mss->resident >> 10, |
791 | mss.referenced >> 10, | 834 | (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), |
792 | mss.anonymous >> 10, | 835 | mss->shared_clean >> 10, |
793 | mss.lazyfree >> 10, | 836 | mss->shared_dirty >> 10, |
794 | mss.anonymous_thp >> 10, | 837 | mss->private_clean >> 10, |
795 | mss.shmem_thp >> 10, | 838 | mss->private_dirty >> 10, |
796 | mss.shared_hugetlb >> 10, | 839 | mss->referenced >> 10, |
797 | mss.private_hugetlb >> 10, | 840 | mss->anonymous >> 10, |
798 | mss.swap >> 10, | 841 | mss->lazyfree >> 10, |
799 | (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), | 842 | mss->anonymous_thp >> 10, |
800 | vma_kernel_pagesize(vma) >> 10, | 843 | mss->shmem_thp >> 10, |
801 | vma_mmu_pagesize(vma) >> 10, | 844 | mss->shared_hugetlb >> 10, |
802 | (vma->vm_flags & VM_LOCKED) ? | 845 | mss->private_hugetlb >> 10, |
803 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); | 846 | mss->swap >> 10, |
804 | 847 | (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), | |
805 | arch_show_smap(m, vma); | 848 | (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); |
806 | show_smap_vma_flags(m, vma); | 849 | |
850 | if (!rollup_mode) { | ||
851 | arch_show_smap(m, vma); | ||
852 | show_smap_vma_flags(m, vma); | ||
853 | } | ||
807 | m_cache_vma(m, vma); | 854 | m_cache_vma(m, vma); |
808 | return 0; | 855 | return ret; |
809 | } | 856 | } |
810 | 857 | ||
811 | static int show_pid_smap(struct seq_file *m, void *v) | 858 | static int show_pid_smap(struct seq_file *m, void *v) |
@@ -837,6 +884,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file) | |||
837 | return do_maps_open(inode, file, &proc_pid_smaps_op); | 884 | return do_maps_open(inode, file, &proc_pid_smaps_op); |
838 | } | 885 | } |
839 | 886 | ||
887 | static int pid_smaps_rollup_open(struct inode *inode, struct file *file) | ||
888 | { | ||
889 | struct seq_file *seq; | ||
890 | struct proc_maps_private *priv; | ||
891 | int ret = do_maps_open(inode, file, &proc_pid_smaps_op); | ||
892 | |||
893 | if (ret < 0) | ||
894 | return ret; | ||
895 | seq = file->private_data; | ||
896 | priv = seq->private; | ||
897 | priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL); | ||
898 | if (!priv->rollup) { | ||
899 | proc_map_release(inode, file); | ||
900 | return -ENOMEM; | ||
901 | } | ||
902 | priv->rollup->first = true; | ||
903 | return 0; | ||
904 | } | ||
905 | |||
840 | static int tid_smaps_open(struct inode *inode, struct file *file) | 906 | static int tid_smaps_open(struct inode *inode, struct file *file) |
841 | { | 907 | { |
842 | return do_maps_open(inode, file, &proc_tid_smaps_op); | 908 | return do_maps_open(inode, file, &proc_tid_smaps_op); |
@@ -849,6 +915,13 @@ const struct file_operations proc_pid_smaps_operations = { | |||
849 | .release = proc_map_release, | 915 | .release = proc_map_release, |
850 | }; | 916 | }; |
851 | 917 | ||
918 | const struct file_operations proc_pid_smaps_rollup_operations = { | ||
919 | .open = pid_smaps_rollup_open, | ||
920 | .read = seq_read, | ||
921 | .llseek = seq_lseek, | ||
922 | .release = proc_map_release, | ||
923 | }; | ||
924 | |||
852 | const struct file_operations proc_tid_smaps_operations = { | 925 | const struct file_operations proc_tid_smaps_operations = { |
853 | .open = tid_smaps_open, | 926 | .open = tid_smaps_open, |
854 | .read = seq_read, | 927 | .read = seq_read, |
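For context on the interface wired up above: proc_pid_smaps_rollup_operations exposes a single pre-summed record, so a reader gets one "[rollup]" header line followed by the accumulated counters instead of one block per VMA. A minimal userspace sketch (not part of the patch; error handling trimmed, /proc/self is only an example path):

#include <stdio.h>

int main(void)
{
        /* One "[rollup]" record; fields are summed across VMAs by show_smap(). */
        FILE *f = fopen("/proc/self/smaps_rollup", "r");
        char line[256];

        if (!f) {
                perror("smaps_rollup");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* e.g. "Pss:      385 kB" */
        fclose(f);
        return 0;
}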
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2ef7ce75c062..3ac1f2387083 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c | |||
@@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, | |||
228 | if (!pages) | 228 | if (!pages) |
229 | goto out_free; | 229 | goto out_free; |
230 | 230 | ||
231 | nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); | 231 | nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); |
232 | if (nr != lpages) | 232 | if (nr != lpages) |
233 | goto out_free_pages; /* leave if some pages were missing */ | 233 | goto out_free_pages; /* leave if some pages were missing */ |
234 | 234 | ||
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, | |||
335 | goto out_put; | 335 | goto out_put; |
336 | 336 | ||
337 | mapping = f.file->f_mapping; | 337 | mapping = f.file->f_mapping; |
338 | if (!mapping) { | ||
339 | ret = -EINVAL; | ||
340 | goto out_put; | ||
341 | } | ||
342 | |||
343 | ret = 0; | 338 | ret = 0; |
344 | if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { | 339 | if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { |
345 | ret = file_fdatawait_range(f.file, offset, endbyte); | 340 | ret = file_fdatawait_range(f.file, offset, endbyte); |
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 886085b47c75..5419e7da82ba 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c | |||
@@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg) | |||
178 | 178 | ||
179 | static inline struct uffd_msg userfault_msg(unsigned long address, | 179 | static inline struct uffd_msg userfault_msg(unsigned long address, |
180 | unsigned int flags, | 180 | unsigned int flags, |
181 | unsigned long reason) | 181 | unsigned long reason, |
182 | unsigned int features) | ||
182 | { | 183 | { |
183 | struct uffd_msg msg; | 184 | struct uffd_msg msg; |
184 | msg_init(&msg); | 185 | msg_init(&msg); |
@@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address, | |||
202 | * write protect fault. | 203 | * write protect fault. |
203 | */ | 204 | */ |
204 | msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; | 205 | msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; |
206 | if (features & UFFD_FEATURE_THREAD_ID) | ||
207 | msg.arg.pagefault.feat.ptid = task_pid_vnr(current); | ||
205 | return msg; | 208 | return msg; |
206 | } | 209 | } |
207 | 210 | ||
@@ -370,6 +373,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) | |||
370 | VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); | 373 | VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); |
371 | VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); | 374 | VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); |
372 | 375 | ||
376 | if (ctx->features & UFFD_FEATURE_SIGBUS) | ||
377 | goto out; | ||
378 | |||
373 | /* | 379 | /* |
374 | * If it's already released don't get it. This avoids to loop | 380 | * If it's already released don't get it. This avoids to loop |
375 | * in __get_user_pages if userfaultfd_release waits on the | 381 | * in __get_user_pages if userfaultfd_release waits on the |
@@ -419,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) | |||
419 | 425 | ||
420 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); | 426 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); |
421 | uwq.wq.private = current; | 427 | uwq.wq.private = current; |
422 | uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); | 428 | uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, |
429 | ctx->features); | ||
423 | uwq.ctx = ctx; | 430 | uwq.ctx = ctx; |
424 | uwq.waken = false; | 431 | uwq.waken = false; |
425 | 432 | ||
@@ -1194,7 +1201,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, | |||
1194 | struct uffdio_register __user *user_uffdio_register; | 1201 | struct uffdio_register __user *user_uffdio_register; |
1195 | unsigned long vm_flags, new_flags; | 1202 | unsigned long vm_flags, new_flags; |
1196 | bool found; | 1203 | bool found; |
1197 | bool non_anon_pages; | 1204 | bool basic_ioctls; |
1198 | unsigned long start, end, vma_end; | 1205 | unsigned long start, end, vma_end; |
1199 | 1206 | ||
1200 | user_uffdio_register = (struct uffdio_register __user *) arg; | 1207 | user_uffdio_register = (struct uffdio_register __user *) arg; |
@@ -1260,7 +1267,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, | |||
1260 | * Search for not compatible vmas. | 1267 | * Search for not compatible vmas. |
1261 | */ | 1268 | */ |
1262 | found = false; | 1269 | found = false; |
1263 | non_anon_pages = false; | 1270 | basic_ioctls = false; |
1264 | for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { | 1271 | for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { |
1265 | cond_resched(); | 1272 | cond_resched(); |
1266 | 1273 | ||
@@ -1299,8 +1306,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, | |||
1299 | /* | 1306 | /* |
1300 | * Note vmas containing huge pages | 1307 | * Note vmas containing huge pages |
1301 | */ | 1308 | */ |
1302 | if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) | 1309 | if (is_vm_hugetlb_page(cur)) |
1303 | non_anon_pages = true; | 1310 | basic_ioctls = true; |
1304 | 1311 | ||
1305 | found = true; | 1312 | found = true; |
1306 | } | 1313 | } |
@@ -1371,7 +1378,7 @@ out_unlock: | |||
1371 | * userland which ioctls methods are guaranteed to | 1378 | * userland which ioctls methods are guaranteed to |
1372 | * succeed on this range. | 1379 | * succeed on this range. |
1373 | */ | 1380 | */ |
1374 | if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : | 1381 | if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : |
1375 | UFFD_API_RANGE_IOCTLS, | 1382 | UFFD_API_RANGE_IOCTLS, |
1376 | &user_uffdio_register->ioctls)) | 1383 | &user_uffdio_register->ioctls)) |
1377 | ret = -EFAULT; | 1384 | ret = -EFAULT; |
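The two new feature bits above are opt-in at UFFDIO_API time: UFFD_FEATURE_THREAD_ID makes userfault_msg() fill feat.ptid with the faulting thread's pid, and UFFD_FEATURE_SIGBUS makes handle_userfault() bail out early so the faulting task gets SIGBUS instead of blocking for a monitor. A hedged userspace sketch of the handshake (assumes a linux/userfaultfd.h that already defines the new bits; range registration and the message read loop are omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
        struct uffdio_api api = {
                .api = UFFD_API,
                /* Ask for thread ids in fault messages; use UFFD_FEATURE_SIGBUS
                 * instead if faults should raise a signal rather than a message. */
                .features = UFFD_FEATURE_THREAD_ID,
        };

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) == -1)
                return 1;
        /* Later, msg.arg.pagefault.feat.ptid carries task_pid_vnr() of the faulter. */
        close(uffd);
        return 0;
}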
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0debbc7e3f03..ec3e44fcf771 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -1101,7 +1101,7 @@ xfs_filemap_pfn_mkwrite( | |||
1101 | if (vmf->pgoff >= size) | 1101 | if (vmf->pgoff >= size) |
1102 | ret = VM_FAULT_SIGBUS; | 1102 | ret = VM_FAULT_SIGBUS; |
1103 | else if (IS_DAX(inode)) | 1103 | else if (IS_DAX(inode)) |
1104 | ret = dax_pfn_mkwrite(vmf); | 1104 | ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); |
1105 | xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); | 1105 | xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); |
1106 | sb_end_pagefault(inode->i_sb); | 1106 | sb_end_pagefault(inode->i_sb); |
1107 | return ret; | 1107 | return ret; |
diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b1cf4ba0902..1f0720de8990 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h | |||
@@ -38,7 +38,15 @@ | |||
38 | #define BIO_BUG_ON | 38 | #define BIO_BUG_ON |
39 | #endif | 39 | #endif |
40 | 40 | ||
41 | #ifdef CONFIG_THP_SWAP | ||
42 | #if HPAGE_PMD_NR > 256 | ||
43 | #define BIO_MAX_PAGES HPAGE_PMD_NR | ||
44 | #else | ||
41 | #define BIO_MAX_PAGES 256 | 45 | #define BIO_MAX_PAGES 256 |
46 | #endif | ||
47 | #else | ||
48 | #define BIO_MAX_PAGES 256 | ||
49 | #endif | ||
42 | 50 | ||
43 | #define bio_prio(bio) (bio)->bi_ioprio | 51 | #define bio_prio(bio) (bio)->bi_ioprio |
44 | #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) | 52 | #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) |
diff --git a/include/linux/dax.h b/include/linux/dax.h index df97b7af7e2c..eb0bff6f1eab 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h | |||
@@ -89,34 +89,6 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, | |||
89 | void dax_write_cache(struct dax_device *dax_dev, bool wc); | 89 | void dax_write_cache(struct dax_device *dax_dev, bool wc); |
90 | bool dax_write_cache_enabled(struct dax_device *dax_dev); | 90 | bool dax_write_cache_enabled(struct dax_device *dax_dev); |
91 | 91 | ||
92 | /* | ||
93 | * We use lowest available bit in exceptional entry for locking, one bit for | ||
94 | * the entry size (PMD) and two more to tell us if the entry is a huge zero | ||
95 | * page (HZP) or an empty entry that is just used for locking. In total four | ||
96 | * special bits. | ||
97 | * | ||
98 | * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and | ||
99 | * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem | ||
100 | * block allocation. | ||
101 | */ | ||
102 | #define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) | ||
103 | #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) | ||
104 | #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) | ||
105 | #define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) | ||
106 | #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) | ||
107 | |||
108 | static inline unsigned long dax_radix_sector(void *entry) | ||
109 | { | ||
110 | return (unsigned long)entry >> RADIX_DAX_SHIFT; | ||
111 | } | ||
112 | |||
113 | static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) | ||
114 | { | ||
115 | return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | | ||
116 | ((unsigned long)sector << RADIX_DAX_SHIFT) | | ||
117 | RADIX_DAX_ENTRY_LOCK); | ||
118 | } | ||
119 | |||
120 | ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, | 92 | ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, |
121 | const struct iomap_ops *ops); | 93 | const struct iomap_ops *ops); |
122 | int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, | 94 | int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, |
@@ -124,8 +96,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, | |||
124 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); | 96 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); |
125 | int dax_invalidate_mapping_entry_sync(struct address_space *mapping, | 97 | int dax_invalidate_mapping_entry_sync(struct address_space *mapping, |
126 | pgoff_t index); | 98 | pgoff_t index); |
127 | void dax_wake_mapping_entry_waiter(struct address_space *mapping, | ||
128 | pgoff_t index, void *entry, bool wake_all); | ||
129 | 99 | ||
130 | #ifdef CONFIG_FS_DAX | 100 | #ifdef CONFIG_FS_DAX |
131 | int __dax_zero_page_range(struct block_device *bdev, | 101 | int __dax_zero_page_range(struct block_device *bdev, |
@@ -140,21 +110,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev, | |||
140 | } | 110 | } |
141 | #endif | 111 | #endif |
142 | 112 | ||
143 | #ifdef CONFIG_FS_DAX_PMD | ||
144 | static inline unsigned int dax_radix_order(void *entry) | ||
145 | { | ||
146 | if ((unsigned long)entry & RADIX_DAX_PMD) | ||
147 | return PMD_SHIFT - PAGE_SHIFT; | ||
148 | return 0; | ||
149 | } | ||
150 | #else | ||
151 | static inline unsigned int dax_radix_order(void *entry) | ||
152 | { | ||
153 | return 0; | ||
154 | } | ||
155 | #endif | ||
156 | int dax_pfn_mkwrite(struct vm_fault *vmf); | ||
157 | |||
158 | static inline bool dax_mapping(struct address_space *mapping) | 113 | static inline bool dax_mapping(struct address_space *mapping) |
159 | { | 114 | { |
160 | return mapping->host && IS_DAX(mapping->host); | 115 | return mapping->host && IS_DAX(mapping->host); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b744a3456c5..c57002ae6520 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -1269,8 +1269,6 @@ extern void f_delown(struct file *filp); | |||
1269 | extern pid_t f_getown(struct file *filp); | 1269 | extern pid_t f_getown(struct file *filp); |
1270 | extern int send_sigurg(struct fown_struct *fown); | 1270 | extern int send_sigurg(struct fown_struct *fown); |
1271 | 1271 | ||
1272 | struct mm_struct; | ||
1273 | |||
1274 | /* | 1272 | /* |
1275 | * Umount options | 1273 | * Umount options |
1276 | */ | 1274 | */ |
diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 115bb81912cc..f4ff47d4a893 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h | |||
@@ -143,15 +143,6 @@ struct fscache_cookie_def { | |||
143 | void (*mark_page_cached)(void *cookie_netfs_data, | 143 | void (*mark_page_cached)(void *cookie_netfs_data, |
144 | struct address_space *mapping, | 144 | struct address_space *mapping, |
145 | struct page *page); | 145 | struct page *page); |
146 | |||
147 | /* indicate the cookie is no longer cached | ||
148 | * - this function is called when the backing store currently caching | ||
149 | * a cookie is removed | ||
150 | * - the netfs should use this to clean up any markers indicating | ||
151 | * cached pages | ||
152 | * - this is mandatory for any object that may have data | ||
153 | */ | ||
154 | void (*now_uncached)(void *cookie_netfs_data); | ||
155 | }; | 146 | }; |
156 | 147 | ||
157 | /* | 148 | /* |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b15a4bcfa77..69966c461d1c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -488,8 +488,9 @@ struct mem_cgroup *lock_page_memcg(struct page *page); | |||
488 | void __unlock_page_memcg(struct mem_cgroup *memcg); | 488 | void __unlock_page_memcg(struct mem_cgroup *memcg); |
489 | void unlock_page_memcg(struct page *page); | 489 | void unlock_page_memcg(struct page *page); |
490 | 490 | ||
491 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
491 | static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, | 492 | static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, |
492 | enum memcg_stat_item idx) | 493 | int idx) |
493 | { | 494 | { |
494 | long val = 0; | 495 | long val = 0; |
495 | int cpu; | 496 | int cpu; |
@@ -503,15 +504,17 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, | |||
503 | return val; | 504 | return val; |
504 | } | 505 | } |
505 | 506 | ||
507 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
506 | static inline void __mod_memcg_state(struct mem_cgroup *memcg, | 508 | static inline void __mod_memcg_state(struct mem_cgroup *memcg, |
507 | enum memcg_stat_item idx, int val) | 509 | int idx, int val) |
508 | { | 510 | { |
509 | if (!mem_cgroup_disabled()) | 511 | if (!mem_cgroup_disabled()) |
510 | __this_cpu_add(memcg->stat->count[idx], val); | 512 | __this_cpu_add(memcg->stat->count[idx], val); |
511 | } | 513 | } |
512 | 514 | ||
515 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
513 | static inline void mod_memcg_state(struct mem_cgroup *memcg, | 516 | static inline void mod_memcg_state(struct mem_cgroup *memcg, |
514 | enum memcg_stat_item idx, int val) | 517 | int idx, int val) |
515 | { | 518 | { |
516 | if (!mem_cgroup_disabled()) | 519 | if (!mem_cgroup_disabled()) |
517 | this_cpu_add(memcg->stat->count[idx], val); | 520 | this_cpu_add(memcg->stat->count[idx], val); |
@@ -535,14 +538,14 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, | |||
535 | * Kernel pages are an exception to this, since they'll never move. | 538 | * Kernel pages are an exception to this, since they'll never move. |
536 | */ | 539 | */ |
537 | static inline void __mod_memcg_page_state(struct page *page, | 540 | static inline void __mod_memcg_page_state(struct page *page, |
538 | enum memcg_stat_item idx, int val) | 541 | int idx, int val) |
539 | { | 542 | { |
540 | if (page->mem_cgroup) | 543 | if (page->mem_cgroup) |
541 | __mod_memcg_state(page->mem_cgroup, idx, val); | 544 | __mod_memcg_state(page->mem_cgroup, idx, val); |
542 | } | 545 | } |
543 | 546 | ||
544 | static inline void mod_memcg_page_state(struct page *page, | 547 | static inline void mod_memcg_page_state(struct page *page, |
545 | enum memcg_stat_item idx, int val) | 548 | int idx, int val) |
546 | { | 549 | { |
547 | if (page->mem_cgroup) | 550 | if (page->mem_cgroup) |
548 | mod_memcg_state(page->mem_cgroup, idx, val); | 551 | mod_memcg_state(page->mem_cgroup, idx, val); |
@@ -632,8 +635,9 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, | |||
632 | this_cpu_add(memcg->stat->events[idx], count); | 635 | this_cpu_add(memcg->stat->events[idx], count); |
633 | } | 636 | } |
634 | 637 | ||
638 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
635 | static inline void count_memcg_page_event(struct page *page, | 639 | static inline void count_memcg_page_event(struct page *page, |
636 | enum memcg_stat_item idx) | 640 | int idx) |
637 | { | 641 | { |
638 | if (page->mem_cgroup) | 642 | if (page->mem_cgroup) |
639 | count_memcg_events(page->mem_cgroup, idx, 1); | 643 | count_memcg_events(page->mem_cgroup, idx, 1); |
@@ -846,31 +850,31 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) | |||
846 | } | 850 | } |
847 | 851 | ||
848 | static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, | 852 | static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, |
849 | enum memcg_stat_item idx) | 853 | int idx) |
850 | { | 854 | { |
851 | return 0; | 855 | return 0; |
852 | } | 856 | } |
853 | 857 | ||
854 | static inline void __mod_memcg_state(struct mem_cgroup *memcg, | 858 | static inline void __mod_memcg_state(struct mem_cgroup *memcg, |
855 | enum memcg_stat_item idx, | 859 | int idx, |
856 | int nr) | 860 | int nr) |
857 | { | 861 | { |
858 | } | 862 | } |
859 | 863 | ||
860 | static inline void mod_memcg_state(struct mem_cgroup *memcg, | 864 | static inline void mod_memcg_state(struct mem_cgroup *memcg, |
861 | enum memcg_stat_item idx, | 865 | int idx, |
862 | int nr) | 866 | int nr) |
863 | { | 867 | { |
864 | } | 868 | } |
865 | 869 | ||
866 | static inline void __mod_memcg_page_state(struct page *page, | 870 | static inline void __mod_memcg_page_state(struct page *page, |
867 | enum memcg_stat_item idx, | 871 | int idx, |
868 | int nr) | 872 | int nr) |
869 | { | 873 | { |
870 | } | 874 | } |
871 | 875 | ||
872 | static inline void mod_memcg_page_state(struct page *page, | 876 | static inline void mod_memcg_page_state(struct page *page, |
873 | enum memcg_stat_item idx, | 877 | int idx, |
874 | int nr) | 878 | int nr) |
875 | { | 879 | { |
876 | } | 880 | } |
@@ -924,7 +928,7 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, | |||
924 | } | 928 | } |
925 | 929 | ||
926 | static inline void count_memcg_page_event(struct page *page, | 930 | static inline void count_memcg_page_event(struct page *page, |
927 | enum memcg_stat_item idx) | 931 | int idx) |
928 | { | 932 | { |
929 | } | 933 | } |
930 | 934 | ||
@@ -934,26 +938,30 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) | |||
934 | } | 938 | } |
935 | #endif /* CONFIG_MEMCG */ | 939 | #endif /* CONFIG_MEMCG */ |
936 | 940 | ||
941 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
937 | static inline void __inc_memcg_state(struct mem_cgroup *memcg, | 942 | static inline void __inc_memcg_state(struct mem_cgroup *memcg, |
938 | enum memcg_stat_item idx) | 943 | int idx) |
939 | { | 944 | { |
940 | __mod_memcg_state(memcg, idx, 1); | 945 | __mod_memcg_state(memcg, idx, 1); |
941 | } | 946 | } |
942 | 947 | ||
948 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
943 | static inline void __dec_memcg_state(struct mem_cgroup *memcg, | 949 | static inline void __dec_memcg_state(struct mem_cgroup *memcg, |
944 | enum memcg_stat_item idx) | 950 | int idx) |
945 | { | 951 | { |
946 | __mod_memcg_state(memcg, idx, -1); | 952 | __mod_memcg_state(memcg, idx, -1); |
947 | } | 953 | } |
948 | 954 | ||
955 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
949 | static inline void __inc_memcg_page_state(struct page *page, | 956 | static inline void __inc_memcg_page_state(struct page *page, |
950 | enum memcg_stat_item idx) | 957 | int idx) |
951 | { | 958 | { |
952 | __mod_memcg_page_state(page, idx, 1); | 959 | __mod_memcg_page_state(page, idx, 1); |
953 | } | 960 | } |
954 | 961 | ||
962 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
955 | static inline void __dec_memcg_page_state(struct page *page, | 963 | static inline void __dec_memcg_page_state(struct page *page, |
956 | enum memcg_stat_item idx) | 964 | int idx) |
957 | { | 965 | { |
958 | __mod_memcg_page_state(page, idx, -1); | 966 | __mod_memcg_page_state(page, idx, -1); |
959 | } | 967 | } |
@@ -982,26 +990,30 @@ static inline void __dec_lruvec_page_state(struct page *page, | |||
982 | __mod_lruvec_page_state(page, idx, -1); | 990 | __mod_lruvec_page_state(page, idx, -1); |
983 | } | 991 | } |
984 | 992 | ||
993 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
985 | static inline void inc_memcg_state(struct mem_cgroup *memcg, | 994 | static inline void inc_memcg_state(struct mem_cgroup *memcg, |
986 | enum memcg_stat_item idx) | 995 | int idx) |
987 | { | 996 | { |
988 | mod_memcg_state(memcg, idx, 1); | 997 | mod_memcg_state(memcg, idx, 1); |
989 | } | 998 | } |
990 | 999 | ||
1000 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
991 | static inline void dec_memcg_state(struct mem_cgroup *memcg, | 1001 | static inline void dec_memcg_state(struct mem_cgroup *memcg, |
992 | enum memcg_stat_item idx) | 1002 | int idx) |
993 | { | 1003 | { |
994 | mod_memcg_state(memcg, idx, -1); | 1004 | mod_memcg_state(memcg, idx, -1); |
995 | } | 1005 | } |
996 | 1006 | ||
1007 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
997 | static inline void inc_memcg_page_state(struct page *page, | 1008 | static inline void inc_memcg_page_state(struct page *page, |
998 | enum memcg_stat_item idx) | 1009 | int idx) |
999 | { | 1010 | { |
1000 | mod_memcg_page_state(page, idx, 1); | 1011 | mod_memcg_page_state(page, idx, 1); |
1001 | } | 1012 | } |
1002 | 1013 | ||
1014 | /* idx can be of type enum memcg_stat_item or node_stat_item */ | ||
1003 | static inline void dec_memcg_page_state(struct page *page, | 1015 | static inline void dec_memcg_page_state(struct page *page, |
1004 | enum memcg_stat_item idx) | 1016 | int idx) |
1005 | { | 1017 | { |
1006 | mod_memcg_page_state(page, idx, -1); | 1018 | mod_memcg_page_state(page, idx, -1); |
1007 | } | 1019 | } |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c8a5056a5ae0..5e6e4cc36ff4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -319,6 +319,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, | |||
319 | unsigned long pnum); | 319 | unsigned long pnum); |
320 | extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, | 320 | extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, |
321 | int online_type); | 321 | int online_type); |
322 | extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, | 322 | extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, |
323 | unsigned long nr_pages); | 323 | unsigned long nr_pages); |
324 | #endif /* __LINUX_MEMORY_HOTPLUG_H */ | 324 | #endif /* __LINUX_MEMORY_HOTPLUG_H */ |
diff --git a/include/linux/mm.h b/include/linux/mm.h index c1f6c95f3496..39db8e54c5d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp); | |||
189 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ | 189 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ |
190 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ | 190 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ |
191 | #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ | 191 | #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ |
192 | #define VM_ARCH_2 0x02000000 | 192 | #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ |
193 | #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ | 193 | #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ |
194 | 194 | ||
195 | #ifdef CONFIG_MEM_SOFT_DIRTY | 195 | #ifdef CONFIG_MEM_SOFT_DIRTY |
@@ -208,10 +208,12 @@ extern unsigned int kobjsize(const void *objp); | |||
208 | #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ | 208 | #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ |
209 | #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ | 209 | #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ |
210 | #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ | 210 | #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ |
211 | #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ | ||
211 | #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) | 212 | #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) |
212 | #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) | 213 | #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) |
213 | #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) | 214 | #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) |
214 | #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) | 215 | #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) |
216 | #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) | ||
215 | #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ | 217 | #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ |
216 | 218 | ||
217 | #if defined(CONFIG_X86) | 219 | #if defined(CONFIG_X86) |
@@ -235,9 +237,11 @@ extern unsigned int kobjsize(const void *objp); | |||
235 | # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ | 237 | # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ |
236 | #endif | 238 | #endif |
237 | 239 | ||
238 | #if defined(CONFIG_X86) | 240 | #if defined(CONFIG_X86_INTEL_MPX) |
239 | /* MPX specific bounds table or bounds directory */ | 241 | /* MPX specific bounds table or bounds directory */ |
240 | # define VM_MPX VM_ARCH_2 | 242 | # define VM_MPX VM_HIGH_ARCH_BIT_4 |
243 | #else | ||
244 | # define VM_MPX VM_NONE | ||
241 | #endif | 245 | #endif |
242 | 246 | ||
243 | #ifndef VM_GROWSUP | 247 | #ifndef VM_GROWSUP |
@@ -2294,6 +2298,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, | |||
2294 | unsigned long pfn, pgprot_t pgprot); | 2298 | unsigned long pfn, pgprot_t pgprot); |
2295 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 2299 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
2296 | pfn_t pfn); | 2300 | pfn_t pfn); |
2301 | int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, | ||
2302 | pfn_t pfn); | ||
2297 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); | 2303 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); |
2298 | 2304 | ||
2299 | 2305 | ||
@@ -2506,7 +2512,7 @@ enum mf_action_page_type { | |||
2506 | 2512 | ||
2507 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) | 2513 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) |
2508 | extern void clear_huge_page(struct page *page, | 2514 | extern void clear_huge_page(struct page *page, |
2509 | unsigned long addr, | 2515 | unsigned long addr_hint, |
2510 | unsigned int pages_per_huge_page); | 2516 | unsigned int pages_per_huge_page); |
2511 | extern void copy_user_huge_page(struct page *dst, struct page *src, | 2517 | extern void copy_user_huge_page(struct page *dst, struct page *src, |
2512 | unsigned long addr, struct vm_area_struct *vma, | 2518 | unsigned long addr, struct vm_area_struct *vma, |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 57378c7cb5f8..f45ad815b7d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -335,6 +335,7 @@ struct vm_area_struct { | |||
335 | struct file * vm_file; /* File we map to (can be NULL). */ | 335 | struct file * vm_file; /* File we map to (can be NULL). */ |
336 | void * vm_private_data; /* was vm_pte (shared mem) */ | 336 | void * vm_private_data; /* was vm_pte (shared mem) */ |
337 | 337 | ||
338 | atomic_long_t swap_readahead_info; | ||
338 | #ifndef CONFIG_MMU | 339 | #ifndef CONFIG_MMU |
339 | struct vm_region *vm_region; /* NOMMU mapping region */ | 340 | struct vm_region *vm_region; /* NOMMU mapping region */ |
340 | #endif | 341 | #endif |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc14b8b3f6ce..e7e92c8f4883 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -770,8 +770,7 @@ static inline bool is_dev_zone(const struct zone *zone) | |||
770 | 770 | ||
771 | #include <linux/memory_hotplug.h> | 771 | #include <linux/memory_hotplug.h> |
772 | 772 | ||
773 | extern struct mutex zonelists_mutex; | 773 | void build_all_zonelists(pg_data_t *pgdat); |
774 | void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); | ||
775 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); | 774 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); |
776 | bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, | 775 | bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
777 | int classzone_idx, unsigned int alloc_flags, | 776 | int classzone_idx, unsigned int alloc_flags, |
@@ -896,7 +895,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, | |||
896 | extern int numa_zonelist_order_handler(struct ctl_table *, int, | 895 | extern int numa_zonelist_order_handler(struct ctl_table *, int, |
897 | void __user *, size_t *, loff_t *); | 896 | void __user *, size_t *, loff_t *); |
898 | extern char numa_zonelist_order[]; | 897 | extern char numa_zonelist_order[]; |
899 | #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ | 898 | #define NUMA_ZONELIST_ORDER_LEN 16 |
900 | 899 | ||
901 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 900 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
902 | 901 | ||
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d33e3280c8ad..ba2d470d2d0a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) | |||
303 | * Only test-and-set exist for PG_writeback. The unconditional operators are | 303 | * Only test-and-set exist for PG_writeback. The unconditional operators are |
304 | * risky: they bypass page accounting. | 304 | * risky: they bypass page accounting. |
305 | */ | 305 | */ |
306 | TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) | 306 | TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) |
307 | TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) | 307 | TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) |
308 | PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) | 308 | PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) |
309 | 309 | ||
310 | /* PG_readahead is only used for reads; PG_reclaim is only for writes */ | 310 | /* PG_readahead is only used for reads; PG_reclaim is only for writes */ |
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 79b36f57c3ba..5bbd6780f205 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -353,8 +353,16 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); | |||
353 | unsigned find_get_entries(struct address_space *mapping, pgoff_t start, | 353 | unsigned find_get_entries(struct address_space *mapping, pgoff_t start, |
354 | unsigned int nr_entries, struct page **entries, | 354 | unsigned int nr_entries, struct page **entries, |
355 | pgoff_t *indices); | 355 | pgoff_t *indices); |
356 | unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | 356 | unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, |
357 | unsigned int nr_pages, struct page **pages); | 357 | pgoff_t end, unsigned int nr_pages, |
358 | struct page **pages); | ||
359 | static inline unsigned find_get_pages(struct address_space *mapping, | ||
360 | pgoff_t *start, unsigned int nr_pages, | ||
361 | struct page **pages) | ||
362 | { | ||
363 | return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages, | ||
364 | pages); | ||
365 | } | ||
358 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, | 366 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, |
359 | unsigned int nr_pages, struct page **pages); | 367 | unsigned int nr_pages, struct page **pages); |
360 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | 368 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, |
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index b45d391b4540..4dcd5506f1ed 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h | |||
@@ -27,8 +27,16 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, | |||
27 | pgoff_t start, unsigned nr_entries, | 27 | pgoff_t start, unsigned nr_entries, |
28 | pgoff_t *indices); | 28 | pgoff_t *indices); |
29 | void pagevec_remove_exceptionals(struct pagevec *pvec); | 29 | void pagevec_remove_exceptionals(struct pagevec *pvec); |
30 | unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, | 30 | unsigned pagevec_lookup_range(struct pagevec *pvec, |
31 | pgoff_t start, unsigned nr_pages); | 31 | struct address_space *mapping, |
32 | pgoff_t *start, pgoff_t end); | ||
33 | static inline unsigned pagevec_lookup(struct pagevec *pvec, | ||
34 | struct address_space *mapping, | ||
35 | pgoff_t *start) | ||
36 | { | ||
37 | return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); | ||
38 | } | ||
39 | |||
32 | unsigned pagevec_lookup_tag(struct pagevec *pvec, | 40 | unsigned pagevec_lookup_tag(struct pagevec *pvec, |
33 | struct address_space *mapping, pgoff_t *index, int tag, | 41 | struct address_space *mapping, pgoff_t *index, int tag, |
34 | unsigned nr_pages); | 42 | unsigned nr_pages); |
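Both find_get_pages() and pagevec_lookup() now take the start index by pointer and are thin wrappers around range-limited lookups: the callee advances *start past the last page it returned, so iteration loops no longer recompute the next index from the returned pages. A hypothetical kernel-side caller (not taken from the patch) using the new pagevec_lookup_range():

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

/* Walk every page cached in [0, end] of a mapping. */
static void walk_mapping_range(struct address_space *mapping, pgoff_t end)
{
        struct pagevec pvec;
        pgoff_t index = 0;
        int i;

        pagevec_init(&pvec, 0);
        while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        /* pvec.pages[i] is the next cached page in order */
                }
                pagevec_release(&pvec);
                cond_resched();
                /* index already points past the last page found above */
        }
}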
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2b0a281f9d26..3a19c253bdb1 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h | |||
@@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm) | |||
84 | 84 | ||
85 | /* mmput gets rid of the mappings and all user-space */ | 85 | /* mmput gets rid of the mappings and all user-space */ |
86 | extern void mmput(struct mm_struct *); | 86 | extern void mmput(struct mm_struct *); |
87 | #ifdef CONFIG_MMU | ||
88 | /* same as above but performs the slow path from the async context. Can | ||
89 | * be called from the atomic context as well | ||
90 | */ | ||
91 | extern void mmput_async(struct mm_struct *); | ||
92 | #endif | ||
93 | 87 | ||
94 | /* Grab a reference to a task's mm, if it is not already going away */ | 88 | /* Grab a reference to a task's mm, if it is not already going away */ |
95 | extern struct mm_struct *get_task_mm(struct task_struct *task); | 89 | extern struct mm_struct *get_task_mm(struct task_struct *task); |
diff --git a/include/linux/shm.h b/include/linux/shm.h index 0fb7061ec54c..21a5e6c43385 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h | |||
@@ -27,23 +27,6 @@ struct shmid_kernel /* private to the kernel */ | |||
27 | /* shm_mode upper byte flags */ | 27 | /* shm_mode upper byte flags */ |
28 | #define SHM_DEST 01000 /* segment will be destroyed on last detach */ | 28 | #define SHM_DEST 01000 /* segment will be destroyed on last detach */ |
29 | #define SHM_LOCKED 02000 /* segment will not be swapped */ | 29 | #define SHM_LOCKED 02000 /* segment will not be swapped */ |
30 | #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ | ||
31 | #define SHM_NORESERVE 010000 /* don't check for reservations */ | ||
32 | |||
33 | /* Bits [26:31] are reserved */ | ||
34 | |||
35 | /* | ||
36 | * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
37 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
38 | * spaces. | ||
39 | * | ||
40 | * Assume these are all power of twos. | ||
41 | * When 0 use the default page size. | ||
42 | */ | ||
43 | #define SHM_HUGE_SHIFT 26 | ||
44 | #define SHM_HUGE_MASK 0x3f | ||
45 | #define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) | ||
46 | #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) | ||
47 | 30 | ||
48 | #ifdef CONFIG_SYSVIPC | 31 | #ifdef CONFIG_SYSVIPC |
49 | struct sysv_shm { | 32 | struct sysv_shm { |
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a7d6bd2a918f..b6c3540e07bc 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h | |||
@@ -137,9 +137,15 @@ extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, | |||
137 | unsigned long dst_addr, | 137 | unsigned long dst_addr, |
138 | unsigned long src_addr, | 138 | unsigned long src_addr, |
139 | struct page **pagep); | 139 | struct page **pagep); |
140 | extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, | ||
141 | pmd_t *dst_pmd, | ||
142 | struct vm_area_struct *dst_vma, | ||
143 | unsigned long dst_addr); | ||
140 | #else | 144 | #else |
141 | #define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ | 145 | #define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ |
142 | src_addr, pagep) ({ BUG(); 0; }) | 146 | src_addr, pagep) ({ BUG(); 0; }) |
147 | #define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ | ||
148 | dst_addr) ({ BUG(); 0; }) | ||
143 | #endif | 149 | #endif |
144 | 150 | ||
145 | #endif | 151 | #endif |
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 4fcacd915d45..51d189615bda 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h | |||
@@ -18,6 +18,13 @@ struct shrink_control { | |||
18 | */ | 18 | */ |
19 | unsigned long nr_to_scan; | 19 | unsigned long nr_to_scan; |
20 | 20 | ||
21 | /* | ||
22 | * How many objects did scan_objects process? | ||
23 | * This defaults to nr_to_scan before every call, but the callee | ||
24 | * should track its actual progress. | ||
25 | */ | ||
26 | unsigned long nr_scanned; | ||
27 | |||
21 | /* current node being shrunk (for NUMA aware shrinkers) */ | 28 | /* current node being shrunk (for NUMA aware shrinkers) */ |
22 | int nid; | 29 | int nid; |
23 | 30 | ||
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index cc0faf3a90be..0783b622311e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h | |||
@@ -115,6 +115,10 @@ struct kmem_cache { | |||
115 | #endif | 115 | #endif |
116 | #endif | 116 | #endif |
117 | 117 | ||
118 | #ifdef CONFIG_SLAB_FREELIST_HARDENED | ||
119 | unsigned long random; | ||
120 | #endif | ||
121 | |||
118 | #ifdef CONFIG_NUMA | 122 | #ifdef CONFIG_NUMA |
119 | /* | 123 | /* |
120 | * Defragmentation by allocating from a remote node. | 124 | * Defragmentation by allocating from a remote node. |
diff --git a/include/linux/swap.h b/include/linux/swap.h index d83d28e53e62..8bf3487fb204 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -188,6 +188,7 @@ struct swap_cluster_info { | |||
188 | }; | 188 | }; |
189 | #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ | 189 | #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ |
190 | #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ | 190 | #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ |
191 | #define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */ | ||
191 | 192 | ||
192 | /* | 193 | /* |
193 | * We assign a cluster to each CPU, so each CPU can allocate swap entry from | 194 | * We assign a cluster to each CPU, so each CPU can allocate swap entry from |
@@ -211,7 +212,7 @@ struct swap_info_struct { | |||
211 | unsigned long flags; /* SWP_USED etc: see above */ | 212 | unsigned long flags; /* SWP_USED etc: see above */ |
212 | signed short prio; /* swap priority of this type */ | 213 | signed short prio; /* swap priority of this type */ |
213 | struct plist_node list; /* entry in swap_active_head */ | 214 | struct plist_node list; /* entry in swap_active_head */ |
214 | struct plist_node avail_list; /* entry in swap_avail_head */ | 215 | struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */ |
215 | signed char type; /* strange name for an index */ | 216 | signed char type; /* strange name for an index */ |
216 | unsigned int max; /* extent of the swap_map */ | 217 | unsigned int max; /* extent of the swap_map */ |
217 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ | 218 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ |
@@ -250,6 +251,25 @@ struct swap_info_struct { | |||
250 | struct swap_cluster_list discard_clusters; /* discard clusters list */ | 251 | struct swap_cluster_list discard_clusters; /* discard clusters list */ |
251 | }; | 252 | }; |
252 | 253 | ||
254 | #ifdef CONFIG_64BIT | ||
255 | #define SWAP_RA_ORDER_CEILING 5 | ||
256 | #else | ||
257 | /* Avoid stack overflow, because we need to save part of page table */ | ||
258 | #define SWAP_RA_ORDER_CEILING 3 | ||
259 | #define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING) | ||
260 | #endif | ||
261 | |||
262 | struct vma_swap_readahead { | ||
263 | unsigned short win; | ||
264 | unsigned short offset; | ||
265 | unsigned short nr_pte; | ||
266 | #ifdef CONFIG_64BIT | ||
267 | pte_t *ptes; | ||
268 | #else | ||
269 | pte_t ptes[SWAP_RA_PTE_CACHE_SIZE]; | ||
270 | #endif | ||
271 | }; | ||
272 | |||
253 | /* linux/mm/workingset.c */ | 273 | /* linux/mm/workingset.c */ |
254 | void *workingset_eviction(struct address_space *mapping, struct page *page); | 274 | void *workingset_eviction(struct address_space *mapping, struct page *page); |
255 | bool workingset_refault(void *shadow); | 275 | bool workingset_refault(void *shadow); |
@@ -262,8 +282,8 @@ extern unsigned long totalreserve_pages; | |||
262 | extern unsigned long nr_free_buffer_pages(void); | 282 | extern unsigned long nr_free_buffer_pages(void); |
263 | extern unsigned long nr_free_pagecache_pages(void); | 283 | extern unsigned long nr_free_pagecache_pages(void); |
264 | 284 | ||
265 | /* Definition of global_page_state not available yet */ | 285 | /* Definition of global_zone_page_state not available yet */ |
266 | #define nr_free_pages() global_page_state(NR_FREE_PAGES) | 286 | #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) |
267 | 287 | ||
268 | 288 | ||
269 | /* linux/mm/swap.c */ | 289 | /* linux/mm/swap.c */ |
@@ -349,6 +369,7 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, | |||
349 | #define SWAP_ADDRESS_SPACE_SHIFT 14 | 369 | #define SWAP_ADDRESS_SPACE_SHIFT 14 |
350 | #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) | 370 | #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) |
351 | extern struct address_space *swapper_spaces[]; | 371 | extern struct address_space *swapper_spaces[]; |
372 | extern bool swap_vma_readahead; | ||
352 | #define swap_address_space(entry) \ | 373 | #define swap_address_space(entry) \ |
353 | (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ | 374 | (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ |
354 | >> SWAP_ADDRESS_SPACE_SHIFT]) | 375 | >> SWAP_ADDRESS_SPACE_SHIFT]) |
@@ -361,7 +382,9 @@ extern void __delete_from_swap_cache(struct page *); | |||
361 | extern void delete_from_swap_cache(struct page *); | 382 | extern void delete_from_swap_cache(struct page *); |
362 | extern void free_page_and_swap_cache(struct page *); | 383 | extern void free_page_and_swap_cache(struct page *); |
363 | extern void free_pages_and_swap_cache(struct page **, int); | 384 | extern void free_pages_and_swap_cache(struct page **, int); |
364 | extern struct page *lookup_swap_cache(swp_entry_t); | 385 | extern struct page *lookup_swap_cache(swp_entry_t entry, |
386 | struct vm_area_struct *vma, | ||
387 | unsigned long addr); | ||
365 | extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, | 388 | extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, |
366 | struct vm_area_struct *vma, unsigned long addr, | 389 | struct vm_area_struct *vma, unsigned long addr, |
367 | bool do_poll); | 390 | bool do_poll); |
@@ -371,11 +394,23 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, | |||
371 | extern struct page *swapin_readahead(swp_entry_t, gfp_t, | 394 | extern struct page *swapin_readahead(swp_entry_t, gfp_t, |
372 | struct vm_area_struct *vma, unsigned long addr); | 395 | struct vm_area_struct *vma, unsigned long addr); |
373 | 396 | ||
397 | extern struct page *swap_readahead_detect(struct vm_fault *vmf, | ||
398 | struct vma_swap_readahead *swap_ra); | ||
399 | extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, | ||
400 | struct vm_fault *vmf, | ||
401 | struct vma_swap_readahead *swap_ra); | ||
402 | |||
374 | /* linux/mm/swapfile.c */ | 403 | /* linux/mm/swapfile.c */ |
375 | extern atomic_long_t nr_swap_pages; | 404 | extern atomic_long_t nr_swap_pages; |
376 | extern long total_swap_pages; | 405 | extern long total_swap_pages; |
406 | extern atomic_t nr_rotate_swap; | ||
377 | extern bool has_usable_swap(void); | 407 | extern bool has_usable_swap(void); |
378 | 408 | ||
409 | static inline bool swap_use_vma_readahead(void) | ||
410 | { | ||
411 | return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap); | ||
412 | } | ||
413 | |||
379 | /* Swap 50% full? Release swapcache more aggressively.. */ | 414 | /* Swap 50% full? Release swapcache more aggressively.. */ |
380 | static inline bool vm_swap_full(void) | 415 | static inline bool vm_swap_full(void) |
381 | { | 416 | { |
@@ -465,12 +500,32 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, | |||
465 | return NULL; | 500 | return NULL; |
466 | } | 501 | } |
467 | 502 | ||
503 | static inline bool swap_use_vma_readahead(void) | ||
504 | { | ||
505 | return false; | ||
506 | } | ||
507 | |||
508 | static inline struct page *swap_readahead_detect( | ||
509 | struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) | ||
510 | { | ||
511 | return NULL; | ||
512 | } | ||
513 | |||
514 | static inline struct page *do_swap_page_readahead( | ||
515 | swp_entry_t fentry, gfp_t gfp_mask, | ||
516 | struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) | ||
517 | { | ||
518 | return NULL; | ||
519 | } | ||
520 | |||
468 | static inline int swap_writepage(struct page *p, struct writeback_control *wbc) | 521 | static inline int swap_writepage(struct page *p, struct writeback_control *wbc) |
469 | { | 522 | { |
470 | return 0; | 523 | return 0; |
471 | } | 524 | } |
472 | 525 | ||
473 | static inline struct page *lookup_swap_cache(swp_entry_t swp) | 526 | static inline struct page *lookup_swap_cache(swp_entry_t swp, |
527 | struct vm_area_struct *vma, | ||
528 | unsigned long addr) | ||
474 | { | 529 | { |
475 | return NULL; | 530 | return NULL; |
476 | } | 531 | } |
@@ -509,8 +564,8 @@ static inline int swp_swapcount(swp_entry_t entry) | |||
509 | return 0; | 564 | return 0; |
510 | } | 565 | } |
511 | 566 | ||
512 | #define reuse_swap_page(page, total_mapcount) \ | 567 | #define reuse_swap_page(page, total_map_swapcount) \ |
513 | (page_trans_huge_mapcount(page, total_mapcount) == 1) | 568 | (page_trans_huge_mapcount(page, total_map_swapcount) == 1) |
514 | 569 | ||
515 | static inline int try_to_free_swap(struct page *page) | 570 | static inline int try_to_free_swap(struct page *page) |
516 | { | 571 | { |
@@ -526,6 +581,15 @@ static inline swp_entry_t get_swap_page(struct page *page) | |||
526 | 581 | ||
527 | #endif /* CONFIG_SWAP */ | 582 | #endif /* CONFIG_SWAP */ |
528 | 583 | ||
584 | #ifdef CONFIG_THP_SWAP | ||
585 | extern int split_swap_cluster(swp_entry_t entry); | ||
586 | #else | ||
587 | static inline int split_swap_cluster(swp_entry_t entry) | ||
588 | { | ||
589 | return 0; | ||
590 | } | ||
591 | #endif | ||
592 | |||
529 | #ifdef CONFIG_MEMCG | 593 | #ifdef CONFIG_MEMCG |
530 | static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) | 594 | static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) |
531 | { | 595 | { |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 37e8d31a4632..d77bc35278b0 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -85,6 +85,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
85 | #endif | 85 | #endif |
86 | THP_ZERO_PAGE_ALLOC, | 86 | THP_ZERO_PAGE_ALLOC, |
87 | THP_ZERO_PAGE_ALLOC_FAILED, | 87 | THP_ZERO_PAGE_ALLOC_FAILED, |
88 | THP_SWPOUT, | ||
89 | THP_SWPOUT_FALLBACK, | ||
88 | #endif | 90 | #endif |
89 | #ifdef CONFIG_MEMORY_BALLOON | 91 | #ifdef CONFIG_MEMORY_BALLOON |
90 | BALLOON_INFLATE, | 92 | BALLOON_INFLATE, |
@@ -104,6 +106,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
104 | VMACACHE_FIND_HITS, | 106 | VMACACHE_FIND_HITS, |
105 | VMACACHE_FULL_FLUSHES, | 107 | VMACACHE_FULL_FLUSHES, |
106 | #endif | 108 | #endif |
109 | #ifdef CONFIG_SWAP | ||
110 | SWAP_RA, | ||
111 | SWAP_RA_HIT, | ||
112 | #endif | ||
107 | NR_VM_EVENT_ITEMS | 113 | NR_VM_EVENT_ITEMS |
108 | }; | 114 | }; |
109 | 115 | ||
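The new SWAP_RA and SWAP_RA_HIT events surface in /proc/vmstat like every other vm_event_item. A small illustrative check, assuming the counters are exported under the usual lower-cased names ("swap_ra", "swap_ra_hit"):

```c
/* Hedged sketch: print the VMA swap-readahead counters, assuming they
 * appear in /proc/vmstat as "swap_ra" and "swap_ra_hit" (the usual
 * lower-casing of vm_event_item names). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "swap_ra", 7))	/* swap_ra, swap_ra_hit */
			fputs(line, stdout);
	fclose(f);
	return 0;
}
```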
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index b3d85f30d424..97e11ab573f0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -123,7 +123,7 @@ static inline void node_page_state_add(long x, struct pglist_data *pgdat, | |||
123 | atomic_long_add(x, &vm_node_stat[item]); | 123 | atomic_long_add(x, &vm_node_stat[item]); |
124 | } | 124 | } |
125 | 125 | ||
126 | static inline unsigned long global_page_state(enum zone_stat_item item) | 126 | static inline unsigned long global_zone_page_state(enum zone_stat_item item) |
127 | { | 127 | { |
128 | long x = atomic_long_read(&vm_zone_stat[item]); | 128 | long x = atomic_long_read(&vm_zone_stat[item]); |
129 | #ifdef CONFIG_SMP | 129 | #ifdef CONFIG_SMP |
@@ -199,7 +199,7 @@ extern unsigned long sum_zone_node_page_state(int node, | |||
199 | extern unsigned long node_page_state(struct pglist_data *pgdat, | 199 | extern unsigned long node_page_state(struct pglist_data *pgdat, |
200 | enum node_stat_item item); | 200 | enum node_stat_item item); |
201 | #else | 201 | #else |
202 | #define sum_zone_node_page_state(node, item) global_page_state(item) | 202 | #define sum_zone_node_page_state(node, item) global_zone_page_state(item) |
203 | #define node_page_state(node, item) global_node_page_state(item) | 203 | #define node_page_state(node, item) global_node_page_state(item) |
204 | #endif /* CONFIG_NUMA */ | 204 | #endif /* CONFIG_NUMA */ |
205 | 205 | ||
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 08bb3ed18dcc..fbc4a06f7310 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h | |||
@@ -190,8 +190,6 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ | |||
190 | 190 | ||
191 | DEFINE_PTE_FAULT_EVENT(dax_pte_fault); | 191 | DEFINE_PTE_FAULT_EVENT(dax_pte_fault); |
192 | DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); | 192 | DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); |
193 | DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); | ||
194 | DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); | ||
195 | DEFINE_PTE_FAULT_EVENT(dax_load_hole); | 193 | DEFINE_PTE_FAULT_EVENT(dax_load_hole); |
196 | 194 | ||
197 | TRACE_EVENT(dax_insert_mapping, | 195 | TRACE_EVENT(dax_insert_mapping, |
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 8e50d01c645f..4c2e4737d7bc 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h | |||
@@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) | |||
125 | #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } | 125 | #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } |
126 | #endif | 126 | #endif |
127 | 127 | ||
128 | #if defined(CONFIG_X86) | ||
129 | #define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" } | ||
130 | #else | ||
131 | #define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" } | ||
132 | #endif | ||
133 | |||
134 | #ifdef CONFIG_MEM_SOFT_DIRTY | 128 | #ifdef CONFIG_MEM_SOFT_DIRTY |
135 | #define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, | 129 | #define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, |
136 | #else | 130 | #else |
@@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) | |||
162 | {VM_NORESERVE, "noreserve" }, \ | 156 | {VM_NORESERVE, "noreserve" }, \ |
163 | {VM_HUGETLB, "hugetlb" }, \ | 157 | {VM_HUGETLB, "hugetlb" }, \ |
164 | __VM_ARCH_SPECIFIC_1 , \ | 158 | __VM_ARCH_SPECIFIC_1 , \ |
165 | __VM_ARCH_SPECIFIC_2 , \ | 159 | {VM_WIPEONFORK, "wipeonfork" }, \ |
166 | {VM_DONTDUMP, "dontdump" }, \ | 160 | {VM_DONTDUMP, "dontdump" }, \ |
167 | IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ | 161 | IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ |
168 | {VM_MIXEDMAP, "mixedmap" }, \ | 162 | {VM_MIXEDMAP, "mixedmap" }, \ |
diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h new file mode 100644 index 000000000000..e4732d3c2998 --- /dev/null +++ b/include/uapi/asm-generic/hugetlb_encode.h | |||
@@ -0,0 +1,34 @@ | |||
1 | #ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_ | ||
2 | #define _ASM_GENERIC_HUGETLB_ENCODE_H_ | ||
3 | |||
4 | /* | ||
5 | * Several system calls take a flag to request "hugetlb" huge pages. | ||
6 | * Without further specification, these system calls will use the | ||
7 | * system's default huge page size. If a system supports multiple | ||
8 | * huge page sizes, the desired huge page size can be specified in | ||
9 | * bits [26:31] of the flag arguments. The value in these 6 bits | ||
10 | * will encode the log2 of the huge page size. | ||
11 | * | ||
12 | * The following definitions are associated with this huge page size | ||
13 | * encoding in flag arguments. System call specific header files | ||
14 | * that use this encoding should include this file. They can then | ||
15 | * provide definitions based on these with their own specific prefix. | ||
16 | * for example: | ||
17 | * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT | ||
18 | */ | ||
19 | |||
20 | #define HUGETLB_FLAG_ENCODE_SHIFT 26 | ||
21 | #define HUGETLB_FLAG_ENCODE_MASK 0x3f | ||
22 | |||
23 | #define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
24 | #define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
25 | #define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
26 | #define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
27 | #define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
28 | #define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
29 | #define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
30 | #define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
31 | #define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
32 | #define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) | ||
33 | |||
34 | #endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ | ||
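A minimal round-trip of the encoding above, for illustration only: log2(2 MB) = 21, so shifting 21 into bits [26:31] produces the same value as HUGETLB_FLAG_ENCODE_2MB, and masking it back out recovers the page size.

```c
/* Illustration of the log2-in-bits-[26:31] encoding described above;
 * the two macros are copied from this header so the demo is standalone. */
#include <stdio.h>

#define HUGETLB_FLAG_ENCODE_SHIFT 26
#define HUGETLB_FLAG_ENCODE_MASK  0x3f

int main(void)
{
	int log2_size = 21;				/* log2(2MB) */
	unsigned int flag = log2_size << HUGETLB_FLAG_ENCODE_SHIFT;

	/* Decoding: shift back down and mask to the 6 encoding bits. */
	int decoded = (flag >> HUGETLB_FLAG_ENCODE_SHIFT) &
		      HUGETLB_FLAG_ENCODE_MASK;

	printf("flag=0x%x log2=%d size=%lu bytes\n",
	       flag, decoded, 1UL << decoded);
	return 0;
}
```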
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 8c27db0c5c08..203268f9231e 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h | |||
@@ -58,20 +58,12 @@ | |||
58 | overrides the coredump filter bits */ | 58 | overrides the coredump filter bits */ |
59 | #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ | 59 | #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ |
60 | 60 | ||
61 | #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ | ||
62 | #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ | ||
63 | |||
61 | /* compatibility flags */ | 64 | /* compatibility flags */ |
62 | #define MAP_FILE 0 | 65 | #define MAP_FILE 0 |
63 | 66 | ||
64 | /* | ||
65 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
66 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
67 | * spaces. | ||
68 | * | ||
69 | * Assume these are all power of twos. | ||
70 | * When 0 use the default page size. | ||
71 | */ | ||
72 | #define MAP_HUGE_SHIFT 26 | ||
73 | #define MAP_HUGE_MASK 0x3f | ||
74 | |||
75 | #define PKEY_DISABLE_ACCESS 0x1 | 67 | #define PKEY_DISABLE_ACCESS 0x1 |
76 | #define PKEY_DISABLE_WRITE 0x2 | 68 | #define PKEY_DISABLE_WRITE 0x2 |
77 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ | 69 | #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ |
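A hedged usage sketch for the new madvise flags: a private anonymous region marked MADV_WIPEONFORK is seen as zero-filled by the child after fork(), while the parent keeps its data; MADV_KEEPONFORK undoes it. The fallback define simply mirrors the value added above, and madvise() returns EINVAL on file-backed or shared VMAs, as the mm/madvise.c hunk later in this series enforces.

```c
#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/wait.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef MADV_WIPEONFORK
#define MADV_WIPEONFORK 18	/* value added by this series */
#endif

int main(void)
{
	size_t len = 4096;
	char *secret = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (secret == MAP_FAILED)
		return 1;
	strcpy(secret, "parent-only key material");
	if (madvise(secret, len, MADV_WIPEONFORK))
		perror("madvise");	/* EINVAL on file-backed/shared VMAs */

	if (fork() == 0) {
		/* The child faults in zero pages, not the parent's data. */
		printf("child sees:  \"%s\"\n", secret);
		_exit(0);
	}
	wait(NULL);
	printf("parent sees: \"%s\"\n", secret);
	munmap(secret, len);
	return 0;
}
```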
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 534e364bda92..7f3a722dbd72 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h | |||
@@ -1,8 +1,32 @@ | |||
1 | #ifndef _UAPI_LINUX_MEMFD_H | 1 | #ifndef _UAPI_LINUX_MEMFD_H |
2 | #define _UAPI_LINUX_MEMFD_H | 2 | #define _UAPI_LINUX_MEMFD_H |
3 | 3 | ||
4 | #include <asm-generic/hugetlb_encode.h> | ||
5 | |||
4 | /* flags for memfd_create(2) (unsigned int) */ | 6 | /* flags for memfd_create(2) (unsigned int) */ |
5 | #define MFD_CLOEXEC 0x0001U | 7 | #define MFD_CLOEXEC 0x0001U |
6 | #define MFD_ALLOW_SEALING 0x0002U | 8 | #define MFD_ALLOW_SEALING 0x0002U |
9 | #define MFD_HUGETLB 0x0004U | ||
10 | |||
11 | /* | ||
12 | * Huge page size encoding when MFD_HUGETLB is specified, and a huge page | ||
13 | * size other than the default is desired. See hugetlb_encode.h. | ||
14 | * All known huge page size encodings are provided here. It is the | ||
15 | * responsibility of the application to know which sizes are supported on | ||
16 | * the running system. See mmap(2) man page for details. | ||
17 | */ | ||
18 | #define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT | ||
19 | #define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK | ||
20 | |||
21 | #define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB | ||
22 | #define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB | ||
23 | #define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB | ||
24 | #define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB | ||
25 | #define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB | ||
26 | #define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB | ||
27 | #define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB | ||
28 | #define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB | ||
29 | #define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB | ||
30 | #define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB | ||
7 | 31 | ||
8 | #endif /* _UAPI_LINUX_MEMFD_H */ | 32 | #endif /* _UAPI_LINUX_MEMFD_H */ |
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index ade4acd3a90c..a937480d7cd3 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _UAPI_LINUX_MMAN_H | 2 | #define _UAPI_LINUX_MMAN_H |
3 | 3 | ||
4 | #include <asm/mman.h> | 4 | #include <asm/mman.h> |
5 | #include <asm-generic/hugetlb_encode.h> | ||
5 | 6 | ||
6 | #define MREMAP_MAYMOVE 1 | 7 | #define MREMAP_MAYMOVE 1 |
7 | #define MREMAP_FIXED 2 | 8 | #define MREMAP_FIXED 2 |
@@ -10,4 +11,25 @@ | |||
10 | #define OVERCOMMIT_ALWAYS 1 | 11 | #define OVERCOMMIT_ALWAYS 1 |
11 | #define OVERCOMMIT_NEVER 2 | 12 | #define OVERCOMMIT_NEVER 2 |
12 | 13 | ||
14 | /* | ||
15 | * Huge page size encoding when MAP_HUGETLB is specified, and a huge page | ||
16 | * size other than the default is desired. See hugetlb_encode.h. | ||
17 | * All known huge page size encodings are provided here. It is the | ||
18 | * responsibility of the application to know which sizes are supported on | ||
19 | * the running system. See mmap(2) man page for details. | ||
20 | */ | ||
21 | #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT | ||
22 | #define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK | ||
23 | |||
24 | #define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB | ||
25 | #define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB | ||
26 | #define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB | ||
27 | #define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB | ||
28 | #define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB | ||
29 | #define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB | ||
30 | #define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB | ||
31 | #define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB | ||
32 | #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB | ||
33 | #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB | ||
34 | |||
13 | #endif /* _UAPI_LINUX_MMAN_H */ | 35 | #endif /* _UAPI_LINUX_MMAN_H */ |
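With the encodings now exported here, requesting a specific huge page size from mmap(2) is a matter of OR-ing one of the MAP_HUGE_* values into the flags. A minimal sketch for a 2 MB anonymous mapping; the fallback define just spells out log2(2 MB) = 21 shifted into bits [26:31] for older userspace headers.

```c
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB (21 << 26)	/* log2(2MB) in bits [26:31] */
#endif

int main(void)
{
	size_t len = 2UL * 1024 * 1024;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");	/* no 2MB pages reserved, or size unsupported */
		return 1;
	}
	munmap(p, len);
	return 0;
}
```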
diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 1fbf24ea37fd..cf23c873719d 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/ipc.h> | 4 | #include <linux/ipc.h> |
5 | #include <linux/errno.h> | 5 | #include <linux/errno.h> |
6 | #include <asm-generic/hugetlb_encode.h> | ||
6 | #ifndef __KERNEL__ | 7 | #ifndef __KERNEL__ |
7 | #include <unistd.h> | 8 | #include <unistd.h> |
8 | #endif | 9 | #endif |
@@ -40,11 +41,37 @@ struct shmid_ds { | |||
40 | /* Include the definition of shmid64_ds and shminfo64 */ | 41 | /* Include the definition of shmid64_ds and shminfo64 */ |
41 | #include <asm/shmbuf.h> | 42 | #include <asm/shmbuf.h> |
42 | 43 | ||
43 | /* permission flag for shmget */ | 44 | /* |
45 | * shmget() shmflg values. | ||
46 | */ | ||
47 | /* The bottom nine bits are the same as open(2) mode flags */ | ||
44 | #define SHM_R 0400 /* or S_IRUGO from <linux/stat.h> */ | 48 | #define SHM_R 0400 /* or S_IRUGO from <linux/stat.h> */ |
45 | #define SHM_W 0200 /* or S_IWUGO from <linux/stat.h> */ | 49 | #define SHM_W 0200 /* or S_IWUGO from <linux/stat.h> */ |
50 | /* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */ | ||
51 | #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ | ||
52 | #define SHM_NORESERVE 010000 /* don't check for reservations */ | ||
53 | |||
54 | /* | ||
55 | * Huge page size encoding when SHM_HUGETLB is specified, and a huge page | ||
56 | * size other than the default is desired. See hugetlb_encode.h | ||
57 | */ | ||
58 | #define SHM_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT | ||
59 | #define SHM_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK | ||
60 | |||
61 | #define SHM_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB | ||
62 | #define SHM_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB | ||
63 | #define SHM_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB | ||
64 | #define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB | ||
65 | #define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB | ||
66 | #define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB | ||
67 | #define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB | ||
68 | #define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB | ||
69 | #define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB | ||
70 | #define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB | ||
46 | 71 | ||
47 | /* mode for attach */ | 72 | /* |
73 | * shmat() shmflg values | ||
74 | */ | ||
48 | #define SHM_RDONLY 010000 /* read-only access */ | 75 | #define SHM_RDONLY 010000 /* read-only access */ |
49 | #define SHM_RND 020000 /* round attach address to SHMLBA boundary */ | 76 | #define SHM_RND 020000 /* round attach address to SHMLBA boundary */ |
50 | #define SHM_REMAP 040000 /* take-over region on attach */ | 77 | #define SHM_REMAP 040000 /* take-over region on attach */ |
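The same encoding now applies to shmget(); a hedged sketch that asks for a 2 MB-page SysV segment (assumes 2 MB huge pages are configured, and the fallback define mirrors the generic encoding):

```c
#include <sys/ipc.h>
#include <sys/shm.h>
#include <stdio.h>

#ifndef SHM_HUGE_2MB
#define SHM_HUGE_2MB (21 << 26)	/* log2(2MB) in bits [26:31] */
#endif

int main(void)
{
	int id = shmget(IPC_PRIVATE, 2UL * 1024 * 1024,
			IPC_CREAT | SHM_HUGETLB | SHM_HUGE_2MB | 0600);

	if (id < 0) {
		perror("shmget");	/* ENOMEM if no 2MB pages are free */
		return 1;
	}
	shmctl(id, IPC_RMID, NULL);	/* mark for removal right away */
	return 0;
}
```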
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..d6d1f65cb3c3 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h | |||
@@ -23,7 +23,9 @@ | |||
23 | UFFD_FEATURE_EVENT_REMOVE | \ | 23 | UFFD_FEATURE_EVENT_REMOVE | \ |
24 | UFFD_FEATURE_EVENT_UNMAP | \ | 24 | UFFD_FEATURE_EVENT_UNMAP | \ |
25 | UFFD_FEATURE_MISSING_HUGETLBFS | \ | 25 | UFFD_FEATURE_MISSING_HUGETLBFS | \ |
26 | UFFD_FEATURE_MISSING_SHMEM) | 26 | UFFD_FEATURE_MISSING_SHMEM | \ |
27 | UFFD_FEATURE_SIGBUS | \ | ||
28 | UFFD_FEATURE_THREAD_ID) | ||
27 | #define UFFD_API_IOCTLS \ | 29 | #define UFFD_API_IOCTLS \ |
28 | ((__u64)1 << _UFFDIO_REGISTER | \ | 30 | ((__u64)1 << _UFFDIO_REGISTER | \ |
29 | (__u64)1 << _UFFDIO_UNREGISTER | \ | 31 | (__u64)1 << _UFFDIO_UNREGISTER | \ |
@@ -78,6 +80,9 @@ struct uffd_msg { | |||
78 | struct { | 80 | struct { |
79 | __u64 flags; | 81 | __u64 flags; |
80 | __u64 address; | 82 | __u64 address; |
83 | union { | ||
84 | __u32 ptid; | ||
85 | } feat; | ||
81 | } pagefault; | 86 | } pagefault; |
82 | 87 | ||
83 | struct { | 88 | struct { |
@@ -153,6 +158,13 @@ struct uffdio_api { | |||
153 | * UFFD_FEATURE_MISSING_SHMEM works the same as | 158 | * UFFD_FEATURE_MISSING_SHMEM works the same as |
154 | * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem | 159 | * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem |
155 | * (i.e. tmpfs and other shmem based APIs). | 160 | * (i.e. tmpfs and other shmem based APIs). |
161 | * | ||
162 | * The UFFD_FEATURE_SIGBUS feature means no page-fault | ||
163 | * (UFFD_EVENT_PAGEFAULT) event will be delivered; instead | ||
164 | * a SIGBUS signal will be sent to the faulting process. | ||
165 | * | ||
166 | * UFFD_FEATURE_THREAD_ID means the pid of the faulting task_struct | ||
167 | * will be returned; if the feature is not requested, 0 is returned. | ||
156 | */ | 168 | */ |
157 | #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) | 169 | #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) |
158 | #define UFFD_FEATURE_EVENT_FORK (1<<1) | 170 | #define UFFD_FEATURE_EVENT_FORK (1<<1) |
@@ -161,6 +173,8 @@ struct uffdio_api { | |||
161 | #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) | 173 | #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) |
162 | #define UFFD_FEATURE_MISSING_SHMEM (1<<5) | 174 | #define UFFD_FEATURE_MISSING_SHMEM (1<<5) |
163 | #define UFFD_FEATURE_EVENT_UNMAP (1<<6) | 175 | #define UFFD_FEATURE_EVENT_UNMAP (1<<6) |
176 | #define UFFD_FEATURE_SIGBUS (1<<7) | ||
177 | #define UFFD_FEATURE_THREAD_ID (1<<8) | ||
164 | __u64 features; | 178 | __u64 features; |
165 | 179 | ||
166 | __u64 ioctls; | 180 | __u64 ioctls; |
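A minimal UFFDIO_API handshake that requests the new UFFD_FEATURE_SIGBUS behaviour, as an illustrative sketch; it assumes <linux/userfaultfd.h> from a kernel carrying this series, and on older kernels the ioctl rejects the unknown feature bit.

```c
#include <linux/userfaultfd.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_SIGBUS,  /* SIGBUS instead of events */
	};

	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}
	if (ioctl(uffd, UFFDIO_API, &api)) {
		perror("UFFDIO_API");	/* kernel too old for the feature */
		close(uffd);
		return 1;
	}
	printf("negotiated features: 0x%llx\n",
	       (unsigned long long)api.features);
	close(uffd);
	return 0;
}
```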
diff --git a/init/Kconfig b/init/Kconfig index 5f0ef850e808..78cb2461012e 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -1576,6 +1576,15 @@ config SLAB_FREELIST_RANDOM | |||
1576 | security feature reduces the predictability of the kernel slab | 1576 | security feature reduces the predictability of the kernel slab |
1577 | allocator against heap overflows. | 1577 | allocator against heap overflows. |
1578 | 1578 | ||
1579 | config SLAB_FREELIST_HARDENED | ||
1580 | bool "Harden slab freelist metadata" | ||
1581 | depends on SLUB | ||
1582 | help | ||
1583 | Many kernel heap attacks try to target slab cache metadata and | ||
1584 | other infrastructure. This option makes minor performance | ||
1585 | sacrifices to harden the kernel slab allocator against common | ||
1586 | freelist exploit methods. | ||
1587 | |||
1579 | config SLUB_CPU_PARTIAL | 1588 | config SLUB_CPU_PARTIAL |
1580 | default y | 1589 | default y |
1581 | depends on SLUB && SMP | 1590 | depends on SLUB && SMP |
diff --git a/init/main.c b/init/main.c index 8828fc148670..a21a1a8708a8 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -542,7 +542,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
542 | boot_cpu_state_init(); | 542 | boot_cpu_state_init(); |
543 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ | 543 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ |
544 | 544 | ||
545 | build_all_zonelists(NULL, NULL); | 545 | build_all_zonelists(NULL); |
546 | page_alloc_init(); | 546 | page_alloc_init(); |
547 | 547 | ||
548 | pr_notice("Kernel command line: %s\n", boot_command_line); | 548 | pr_notice("Kernel command line: %s\n", boot_command_line); |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index df2e0f14a95d..f64fc967a9ef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
@@ -4100,9 +4100,6 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
4100 | if (!(css->flags & CSS_ONLINE)) | 4100 | if (!(css->flags & CSS_ONLINE)) |
4101 | return; | 4101 | return; |
4102 | 4102 | ||
4103 | if (ss->css_reset) | ||
4104 | ss->css_reset(css); | ||
4105 | |||
4106 | if (ss->css_offline) | 4103 | if (ss->css_offline) |
4107 | ss->css_offline(css); | 4104 | ss->css_offline(css); |
4108 | 4105 | ||
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2f4039bafebb..e7485786db9b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/time64.h> | 56 | #include <linux/time64.h> |
57 | #include <linux/backing-dev.h> | 57 | #include <linux/backing-dev.h> |
58 | #include <linux/sort.h> | 58 | #include <linux/sort.h> |
59 | #include <linux/oom.h> | ||
59 | 60 | ||
60 | #include <linux/uaccess.h> | 61 | #include <linux/uaccess.h> |
61 | #include <linux/atomic.h> | 62 | #include <linux/atomic.h> |
@@ -2500,12 +2501,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
2500 | * If we're in interrupt, yes, we can always allocate. If @node is set in | 2501 | * If we're in interrupt, yes, we can always allocate. If @node is set in |
2501 | * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this | 2502 | * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this |
2502 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, | 2503 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, |
2503 | * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. | 2504 | * yes. If current has access to memory reserves as an oom victim, yes. |
2504 | * Otherwise, no. | 2505 | * Otherwise, no. |
2505 | * | 2506 | * |
2506 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 2507 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, |
2507 | * and do not allow allocations outside the current tasks cpuset | 2508 | * and do not allow allocations outside the current tasks cpuset |
2508 | * unless the task has been OOM killed as is marked TIF_MEMDIE. | 2509 | * unless the task has been OOM killed. |
2509 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2510 | * GFP_KERNEL allocations are not so marked, so can escape to the |
2510 | * nearest enclosing hardwalled ancestor cpuset. | 2511 | * nearest enclosing hardwalled ancestor cpuset. |
2511 | * | 2512 | * |
@@ -2528,7 +2529,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
2528 | * affect that: | 2529 | * affect that: |
2529 | * in_interrupt - any node ok (current task context irrelevant) | 2530 | * in_interrupt - any node ok (current task context irrelevant) |
2530 | * GFP_ATOMIC - any node ok | 2531 | * GFP_ATOMIC - any node ok |
2531 | * TIF_MEMDIE - any node ok | 2532 | * tsk_is_oom_victim - any node ok |
2532 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok | 2533 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok |
2533 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2534 | * GFP_USER - only nodes in current tasks mems allowed ok. |
2534 | */ | 2535 | */ |
@@ -2546,7 +2547,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) | |||
2546 | * Allow tasks that have access to memory reserves because they have | 2547 | * Allow tasks that have access to memory reserves because they have |
2547 | * been OOM killed to get memory anywhere. | 2548 | * been OOM killed to get memory anywhere. |
2548 | */ | 2549 | */ |
2549 | if (unlikely(test_thread_flag(TIF_MEMDIE))) | 2550 | if (unlikely(tsk_is_oom_victim(current))) |
2550 | return true; | 2551 | return true; |
2551 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ | 2552 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ |
2552 | return false; | 2553 | return false; |
diff --git a/kernel/fork.c b/kernel/fork.c index 4e5345c07344..24a4c0be80d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, | |||
657 | retval = dup_userfaultfd(tmp, &uf); | 657 | retval = dup_userfaultfd(tmp, &uf); |
658 | if (retval) | 658 | if (retval) |
659 | goto fail_nomem_anon_vma_fork; | 659 | goto fail_nomem_anon_vma_fork; |
660 | if (anon_vma_fork(tmp, mpnt)) | 660 | if (tmp->vm_flags & VM_WIPEONFORK) { |
661 | /* VM_WIPEONFORK gets a clean slate in the child. */ | ||
662 | tmp->anon_vma = NULL; | ||
663 | if (anon_vma_prepare(tmp)) | ||
664 | goto fail_nomem_anon_vma_fork; | ||
665 | } else if (anon_vma_fork(tmp, mpnt)) | ||
661 | goto fail_nomem_anon_vma_fork; | 666 | goto fail_nomem_anon_vma_fork; |
662 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); | 667 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); |
663 | tmp->vm_next = tmp->vm_prev = NULL; | 668 | tmp->vm_next = tmp->vm_prev = NULL; |
@@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, | |||
701 | rb_parent = &tmp->vm_rb; | 706 | rb_parent = &tmp->vm_rb; |
702 | 707 | ||
703 | mm->map_count++; | 708 | mm->map_count++; |
704 | retval = copy_page_range(mm, oldmm, mpnt); | 709 | if (!(tmp->vm_flags & VM_WIPEONFORK)) |
710 | retval = copy_page_range(mm, oldmm, mpnt); | ||
705 | 711 | ||
706 | if (tmp->vm_ops && tmp->vm_ops->open) | 712 | if (tmp->vm_ops && tmp->vm_ops->open) |
707 | tmp->vm_ops->open(tmp); | 713 | tmp->vm_ops->open(tmp); |
@@ -922,7 +928,6 @@ static inline void __mmput(struct mm_struct *mm) | |||
922 | } | 928 | } |
923 | if (mm->binfmt) | 929 | if (mm->binfmt) |
924 | module_put(mm->binfmt->module); | 930 | module_put(mm->binfmt->module); |
925 | set_bit(MMF_OOM_SKIP, &mm->flags); | ||
926 | mmdrop(mm); | 931 | mmdrop(mm); |
927 | } | 932 | } |
928 | 933 | ||
@@ -938,22 +943,6 @@ void mmput(struct mm_struct *mm) | |||
938 | } | 943 | } |
939 | EXPORT_SYMBOL_GPL(mmput); | 944 | EXPORT_SYMBOL_GPL(mmput); |
940 | 945 | ||
941 | #ifdef CONFIG_MMU | ||
942 | static void mmput_async_fn(struct work_struct *work) | ||
943 | { | ||
944 | struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); | ||
945 | __mmput(mm); | ||
946 | } | ||
947 | |||
948 | void mmput_async(struct mm_struct *mm) | ||
949 | { | ||
950 | if (atomic_dec_and_test(&mm->mm_users)) { | ||
951 | INIT_WORK(&mm->async_put_work, mmput_async_fn); | ||
952 | schedule_work(&mm->async_put_work); | ||
953 | } | ||
954 | } | ||
955 | #endif | ||
956 | |||
957 | /** | 946 | /** |
958 | * set_mm_exe_file - change a reference to the mm's executable file | 947 | * set_mm_exe_file - change a reference to the mm's executable file |
959 | * | 948 | * |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 9afdc434fb49..066e73c2fcc9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -194,18 +194,41 @@ struct page_map { | |||
194 | struct vmem_altmap altmap; | 194 | struct vmem_altmap altmap; |
195 | }; | 195 | }; |
196 | 196 | ||
197 | static void pgmap_radix_release(struct resource *res) | 197 | static unsigned long order_at(struct resource *res, unsigned long pgoff) |
198 | { | 198 | { |
199 | resource_size_t key, align_start, align_size, align_end; | 199 | unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; |
200 | unsigned long nr_pages, mask; | ||
200 | 201 | ||
201 | align_start = res->start & ~(SECTION_SIZE - 1); | 202 | nr_pages = PHYS_PFN(resource_size(res)); |
202 | align_size = ALIGN(resource_size(res), SECTION_SIZE); | 203 | if (nr_pages == pgoff) |
203 | align_end = align_start + align_size - 1; | 204 | return ULONG_MAX; |
205 | |||
206 | /* | ||
207 | * What is the largest aligned power-of-2 range available from | ||
208 | * this resource pgoff to the end of the resource range, | ||
209 | * considering the alignment of the current pgoff? | ||
210 | */ | ||
211 | mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); | ||
212 | if (!mask) | ||
213 | return ULONG_MAX; | ||
214 | |||
215 | return find_first_bit(&mask, BITS_PER_LONG); | ||
216 | } | ||
217 | |||
218 | #define foreach_order_pgoff(res, order, pgoff) \ | ||
219 | for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ | ||
220 | pgoff += 1UL << order, order = order_at((res), pgoff)) | ||
221 | |||
222 | static void pgmap_radix_release(struct resource *res) | ||
223 | { | ||
224 | unsigned long pgoff, order; | ||
204 | 225 | ||
205 | mutex_lock(&pgmap_lock); | 226 | mutex_lock(&pgmap_lock); |
206 | for (key = res->start; key <= res->end; key += SECTION_SIZE) | 227 | foreach_order_pgoff(res, order, pgoff) |
207 | radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); | 228 | radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); |
208 | mutex_unlock(&pgmap_lock); | 229 | mutex_unlock(&pgmap_lock); |
230 | |||
231 | synchronize_rcu(); | ||
209 | } | 232 | } |
210 | 233 | ||
211 | static unsigned long pfn_first(struct page_map *page_map) | 234 | static unsigned long pfn_first(struct page_map *page_map) |
@@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) | |||
268 | 291 | ||
269 | WARN_ON_ONCE(!rcu_read_lock_held()); | 292 | WARN_ON_ONCE(!rcu_read_lock_held()); |
270 | 293 | ||
271 | page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); | 294 | page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); |
272 | return page_map ? &page_map->pgmap : NULL; | 295 | return page_map ? &page_map->pgmap : NULL; |
273 | } | 296 | } |
274 | 297 | ||
@@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) | |||
293 | void *devm_memremap_pages(struct device *dev, struct resource *res, | 316 | void *devm_memremap_pages(struct device *dev, struct resource *res, |
294 | struct percpu_ref *ref, struct vmem_altmap *altmap) | 317 | struct percpu_ref *ref, struct vmem_altmap *altmap) |
295 | { | 318 | { |
296 | resource_size_t key, align_start, align_size, align_end; | 319 | resource_size_t align_start, align_size, align_end; |
320 | unsigned long pfn, pgoff, order; | ||
297 | pgprot_t pgprot = PAGE_KERNEL; | 321 | pgprot_t pgprot = PAGE_KERNEL; |
298 | struct dev_pagemap *pgmap; | 322 | struct dev_pagemap *pgmap; |
299 | struct page_map *page_map; | 323 | struct page_map *page_map; |
300 | int error, nid, is_ram; | 324 | int error, nid, is_ram; |
301 | unsigned long pfn; | ||
302 | 325 | ||
303 | align_start = res->start & ~(SECTION_SIZE - 1); | 326 | align_start = res->start & ~(SECTION_SIZE - 1); |
304 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) | 327 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) |
@@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
337 | mutex_lock(&pgmap_lock); | 360 | mutex_lock(&pgmap_lock); |
338 | error = 0; | 361 | error = 0; |
339 | align_end = align_start + align_size - 1; | 362 | align_end = align_start + align_size - 1; |
340 | for (key = align_start; key <= align_end; key += SECTION_SIZE) { | 363 | |
364 | foreach_order_pgoff(res, order, pgoff) { | ||
341 | struct dev_pagemap *dup; | 365 | struct dev_pagemap *dup; |
342 | 366 | ||
343 | rcu_read_lock(); | 367 | rcu_read_lock(); |
344 | dup = find_dev_pagemap(key); | 368 | dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); |
345 | rcu_read_unlock(); | 369 | rcu_read_unlock(); |
346 | if (dup) { | 370 | if (dup) { |
347 | dev_err(dev, "%s: %pr collides with mapping for %s\n", | 371 | dev_err(dev, "%s: %pr collides with mapping for %s\n", |
@@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
349 | error = -EBUSY; | 373 | error = -EBUSY; |
350 | break; | 374 | break; |
351 | } | 375 | } |
352 | error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, | 376 | error = __radix_tree_insert(&pgmap_radix, |
353 | page_map); | 377 | PHYS_PFN(res->start) + pgoff, order, page_map); |
354 | if (error) { | 378 | if (error) { |
355 | dev_err(dev, "%s: failed: %d\n", __func__, error); | 379 | dev_err(dev, "%s: failed: %d\n", __func__, error); |
356 | break; | 380 | break; |
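The order_at()/foreach_order_pgoff() helpers above walk the resource in the largest power-of-two chunks that are both aligned to the current physical page offset and still fit in the remaining range, which is what lets the multi-order radix tree cover a ZONE_DEVICE range with few entries. A standalone userspace re-implementation of the same arithmetic, for illustration only:

```c
#include <stdio.h>

/* Mirror of order_at(): the largest power-of-2 order usable at pgoff,
 * limited by the alignment of the physical page offset and by the
 * number of pages left in the resource. */
static unsigned long order_at(unsigned long phys_start_pfn,
			      unsigned long nr_pages, unsigned long pgoff)
{
	unsigned long phys_pgoff = phys_start_pfn + pgoff;
	unsigned long remaining = nr_pages - pgoff;
	unsigned long mask;

	if (remaining == 0)
		return ~0UL;		/* end of the resource */
	/* rounddown_pow_of_two(remaining): keep only the top set bit. */
	while (remaining & (remaining - 1))
		remaining &= remaining - 1;
	mask = phys_pgoff | remaining;
	return __builtin_ctzl(mask);	/* index of the lowest set bit */
}

int main(void)
{
	unsigned long pgoff = 0, nr = 160, order;	/* 160-page resource */

	while ((order = order_at(0x1000, nr, pgoff)) != ~0UL) {
		printf("pgoff %lu -> order %lu (%lu pages)\n",
		       pgoff, order, 1UL << order);
		pgoff += 1UL << order;
	}
	return 0;
}
```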
diff --git a/mm/Kconfig b/mm/Kconfig index 48b1af447fa7..0ded10a22639 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -678,6 +678,7 @@ config ZONE_DEVICE | |||
678 | depends on MEMORY_HOTREMOVE | 678 | depends on MEMORY_HOTREMOVE |
679 | depends on SPARSEMEM_VMEMMAP | 679 | depends on SPARSEMEM_VMEMMAP |
680 | depends on ARCH_HAS_ZONE_DEVICE | 680 | depends on ARCH_HAS_ZONE_DEVICE |
681 | select RADIX_TREE_MULTIORDER | ||
681 | 682 | ||
682 | help | 683 | help |
683 | Device memory hotplug support allows for establishing pmem, | 684 | Device memory hotplug support allows for establishing pmem, |
diff --git a/mm/filemap.c b/mm/filemap.c index 1e01cb6e5173..9d21afd692b9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping, | |||
130 | return -EEXIST; | 130 | return -EEXIST; |
131 | 131 | ||
132 | mapping->nrexceptional--; | 132 | mapping->nrexceptional--; |
133 | if (!dax_mapping(mapping)) { | 133 | if (shadowp) |
134 | if (shadowp) | 134 | *shadowp = p; |
135 | *shadowp = p; | ||
136 | } else { | ||
137 | /* DAX can replace empty locked entry with a hole */ | ||
138 | WARN_ON_ONCE(p != | ||
139 | dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); | ||
140 | /* Wakeup waiters for exceptional entry lock */ | ||
141 | dax_wake_mapping_entry_waiter(mapping, page->index, p, | ||
142 | true); | ||
143 | } | ||
144 | } | 135 | } |
145 | __radix_tree_replace(&mapping->page_tree, node, slot, page, | 136 | __radix_tree_replace(&mapping->page_tree, node, slot, page, |
146 | workingset_update_node, mapping); | 137 | workingset_update_node, mapping); |
@@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping, | |||
402 | { | 393 | { |
403 | pgoff_t index = start_byte >> PAGE_SHIFT; | 394 | pgoff_t index = start_byte >> PAGE_SHIFT; |
404 | pgoff_t end = end_byte >> PAGE_SHIFT; | 395 | pgoff_t end = end_byte >> PAGE_SHIFT; |
405 | struct pagevec pvec; | 396 | struct page *page; |
406 | bool ret; | ||
407 | 397 | ||
408 | if (end_byte < start_byte) | 398 | if (end_byte < start_byte) |
409 | return false; | 399 | return false; |
@@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping, | |||
411 | if (mapping->nrpages == 0) | 401 | if (mapping->nrpages == 0) |
412 | return false; | 402 | return false; |
413 | 403 | ||
414 | pagevec_init(&pvec, 0); | 404 | if (!find_get_pages_range(mapping, &index, end, 1, &page)) |
415 | if (!pagevec_lookup(&pvec, mapping, index, 1)) | ||
416 | return false; | 405 | return false; |
417 | ret = (pvec.pages[0]->index <= end); | 406 | put_page(page); |
418 | pagevec_release(&pvec); | 407 | return true; |
419 | return ret; | ||
420 | } | 408 | } |
421 | EXPORT_SYMBOL(filemap_range_has_page); | 409 | EXPORT_SYMBOL(filemap_range_has_page); |
422 | 410 | ||
@@ -1564,23 +1552,29 @@ export: | |||
1564 | } | 1552 | } |
1565 | 1553 | ||
1566 | /** | 1554 | /** |
1567 | * find_get_pages - gang pagecache lookup | 1555 | * find_get_pages_range - gang pagecache lookup |
1568 | * @mapping: The address_space to search | 1556 | * @mapping: The address_space to search |
1569 | * @start: The starting page index | 1557 | * @start: The starting page index |
1558 | * @end: The final page index (inclusive) | ||
1570 | * @nr_pages: The maximum number of pages | 1559 | * @nr_pages: The maximum number of pages |
1571 | * @pages: Where the resulting pages are placed | 1560 | * @pages: Where the resulting pages are placed |
1572 | * | 1561 | * |
1573 | * find_get_pages() will search for and return a group of up to | 1562 | * find_get_pages_range() will search for and return a group of up to @nr_pages |
1574 | * @nr_pages pages in the mapping. The pages are placed at @pages. | 1563 | * pages in the mapping starting at index @start and up to index @end |
1575 | * find_get_pages() takes a reference against the returned pages. | 1564 | * (inclusive). The pages are placed at @pages. find_get_pages_range() takes |
1565 | * a reference against the returned pages. | ||
1576 | * | 1566 | * |
1577 | * The search returns a group of mapping-contiguous pages with ascending | 1567 | * The search returns a group of mapping-contiguous pages with ascending |
1578 | * indexes. There may be holes in the indices due to not-present pages. | 1568 | * indexes. There may be holes in the indices due to not-present pages. |
1569 | * We also update @start to index the next page for the traversal. | ||
1579 | * | 1570 | * |
1580 | * find_get_pages() returns the number of pages which were found. | 1571 | * find_get_pages_range() returns the number of pages which were found. If this |
1572 | * number is smaller than @nr_pages, the end of specified range has been | ||
1573 | * reached. | ||
1581 | */ | 1574 | */ |
1582 | unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | 1575 | unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, |
1583 | unsigned int nr_pages, struct page **pages) | 1576 | pgoff_t end, unsigned int nr_pages, |
1577 | struct page **pages) | ||
1584 | { | 1578 | { |
1585 | struct radix_tree_iter iter; | 1579 | struct radix_tree_iter iter; |
1586 | void **slot; | 1580 | void **slot; |
@@ -1590,8 +1584,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
1590 | return 0; | 1584 | return 0; |
1591 | 1585 | ||
1592 | rcu_read_lock(); | 1586 | rcu_read_lock(); |
1593 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | 1587 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { |
1594 | struct page *head, *page; | 1588 | struct page *head, *page; |
1589 | |||
1590 | if (iter.index > end) | ||
1591 | break; | ||
1595 | repeat: | 1592 | repeat: |
1596 | page = radix_tree_deref_slot(slot); | 1593 | page = radix_tree_deref_slot(slot); |
1597 | if (unlikely(!page)) | 1594 | if (unlikely(!page)) |
@@ -1627,11 +1624,25 @@ repeat: | |||
1627 | } | 1624 | } |
1628 | 1625 | ||
1629 | pages[ret] = page; | 1626 | pages[ret] = page; |
1630 | if (++ret == nr_pages) | 1627 | if (++ret == nr_pages) { |
1631 | break; | 1628 | *start = pages[ret - 1]->index + 1; |
1629 | goto out; | ||
1630 | } | ||
1632 | } | 1631 | } |
1633 | 1632 | ||
1633 | /* | ||
1634 | * We come here when there is no page beyond @end. We take care to not | ||
1635 | * overflow the index @start as it confuses some of the callers. This | ||
1636 | * breaks the iteration when there is page at index -1 but that is | ||
1637 | * already broken anyway. | ||
1638 | */ | ||
1639 | if (end == (pgoff_t)-1) | ||
1640 | *start = (pgoff_t)-1; | ||
1641 | else | ||
1642 | *start = end + 1; | ||
1643 | out: | ||
1634 | rcu_read_unlock(); | 1644 | rcu_read_unlock(); |
1645 | |||
1635 | return ret; | 1646 | return ret; |
1636 | } | 1647 | } |
1637 | 1648 | ||
@@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1352 | } | 1352 | } |
1353 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ | 1353 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ |
1354 | 1354 | ||
1355 | #ifdef __HAVE_ARCH_PTE_DEVMAP | 1355 | #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) |
1356 | static int __gup_device_huge(unsigned long pfn, unsigned long addr, | 1356 | static int __gup_device_huge(unsigned long pfn, unsigned long addr, |
1357 | unsigned long end, struct page **pages, int *nr) | 1357 | unsigned long end, struct page **pages, int *nr) |
1358 | { | 1358 | { |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3644ff918434..0b51e70e0a8b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = { | |||
328 | NULL, | 328 | NULL, |
329 | }; | 329 | }; |
330 | 330 | ||
331 | static struct attribute_group hugepage_attr_group = { | 331 | static const struct attribute_group hugepage_attr_group = { |
332 | .attrs = hugepage_attr, | 332 | .attrs = hugepage_attr, |
333 | }; | 333 | }; |
334 | 334 | ||
@@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, | |||
567 | goto release; | 567 | goto release; |
568 | } | 568 | } |
569 | 569 | ||
570 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | 570 | clear_huge_page(page, vmf->address, HPAGE_PMD_NR); |
571 | /* | 571 | /* |
572 | * The memory barrier inside __SetPageUptodate makes sure that | 572 | * The memory barrier inside __SetPageUptodate makes sure that |
573 | * clear_huge_page writes become visible before the set_pmd_at() | 573 | * clear_huge_page writes become visible before the set_pmd_at() |
@@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) | |||
1240 | * We can only reuse the page if nobody else maps the huge page or it's | 1240 | * We can only reuse the page if nobody else maps the huge page or it's |
1241 | * part. | 1241 | * part. |
1242 | */ | 1242 | */ |
1243 | if (page_trans_huge_mapcount(page, NULL) == 1) { | 1243 | if (!trylock_page(page)) { |
1244 | get_page(page); | ||
1245 | spin_unlock(vmf->ptl); | ||
1246 | lock_page(page); | ||
1247 | spin_lock(vmf->ptl); | ||
1248 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { | ||
1249 | unlock_page(page); | ||
1250 | put_page(page); | ||
1251 | goto out_unlock; | ||
1252 | } | ||
1253 | put_page(page); | ||
1254 | } | ||
1255 | if (reuse_swap_page(page, NULL)) { | ||
1244 | pmd_t entry; | 1256 | pmd_t entry; |
1245 | entry = pmd_mkyoung(orig_pmd); | 1257 | entry = pmd_mkyoung(orig_pmd); |
1246 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1258 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1247 | if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) | 1259 | if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) |
1248 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); | 1260 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
1249 | ret |= VM_FAULT_WRITE; | 1261 | ret |= VM_FAULT_WRITE; |
1262 | unlock_page(page); | ||
1250 | goto out_unlock; | 1263 | goto out_unlock; |
1251 | } | 1264 | } |
1265 | unlock_page(page); | ||
1252 | get_page(page); | 1266 | get_page(page); |
1253 | spin_unlock(vmf->ptl); | 1267 | spin_unlock(vmf->ptl); |
1254 | alloc: | 1268 | alloc: |
@@ -1291,7 +1305,7 @@ alloc: | |||
1291 | count_vm_event(THP_FAULT_ALLOC); | 1305 | count_vm_event(THP_FAULT_ALLOC); |
1292 | 1306 | ||
1293 | if (!page) | 1307 | if (!page) |
1294 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | 1308 | clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); |
1295 | else | 1309 | else |
1296 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 1310 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); |
1297 | __SetPageUptodate(new_page); | 1311 | __SetPageUptodate(new_page); |
@@ -2467,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2467 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 2481 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2468 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 2482 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
2469 | 2483 | ||
2484 | if (PageWriteback(page)) | ||
2485 | return -EBUSY; | ||
2486 | |||
2470 | if (PageAnon(head)) { | 2487 | if (PageAnon(head)) { |
2471 | /* | 2488 | /* |
2472 | * The caller does not necessarily hold an mmap_sem that would | 2489 | * The caller does not necessarily hold an mmap_sem that would |
@@ -2544,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2544 | __dec_node_page_state(page, NR_SHMEM_THPS); | 2561 | __dec_node_page_state(page, NR_SHMEM_THPS); |
2545 | spin_unlock(&pgdata->split_queue_lock); | 2562 | spin_unlock(&pgdata->split_queue_lock); |
2546 | __split_huge_page(page, list, flags); | 2563 | __split_huge_page(page, list, flags); |
2547 | ret = 0; | 2564 | if (PageSwapCache(head)) { |
2565 | swp_entry_t entry = { .val = page_private(head) }; | ||
2566 | |||
2567 | ret = split_swap_cluster(entry); | ||
2568 | } else | ||
2569 | ret = 0; | ||
2548 | } else { | 2570 | } else { |
2549 | if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { | 2571 | if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { |
2550 | pr_alert("total_mapcount: %u, page_count(): %u\n", | 2572 | pr_alert("total_mapcount: %u, page_count(): %u\n", |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 31e207cb399b..34625b257128 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order) | |||
1066 | } | 1066 | } |
1067 | 1067 | ||
1068 | static int __alloc_gigantic_page(unsigned long start_pfn, | 1068 | static int __alloc_gigantic_page(unsigned long start_pfn, |
1069 | unsigned long nr_pages) | 1069 | unsigned long nr_pages, gfp_t gfp_mask) |
1070 | { | 1070 | { |
1071 | unsigned long end_pfn = start_pfn + nr_pages; | 1071 | unsigned long end_pfn = start_pfn + nr_pages; |
1072 | return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, | 1072 | return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, |
1073 | GFP_KERNEL); | 1073 | gfp_mask); |
1074 | } | 1074 | } |
1075 | 1075 | ||
1076 | static bool pfn_range_valid_gigantic(struct zone *z, | 1076 | static bool pfn_range_valid_gigantic(struct zone *z, |
@@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone, | |||
1108 | return zone_spans_pfn(zone, last_pfn); | 1108 | return zone_spans_pfn(zone, last_pfn); |
1109 | } | 1109 | } |
1110 | 1110 | ||
1111 | static struct page *alloc_gigantic_page(int nid, unsigned int order) | 1111 | static struct page *alloc_gigantic_page(int nid, struct hstate *h) |
1112 | { | 1112 | { |
1113 | unsigned int order = huge_page_order(h); | ||
1113 | unsigned long nr_pages = 1 << order; | 1114 | unsigned long nr_pages = 1 << order; |
1114 | unsigned long ret, pfn, flags; | 1115 | unsigned long ret, pfn, flags; |
1115 | struct zone *z; | 1116 | struct zonelist *zonelist; |
1117 | struct zone *zone; | ||
1118 | struct zoneref *z; | ||
1119 | gfp_t gfp_mask; | ||
1116 | 1120 | ||
1117 | z = NODE_DATA(nid)->node_zones; | 1121 | gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; |
1118 | for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { | 1122 | zonelist = node_zonelist(nid, gfp_mask); |
1119 | spin_lock_irqsave(&z->lock, flags); | 1123 | for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { |
1124 | spin_lock_irqsave(&zone->lock, flags); | ||
1120 | 1125 | ||
1121 | pfn = ALIGN(z->zone_start_pfn, nr_pages); | 1126 | pfn = ALIGN(zone->zone_start_pfn, nr_pages); |
1122 | while (zone_spans_last_pfn(z, pfn, nr_pages)) { | 1127 | while (zone_spans_last_pfn(zone, pfn, nr_pages)) { |
1123 | if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { | 1128 | if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { |
1124 | /* | 1129 | /* |
1125 | * We release the zone lock here because | 1130 | * We release the zone lock here because |
1126 | * alloc_contig_range() will also lock the zone | 1131 | * alloc_contig_range() will also lock the zone |
@@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) | |||
1128 | * spinning on this lock, it may win the race | 1133 | * spinning on this lock, it may win the race |
1129 | * and cause alloc_contig_range() to fail... | 1134 | * and cause alloc_contig_range() to fail... |
1130 | */ | 1135 | */ |
1131 | spin_unlock_irqrestore(&z->lock, flags); | 1136 | spin_unlock_irqrestore(&zone->lock, flags); |
1132 | ret = __alloc_gigantic_page(pfn, nr_pages); | 1137 | ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); |
1133 | if (!ret) | 1138 | if (!ret) |
1134 | return pfn_to_page(pfn); | 1139 | return pfn_to_page(pfn); |
1135 | spin_lock_irqsave(&z->lock, flags); | 1140 | spin_lock_irqsave(&zone->lock, flags); |
1136 | } | 1141 | } |
1137 | pfn += nr_pages; | 1142 | pfn += nr_pages; |
1138 | } | 1143 | } |
1139 | 1144 | ||
1140 | spin_unlock_irqrestore(&z->lock, flags); | 1145 | spin_unlock_irqrestore(&zone->lock, flags); |
1141 | } | 1146 | } |
1142 | 1147 | ||
1143 | return NULL; | 1148 | return NULL; |
@@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) | |||
1150 | { | 1155 | { |
1151 | struct page *page; | 1156 | struct page *page; |
1152 | 1157 | ||
1153 | page = alloc_gigantic_page(nid, huge_page_order(h)); | 1158 | page = alloc_gigantic_page(nid, h); |
1154 | if (page) { | 1159 | if (page) { |
1155 | prep_compound_gigantic_page(page, huge_page_order(h)); | 1160 | prep_compound_gigantic_page(page, huge_page_order(h)); |
1156 | prep_new_huge_page(h, page, nid); | 1161 | prep_new_huge_page(h, page, nid); |
@@ -2569,13 +2574,13 @@ static struct attribute *hstate_attrs[] = { | |||
2569 | NULL, | 2574 | NULL, |
2570 | }; | 2575 | }; |
2571 | 2576 | ||
2572 | static struct attribute_group hstate_attr_group = { | 2577 | static const struct attribute_group hstate_attr_group = { |
2573 | .attrs = hstate_attrs, | 2578 | .attrs = hstate_attrs, |
2574 | }; | 2579 | }; |
2575 | 2580 | ||
2576 | static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, | 2581 | static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, |
2577 | struct kobject **hstate_kobjs, | 2582 | struct kobject **hstate_kobjs, |
2578 | struct attribute_group *hstate_attr_group) | 2583 | const struct attribute_group *hstate_attr_group) |
2579 | { | 2584 | { |
2580 | int retval; | 2585 | int retval; |
2581 | int hi = hstate_index(h); | 2586 | int hi = hstate_index(h); |
@@ -2633,7 +2638,7 @@ static struct attribute *per_node_hstate_attrs[] = { | |||
2633 | NULL, | 2638 | NULL, |
2634 | }; | 2639 | }; |
2635 | 2640 | ||
2636 | static struct attribute_group per_node_hstate_attr_group = { | 2641 | static const struct attribute_group per_node_hstate_attr_group = { |
2637 | .attrs = per_node_hstate_attrs, | 2642 | .attrs = per_node_hstate_attrs, |
2638 | }; | 2643 | }; |
2639 | 2644 | ||
@@ -4600,6 +4605,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, | |||
4600 | return pte; | 4605 | return pte; |
4601 | } | 4606 | } |
4602 | 4607 | ||
4608 | /* | ||
4609 | * huge_pte_offset() - Walk the page table to resolve the hugepage | ||
4610 | * entry at address @addr | ||
4611 | * | ||
4612 | * Return: Pointer to page table or swap entry (PUD or PMD) for | ||
4613 | * address @addr, or NULL if a p*d_none() entry is encountered and the | ||
4614 | * size @sz doesn't match the hugepage size at this level of the page | ||
4615 | * table. | ||
4616 | */ | ||
4603 | pte_t *huge_pte_offset(struct mm_struct *mm, | 4617 | pte_t *huge_pte_offset(struct mm_struct *mm, |
4604 | unsigned long addr, unsigned long sz) | 4618 | unsigned long addr, unsigned long sz) |
4605 | { | 4619 | { |
@@ -4614,13 +4628,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm, | |||
4614 | p4d = p4d_offset(pgd, addr); | 4628 | p4d = p4d_offset(pgd, addr); |
4615 | if (!p4d_present(*p4d)) | 4629 | if (!p4d_present(*p4d)) |
4616 | return NULL; | 4630 | return NULL; |
4631 | |||
4617 | pud = pud_offset(p4d, addr); | 4632 | pud = pud_offset(p4d, addr); |
4618 | if (!pud_present(*pud)) | 4633 | if (sz != PUD_SIZE && pud_none(*pud)) |
4619 | return NULL; | 4634 | return NULL; |
4620 | if (pud_huge(*pud)) | 4635 | /* hugepage or swap? */ |
4636 | if (pud_huge(*pud) || !pud_present(*pud)) | ||
4621 | return (pte_t *)pud; | 4637 | return (pte_t *)pud; |
4638 | |||
4622 | pmd = pmd_offset(pud, addr); | 4639 | pmd = pmd_offset(pud, addr); |
4623 | return (pte_t *) pmd; | 4640 | if (sz != PMD_SIZE && pmd_none(*pmd)) |
4641 | return NULL; | ||
4642 | /* hugepage or swap? */ | ||
4643 | if (pmd_huge(*pmd) || !pmd_present(*pmd)) | ||
4644 | return (pte_t *)pmd; | ||
4645 | |||
4646 | return NULL; | ||
4624 | } | 4647 | } |
4625 | 4648 | ||
4626 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | 4649 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
diff --git a/mm/internal.h b/mm/internal.h index 4ef49fc55e58..1df011f62480 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
480 | /* Mask to get the watermark bits */ | 480 | /* Mask to get the watermark bits */ |
481 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | 481 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) |
482 | 482 | ||
483 | /* | ||
484 | * Only MMU archs have async oom victim reclaim (aka oom_reaper), so we | ||
485 | * cannot assume a reduced access to memory reserves is sufficient for | ||
486 | * !MMU | ||
487 | */ | ||
488 | #ifdef CONFIG_MMU | ||
489 | #define ALLOC_OOM 0x08 | ||
490 | #else | ||
491 | #define ALLOC_OOM ALLOC_NO_WATERMARKS | ||
492 | #endif | ||
493 | |||
483 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 494 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
484 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 495 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
485 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 496 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
@@ -525,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) | |||
525 | return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; | 536 | return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; |
526 | } | 537 | } |
527 | 538 | ||
539 | void setup_zone_pageset(struct zone *zone); | ||
528 | #endif /* __MM_INTERNAL_H */ | 540 | #endif /* __MM_INTERNAL_H */ |
@@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = { | |||
3043 | NULL, | 3043 | NULL, |
3044 | }; | 3044 | }; |
3045 | 3045 | ||
3046 | static struct attribute_group ksm_attr_group = { | 3046 | static const struct attribute_group ksm_attr_group = { |
3047 | .attrs = ksm_attrs, | 3047 | .attrs = ksm_attrs, |
3048 | .name = "ksm", | 3048 | .name = "ksm", |
3049 | }; | 3049 | }; |
diff --git a/mm/madvise.c b/mm/madvise.c index 4d7d1e5ddba9..eea1c733286f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma, | |||
80 | } | 80 | } |
81 | new_flags &= ~VM_DONTCOPY; | 81 | new_flags &= ~VM_DONTCOPY; |
82 | break; | 82 | break; |
83 | case MADV_WIPEONFORK: | ||
84 | /* MADV_WIPEONFORK is only supported on anonymous memory. */ | ||
85 | if (vma->vm_file || vma->vm_flags & VM_SHARED) { | ||
86 | error = -EINVAL; | ||
87 | goto out; | ||
88 | } | ||
89 | new_flags |= VM_WIPEONFORK; | ||
90 | break; | ||
91 | case MADV_KEEPONFORK: | ||
92 | new_flags &= ~VM_WIPEONFORK; | ||
93 | break; | ||
83 | case MADV_DONTDUMP: | 94 | case MADV_DONTDUMP: |
84 | new_flags |= VM_DONTDUMP; | 95 | new_flags |= VM_DONTDUMP; |
85 | break; | 96 | break; |
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior) | |||
696 | #endif | 707 | #endif |
697 | case MADV_DONTDUMP: | 708 | case MADV_DONTDUMP: |
698 | case MADV_DODUMP: | 709 | case MADV_DODUMP: |
710 | case MADV_WIPEONFORK: | ||
711 | case MADV_KEEPONFORK: | ||
699 | #ifdef CONFIG_MEMORY_FAILURE | 712 | #ifdef CONFIG_MEMORY_FAILURE |
700 | case MADV_SOFT_OFFLINE: | 713 | case MADV_SOFT_OFFLINE: |
701 | case MADV_HWPOISON: | 714 | case MADV_HWPOISON: |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e09741af816f..ad15850ee157 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) | |||
550 | * value, and reading all cpu value can be performance bottleneck in some | 550 | * value, and reading all cpu value can be performance bottleneck in some |
551 | * common workload, threshold and synchronization as vmstat[] should be | 551 | * common workload, threshold and synchronization as vmstat[] should be |
552 | * implemented. | 552 | * implemented. |
553 | * | ||
554 | * The parameter idx can be of type enum memcg_event_item or vm_event_item. | ||
553 | */ | 555 | */ |
554 | 556 | ||
555 | static unsigned long memcg_sum_events(struct mem_cgroup *memcg, | 557 | static unsigned long memcg_sum_events(struct mem_cgroup *memcg, |
556 | enum memcg_event_item event) | 558 | int event) |
557 | { | 559 | { |
558 | unsigned long val = 0; | 560 | unsigned long val = 0; |
559 | int cpu; | 561 | int cpu; |
@@ -1915,7 +1917,7 @@ retry: | |||
1915 | * bypass the last charges so that they can exit quickly and | 1917 | * bypass the last charges so that they can exit quickly and |
1916 | * free their memory. | 1918 | * free their memory. |
1917 | */ | 1919 | */ |
1918 | if (unlikely(test_thread_flag(TIF_MEMDIE) || | 1920 | if (unlikely(tsk_is_oom_victim(current) || |
1919 | fatal_signal_pending(current) || | 1921 | fatal_signal_pending(current) || |
1920 | current->flags & PF_EXITING)) | 1922 | current->flags & PF_EXITING)) |
1921 | goto force; | 1923 | goto force; |
@@ -4319,6 +4321,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
4319 | } | 4321 | } |
4320 | spin_unlock(&memcg->event_list_lock); | 4322 | spin_unlock(&memcg->event_list_lock); |
4321 | 4323 | ||
4324 | memcg->low = 0; | ||
4325 | |||
4322 | memcg_offline_kmem(memcg); | 4326 | memcg_offline_kmem(memcg); |
4323 | wb_memcg_offline(memcg); | 4327 | wb_memcg_offline(memcg); |
4324 | 4328 | ||
@@ -4635,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
4635 | if (!ret || !target) | 4639 | if (!ret || !target) |
4636 | put_page(page); | 4640 | put_page(page); |
4637 | } | 4641 | } |
4638 | /* There is a swap entry and a page doesn't exist or isn't charged */ | 4642 | /* |
4639 | if (ent.val && !ret && | 4643 | * There is a swap entry and a page doesn't exist or isn't charged. |
4644 | * But we cannot move a tail-page in a THP. | ||
4645 | */ | ||
4646 | if (ent.val && !ret && (!page || !PageTransCompound(page)) && | ||
4640 | mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { | 4647 | mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { |
4641 | ret = MC_TARGET_SWAP; | 4648 | ret = MC_TARGET_SWAP; |
4642 | if (target) | 4649 | if (target) |
@@ -4647,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
4647 | 4654 | ||
4648 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 4655 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
4649 | /* | 4656 | /* |
4650 | * We don't consider swapping or file mapped pages because THP does not | 4657 | * We don't consider PMD mapped swapping or file mapped pages because THP does |
4651 | * support them for now. | 4658 | * not support them for now. |
4652 | * Caller should make sure that pmd_trans_huge(pmd) is true. | 4659 | * Caller should make sure that pmd_trans_huge(pmd) is true. |
4653 | */ | 4660 | */ |
4654 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | 4661 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, |
@@ -5423,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
5423 | * in turn serializes uncharging. | 5430 | * in turn serializes uncharging. |
5424 | */ | 5431 | */ |
5425 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 5432 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
5426 | if (page->mem_cgroup) | 5433 | if (compound_head(page)->mem_cgroup) |
5427 | goto out; | 5434 | goto out; |
5428 | 5435 | ||
5429 | if (do_swap_account) { | 5436 | if (do_swap_account) { |
@@ -5906,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) | |||
5906 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | 5913 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) |
5907 | { | 5914 | { |
5908 | struct mem_cgroup *memcg, *swap_memcg; | 5915 | struct mem_cgroup *memcg, *swap_memcg; |
5916 | unsigned int nr_entries; | ||
5909 | unsigned short oldid; | 5917 | unsigned short oldid; |
5910 | 5918 | ||
5911 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5919 | VM_BUG_ON_PAGE(PageLRU(page), page); |
@@ -5926,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | |||
5926 | * ancestor for the swap instead and transfer the memory+swap charge. | 5934 | * ancestor for the swap instead and transfer the memory+swap charge. |
5927 | */ | 5935 | */ |
5928 | swap_memcg = mem_cgroup_id_get_online(memcg); | 5936 | swap_memcg = mem_cgroup_id_get_online(memcg); |
5929 | oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); | 5937 | nr_entries = hpage_nr_pages(page); |
5938 | /* Get references for the tail pages, too */ | ||
5939 | if (nr_entries > 1) | ||
5940 | mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); | ||
5941 | oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), | ||
5942 | nr_entries); | ||
5930 | VM_BUG_ON_PAGE(oldid, page); | 5943 | VM_BUG_ON_PAGE(oldid, page); |
5931 | mem_cgroup_swap_statistics(swap_memcg, 1); | 5944 | mem_cgroup_swap_statistics(swap_memcg, nr_entries); |
5932 | 5945 | ||
5933 | page->mem_cgroup = NULL; | 5946 | page->mem_cgroup = NULL; |
5934 | 5947 | ||
5935 | if (!mem_cgroup_is_root(memcg)) | 5948 | if (!mem_cgroup_is_root(memcg)) |
5936 | page_counter_uncharge(&memcg->memory, 1); | 5949 | page_counter_uncharge(&memcg->memory, nr_entries); |
5937 | 5950 | ||
5938 | if (memcg != swap_memcg) { | 5951 | if (memcg != swap_memcg) { |
5939 | if (!mem_cgroup_is_root(swap_memcg)) | 5952 | if (!mem_cgroup_is_root(swap_memcg)) |
5940 | page_counter_charge(&swap_memcg->memsw, 1); | 5953 | page_counter_charge(&swap_memcg->memsw, nr_entries); |
5941 | page_counter_uncharge(&memcg->memsw, 1); | 5954 | page_counter_uncharge(&memcg->memsw, nr_entries); |
5942 | } | 5955 | } |
5943 | 5956 | ||
5944 | /* | 5957 | /* |
@@ -5948,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | |||
5948 | * only synchronisation we have for udpating the per-CPU variables. | 5961 | * only synchronisation we have for udpating the per-CPU variables. |
5949 | */ | 5962 | */ |
5950 | VM_BUG_ON(!irqs_disabled()); | 5963 | VM_BUG_ON(!irqs_disabled()); |
5951 | mem_cgroup_charge_statistics(memcg, page, false, -1); | 5964 | mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), |
5965 | -nr_entries); | ||
5952 | memcg_check_events(memcg, page); | 5966 | memcg_check_events(memcg, page); |
5953 | 5967 | ||
5954 | if (!mem_cgroup_is_root(memcg)) | 5968 | if (!mem_cgroup_is_root(memcg)) |
diff --git a/mm/memory.c b/mm/memory.c index 56e48e4593cb..13ee83b43878 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1513,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, | |||
1513 | tlb_gather_mmu(&tlb, mm, start, end); | 1513 | tlb_gather_mmu(&tlb, mm, start, end); |
1514 | update_hiwater_rss(mm); | 1514 | update_hiwater_rss(mm); |
1515 | mmu_notifier_invalidate_range_start(mm, start, end); | 1515 | mmu_notifier_invalidate_range_start(mm, start, end); |
1516 | for ( ; vma && vma->vm_start < end; vma = vma->vm_next) | 1516 | for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { |
1517 | unmap_single_vma(&tlb, vma, start, end, NULL); | 1517 | unmap_single_vma(&tlb, vma, start, end, NULL); |
1518 | |||
1519 | /* | ||
1520 | * zap_page_range does not specify whether mmap_sem should be | ||
1521 | * held for read or write. That allows parallel zap_page_range | ||
1522 | * operations to unmap a PTE and defer a flush meaning that | ||
1523 | * this call observes pte_none and fails to flush the TLB. | ||
1524 | * Rather than adding a complex API, ensure that no stale | ||
1525 | * TLB entries exist when this call returns. | ||
1526 | */ | ||
1527 | flush_tlb_range(vma, start, end); | ||
1528 | } | ||
1529 | |||
1518 | mmu_notifier_invalidate_range_end(mm, start, end); | 1530 | mmu_notifier_invalidate_range_end(mm, start, end); |
1519 | tlb_finish_mmu(&tlb, start, end); | 1531 | tlb_finish_mmu(&tlb, start, end); |
1520 | } | 1532 | } |
@@ -1676,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1676 | EXPORT_SYMBOL(vm_insert_page); | 1688 | EXPORT_SYMBOL(vm_insert_page); |
1677 | 1689 | ||
1678 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1690 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1679 | pfn_t pfn, pgprot_t prot) | 1691 | pfn_t pfn, pgprot_t prot, bool mkwrite) |
1680 | { | 1692 | { |
1681 | struct mm_struct *mm = vma->vm_mm; | 1693 | struct mm_struct *mm = vma->vm_mm; |
1682 | int retval; | 1694 | int retval; |
@@ -1688,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1688 | if (!pte) | 1700 | if (!pte) |
1689 | goto out; | 1701 | goto out; |
1690 | retval = -EBUSY; | 1702 | retval = -EBUSY; |
1691 | if (!pte_none(*pte)) | 1703 | if (!pte_none(*pte)) { |
1692 | goto out_unlock; | 1704 | if (mkwrite) { |
1705 | /* | ||
1706 | * For read faults on private mappings the PFN passed | ||
1707 | * in may not match the PFN we have mapped if the | ||
1708 | * mapped PFN is a writeable COW page. In the mkwrite | ||
1709 | * case we are creating a writable PTE for a shared | ||
1710 | * mapping and we expect the PFNs to match. | ||
1711 | */ | ||
1712 | if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) | ||
1713 | goto out_unlock; | ||
1714 | entry = *pte; | ||
1715 | goto out_mkwrite; | ||
1716 | } else | ||
1717 | goto out_unlock; | ||
1718 | } | ||
1693 | 1719 | ||
1694 | /* Ok, finally just insert the thing.. */ | 1720 | /* Ok, finally just insert the thing.. */ |
1695 | if (pfn_t_devmap(pfn)) | 1721 | if (pfn_t_devmap(pfn)) |
1696 | entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); | 1722 | entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); |
1697 | else | 1723 | else |
1698 | entry = pte_mkspecial(pfn_t_pte(pfn, prot)); | 1724 | entry = pte_mkspecial(pfn_t_pte(pfn, prot)); |
1725 | |||
1726 | out_mkwrite: | ||
1727 | if (mkwrite) { | ||
1728 | entry = pte_mkyoung(entry); | ||
1729 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1730 | } | ||
1731 | |||
1699 | set_pte_at(mm, addr, pte, entry); | 1732 | set_pte_at(mm, addr, pte, entry); |
1700 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ | 1733 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
1701 | 1734 | ||
@@ -1766,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, | |||
1766 | 1799 | ||
1767 | track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); | 1800 | track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); |
1768 | 1801 | ||
1769 | ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); | 1802 | ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, |
1803 | false); | ||
1770 | 1804 | ||
1771 | return ret; | 1805 | return ret; |
1772 | } | 1806 | } |
1773 | EXPORT_SYMBOL(vm_insert_pfn_prot); | 1807 | EXPORT_SYMBOL(vm_insert_pfn_prot); |
1774 | 1808 | ||
1775 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1809 | static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
1776 | pfn_t pfn) | 1810 | pfn_t pfn, bool mkwrite) |
1777 | { | 1811 | { |
1778 | pgprot_t pgprot = vma->vm_page_prot; | 1812 | pgprot_t pgprot = vma->vm_page_prot; |
1779 | 1813 | ||
@@ -1802,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | |||
1802 | page = pfn_to_page(pfn_t_to_pfn(pfn)); | 1836 | page = pfn_to_page(pfn_t_to_pfn(pfn)); |
1803 | return insert_page(vma, addr, page, pgprot); | 1837 | return insert_page(vma, addr, page, pgprot); |
1804 | } | 1838 | } |
1805 | return insert_pfn(vma, addr, pfn, pgprot); | 1839 | return insert_pfn(vma, addr, pfn, pgprot, mkwrite); |
1840 | } | ||
1841 | |||
1842 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | ||
1843 | pfn_t pfn) | ||
1844 | { | ||
1845 | return __vm_insert_mixed(vma, addr, pfn, false); | ||
1846 | |||
1806 | } | 1847 | } |
1807 | EXPORT_SYMBOL(vm_insert_mixed); | 1848 | EXPORT_SYMBOL(vm_insert_mixed); |
1808 | 1849 | ||
1850 | int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, | ||
1851 | pfn_t pfn) | ||
1852 | { | ||
1853 | return __vm_insert_mixed(vma, addr, pfn, true); | ||
1854 | } | ||
1855 | EXPORT_SYMBOL(vm_insert_mixed_mkwrite); | ||
1856 | |||
1809 | /* | 1857 | /* |
1810 | * maps a range of physical memory into the requested pages. the old | 1858 | * maps a range of physical memory into the requested pages. the old |
1811 | * mappings are removed. any references to nonexistent pages results | 1859 | * mappings are removed. any references to nonexistent pages results |
@@ -2571,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf) | |||
2571 | * not dirty accountable. | 2619 | * not dirty accountable. |
2572 | */ | 2620 | */ |
2573 | if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { | 2621 | if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { |
2574 | int total_mapcount; | 2622 | int total_map_swapcount; |
2575 | if (!trylock_page(vmf->page)) { | 2623 | if (!trylock_page(vmf->page)) { |
2576 | get_page(vmf->page); | 2624 | get_page(vmf->page); |
2577 | pte_unmap_unlock(vmf->pte, vmf->ptl); | 2625 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
@@ -2586,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf) | |||
2586 | } | 2634 | } |
2587 | put_page(vmf->page); | 2635 | put_page(vmf->page); |
2588 | } | 2636 | } |
2589 | if (reuse_swap_page(vmf->page, &total_mapcount)) { | 2637 | if (reuse_swap_page(vmf->page, &total_map_swapcount)) { |
2590 | if (total_mapcount == 1) { | 2638 | if (total_map_swapcount == 1) { |
2591 | /* | 2639 | /* |
2592 | * The page is all ours. Move it to | 2640 | * The page is all ours. Move it to |
2593 | * our anon_vma so the rmap code will | 2641 | * our anon_vma so the rmap code will |
@@ -2704,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2704 | int do_swap_page(struct vm_fault *vmf) | 2752 | int do_swap_page(struct vm_fault *vmf) |
2705 | { | 2753 | { |
2706 | struct vm_area_struct *vma = vmf->vma; | 2754 | struct vm_area_struct *vma = vmf->vma; |
2707 | struct page *page, *swapcache; | 2755 | struct page *page = NULL, *swapcache; |
2708 | struct mem_cgroup *memcg; | 2756 | struct mem_cgroup *memcg; |
2757 | struct vma_swap_readahead swap_ra; | ||
2709 | swp_entry_t entry; | 2758 | swp_entry_t entry; |
2710 | pte_t pte; | 2759 | pte_t pte; |
2711 | int locked; | 2760 | int locked; |
2712 | int exclusive = 0; | 2761 | int exclusive = 0; |
2713 | int ret = 0; | 2762 | int ret = 0; |
2763 | bool vma_readahead = swap_use_vma_readahead(); | ||
2714 | 2764 | ||
2715 | if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) | 2765 | if (vma_readahead) |
2766 | page = swap_readahead_detect(vmf, &swap_ra); | ||
2767 | if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { | ||
2768 | if (page) | ||
2769 | put_page(page); | ||
2716 | goto out; | 2770 | goto out; |
2771 | } | ||
2717 | 2772 | ||
2718 | entry = pte_to_swp_entry(vmf->orig_pte); | 2773 | entry = pte_to_swp_entry(vmf->orig_pte); |
2719 | if (unlikely(non_swap_entry(entry))) { | 2774 | if (unlikely(non_swap_entry(entry))) { |
@@ -2729,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf) | |||
2729 | goto out; | 2784 | goto out; |
2730 | } | 2785 | } |
2731 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2786 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2732 | page = lookup_swap_cache(entry); | 2787 | if (!page) |
2788 | page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, | ||
2789 | vmf->address); | ||
2733 | if (!page) { | 2790 | if (!page) { |
2734 | page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, | 2791 | if (vma_readahead) |
2735 | vmf->address); | 2792 | page = do_swap_page_readahead(entry, |
2793 | GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); | ||
2794 | else | ||
2795 | page = swapin_readahead(entry, | ||
2796 | GFP_HIGHUSER_MOVABLE, vma, vmf->address); | ||
2736 | if (!page) { | 2797 | if (!page) { |
2737 | /* | 2798 | /* |
2738 | * Back out if somebody else faulted in this pte | 2799 | * Back out if somebody else faulted in this pte |
@@ -4356,19 +4417,53 @@ static void clear_gigantic_page(struct page *page, | |||
4356 | } | 4417 | } |
4357 | } | 4418 | } |
4358 | void clear_huge_page(struct page *page, | 4419 | void clear_huge_page(struct page *page, |
4359 | unsigned long addr, unsigned int pages_per_huge_page) | 4420 | unsigned long addr_hint, unsigned int pages_per_huge_page) |
4360 | { | 4421 | { |
4361 | int i; | 4422 | int i, n, base, l; |
4423 | unsigned long addr = addr_hint & | ||
4424 | ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); | ||
4362 | 4425 | ||
4363 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | 4426 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { |
4364 | clear_gigantic_page(page, addr, pages_per_huge_page); | 4427 | clear_gigantic_page(page, addr, pages_per_huge_page); |
4365 | return; | 4428 | return; |
4366 | } | 4429 | } |
4367 | 4430 | ||
4431 | /* Clear the sub-page to be accessed last, to keep its cache lines hot */ | ||
4368 | might_sleep(); | 4432 | might_sleep(); |
4369 | for (i = 0; i < pages_per_huge_page; i++) { | 4433 | n = (addr_hint - addr) / PAGE_SIZE; |
4434 | if (2 * n <= pages_per_huge_page) { | ||
4435 | /* If the sub-page to access is in the first half of the huge page */ | ||
4436 | base = 0; | ||
4437 | l = n; | ||
4438 | /* Clear sub-pages at the end of the huge page */ | ||
4439 | for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { | ||
4440 | cond_resched(); | ||
4441 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
4442 | } | ||
4443 | } else { | ||
4444 | /* If the sub-page to access is in the second half of the huge page */ | ||
4445 | base = pages_per_huge_page - 2 * (pages_per_huge_page - n); | ||
4446 | l = pages_per_huge_page - n; | ||
4447 | /* Clear sub-pages at the beginning of the huge page */ | ||
4448 | for (i = 0; i < base; i++) { | ||
4449 | cond_resched(); | ||
4450 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
4451 | } | ||
4452 | } | ||
4453 | /* | ||
4454 | * Clear remaining sub-pages in left-right-left-right pattern | ||
4455 | * towards the sub-page to access | ||
4456 | */ | ||
4457 | for (i = 0; i < l; i++) { | ||
4458 | int left_idx = base + i; | ||
4459 | int right_idx = base + 2 * l - 1 - i; | ||
4460 | |||
4461 | cond_resched(); | ||
4462 | clear_user_highpage(page + left_idx, | ||
4463 | addr + left_idx * PAGE_SIZE); | ||
4370 | cond_resched(); | 4464 | cond_resched(); |
4371 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | 4465 | clear_user_highpage(page + right_idx, |
4466 | addr + right_idx * PAGE_SIZE); | ||
4372 | } | 4467 | } |
4373 | } | 4468 | } |
4374 | 4469 | ||
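The intent of the hunk above is easier to see in isolation: the sub-page that triggered the fault is cleared last, and its neighbours are cleared while converging on it, so its cache lines are the most recently touched when the faulting access resumes. A standalone sketch of the same index ordering (illustrative only, not kernel code):

#include <stdio.h>

/* Print the order in which sub-pages would be cleared when sub-page n
 * of a huge page made of "pages" sub-pages is the one being accessed. */
static void show_clear_order(int pages, int n)
{
    int i, base, l;

    if (2 * n <= pages) {        /* target in the first half */
        base = 0;
        l = n;
        for (i = pages - 1; i >= 2 * n; i--)
            printf("%d ", i);    /* tail sub-pages first */
    } else {                     /* target in the second half */
        base = pages - 2 * (pages - n);
        l = pages - n;
        for (i = 0; i < base; i++)
            printf("%d ", i);    /* head sub-pages first */
    }
    for (i = 0; i < l; i++)      /* converge on sub-page n, clearing it last */
        printf("%d %d ", base + i, base + 2 * l - 1 - i);
    printf("\n");
}

int main(void)
{
    show_clear_order(8, 2);      /* prints: 7 6 5 4 0 3 1 2 */
    return 0;
}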
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8dccc317aac2..73bf17df6899 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
773 | node_set_state(node, N_MEMORY); | 773 | node_set_state(node, N_MEMORY); |
774 | } | 774 | } |
775 | 775 | ||
776 | bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) | ||
777 | { | ||
778 | struct pglist_data *pgdat = NODE_DATA(nid); | ||
779 | struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; | ||
780 | struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); | ||
781 | |||
782 | /* | ||
783 | * TODO there shouldn't be any inherent reason to have ZONE_NORMAL | ||
784 | * physically before ZONE_MOVABLE. All we need is they do not | ||
785 | * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE | ||
786 | * though so let's stick with it for simplicity for now. | ||
787 | * TODO make sure we do not overlap with ZONE_DEVICE | ||
788 | */ | ||
789 | if (online_type == MMOP_ONLINE_KERNEL) { | ||
790 | if (zone_is_empty(movable_zone)) | ||
791 | return true; | ||
792 | return movable_zone->zone_start_pfn >= pfn + nr_pages; | ||
793 | } else if (online_type == MMOP_ONLINE_MOVABLE) { | ||
794 | return zone_end_pfn(default_zone) <= pfn; | ||
795 | } | ||
796 | |||
797 | /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ | ||
798 | return online_type == MMOP_ONLINE_KEEP; | ||
799 | } | ||
800 | |||
801 | static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, | 776 | static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, |
802 | unsigned long nr_pages) | 777 | unsigned long nr_pages) |
803 | { | 778 | { |
@@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, | |||
856 | * If no kernel zone covers this pfn range it will automatically go | 831 | * If no kernel zone covers this pfn range it will automatically go |
857 | * to the ZONE_NORMAL. | 832 | * to the ZONE_NORMAL. |
858 | */ | 833 | */ |
859 | struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, | 834 | static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, |
860 | unsigned long nr_pages) | 835 | unsigned long nr_pages) |
861 | { | 836 | { |
862 | struct pglist_data *pgdat = NODE_DATA(nid); | 837 | struct pglist_data *pgdat = NODE_DATA(nid); |
@@ -872,17 +847,40 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, | |||
872 | return &pgdat->node_zones[ZONE_NORMAL]; | 847 | return &pgdat->node_zones[ZONE_NORMAL]; |
873 | } | 848 | } |
874 | 849 | ||
875 | static inline bool movable_pfn_range(int nid, struct zone *default_zone, | 850 | static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, |
876 | unsigned long start_pfn, unsigned long nr_pages) | 851 | unsigned long nr_pages) |
877 | { | 852 | { |
878 | if (!allow_online_pfn_range(nid, start_pfn, nr_pages, | 853 | struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, |
879 | MMOP_ONLINE_KERNEL)) | 854 | nr_pages); |
880 | return true; | 855 | struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; |
856 | bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); | ||
857 | bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); | ||
881 | 858 | ||
882 | if (!movable_node_is_enabled()) | 859 | /* |
883 | return false; | 860 | * We inherit the existing zone in a simple case where zones do not |
861 | * overlap in the given range | ||
862 | */ | ||
863 | if (in_kernel ^ in_movable) | ||
864 | return (in_kernel) ? kernel_zone : movable_zone; | ||
884 | 865 | ||
885 | return !zone_intersects(default_zone, start_pfn, nr_pages); | 866 | /* |
867 | * If the range doesn't belong to any zone or two zones overlap in the | ||
868 | * given range then we use movable zone only if movable_node is | ||
869 | * enabled because we always online to a kernel zone by default. | ||
870 | */ | ||
871 | return movable_node_enabled ? movable_zone : kernel_zone; | ||
872 | } | ||
873 | |||
874 | struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, | ||
875 | unsigned long nr_pages) | ||
876 | { | ||
877 | if (online_type == MMOP_ONLINE_KERNEL) | ||
878 | return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); | ||
879 | |||
880 | if (online_type == MMOP_ONLINE_MOVABLE) | ||
881 | return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; | ||
882 | |||
883 | return default_zone_for_pfn(nid, start_pfn, nr_pages); | ||
886 | } | 884 | } |
887 | 885 | ||
888 | /* | 886 | /* |
@@ -892,28 +890,14 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone, | |||
892 | static struct zone * __meminit move_pfn_range(int online_type, int nid, | 890 | static struct zone * __meminit move_pfn_range(int online_type, int nid, |
893 | unsigned long start_pfn, unsigned long nr_pages) | 891 | unsigned long start_pfn, unsigned long nr_pages) |
894 | { | 892 | { |
895 | struct pglist_data *pgdat = NODE_DATA(nid); | 893 | struct zone *zone; |
896 | struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); | ||
897 | |||
898 | if (online_type == MMOP_ONLINE_KEEP) { | ||
899 | struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; | ||
900 | /* | ||
901 | * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use | ||
902 | * movable zone if that is not possible (e.g. we are within | ||
903 | * or past the existing movable zone). movable_node overrides | ||
904 | * this default and defaults to movable zone | ||
905 | */ | ||
906 | if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) | ||
907 | zone = movable_zone; | ||
908 | } else if (online_type == MMOP_ONLINE_MOVABLE) { | ||
909 | zone = &pgdat->node_zones[ZONE_MOVABLE]; | ||
910 | } | ||
911 | 894 | ||
895 | zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); | ||
912 | move_pfn_range_to_zone(zone, start_pfn, nr_pages); | 896 | move_pfn_range_to_zone(zone, start_pfn, nr_pages); |
913 | return zone; | 897 | return zone; |
914 | } | 898 | } |
915 | 899 | ||
916 | /* Must be protected by mem_hotplug_begin() */ | 900 | /* Must be protected by mem_hotplug_begin() or a device_lock */ |
917 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | 901 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) |
918 | { | 902 | { |
919 | unsigned long flags; | 903 | unsigned long flags; |
@@ -925,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
925 | struct memory_notify arg; | 909 | struct memory_notify arg; |
926 | 910 | ||
927 | nid = pfn_to_nid(pfn); | 911 | nid = pfn_to_nid(pfn); |
928 | if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) | ||
929 | return -EINVAL; | ||
930 | |||
931 | /* associate pfn range with the zone */ | 912 | /* associate pfn range with the zone */ |
932 | zone = move_pfn_range(online_type, nid, pfn, nr_pages); | 913 | zone = move_pfn_range(online_type, nid, pfn, nr_pages); |
933 | 914 | ||
@@ -945,10 +926,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
945 | * This means the page allocator ignores this zone. | 926 | * This means the page allocator ignores this zone. |
946 | * So, zonelist must be updated after online. | 927 | * So, zonelist must be updated after online. |
947 | */ | 928 | */ |
948 | mutex_lock(&zonelists_mutex); | ||
949 | if (!populated_zone(zone)) { | 929 | if (!populated_zone(zone)) { |
950 | need_zonelists_rebuild = 1; | 930 | need_zonelists_rebuild = 1; |
951 | build_all_zonelists(NULL, zone); | 931 | setup_zone_pageset(zone); |
952 | } | 932 | } |
953 | 933 | ||
954 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 934 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
@@ -956,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
956 | if (ret) { | 936 | if (ret) { |
957 | if (need_zonelists_rebuild) | 937 | if (need_zonelists_rebuild) |
958 | zone_pcp_reset(zone); | 938 | zone_pcp_reset(zone); |
959 | mutex_unlock(&zonelists_mutex); | ||
960 | goto failed_addition; | 939 | goto failed_addition; |
961 | } | 940 | } |
962 | 941 | ||
@@ -969,13 +948,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
969 | if (onlined_pages) { | 948 | if (onlined_pages) { |
970 | node_states_set_node(nid, &arg); | 949 | node_states_set_node(nid, &arg); |
971 | if (need_zonelists_rebuild) | 950 | if (need_zonelists_rebuild) |
972 | build_all_zonelists(NULL, NULL); | 951 | build_all_zonelists(NULL); |
973 | else | 952 | else |
974 | zone_pcp_update(zone); | 953 | zone_pcp_update(zone); |
975 | } | 954 | } |
976 | 955 | ||
977 | mutex_unlock(&zonelists_mutex); | ||
978 | |||
979 | init_per_zone_wmark_min(); | 956 | init_per_zone_wmark_min(); |
980 | 957 | ||
981 | if (onlined_pages) { | 958 | if (onlined_pages) { |
@@ -1046,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
1046 | * The node we allocated has no zone fallback lists. For avoiding | 1023 | * The node we allocated has no zone fallback lists. For avoiding |
1047 | * to access not-initialized zonelist, build here. | 1024 | * to access not-initialized zonelist, build here. |
1048 | */ | 1025 | */ |
1049 | mutex_lock(&zonelists_mutex); | 1026 | build_all_zonelists(pgdat); |
1050 | build_all_zonelists(pgdat, NULL); | ||
1051 | mutex_unlock(&zonelists_mutex); | ||
1052 | 1027 | ||
1053 | /* | 1028 | /* |
1054 | * zone->managed_pages is set to an approximate value in | 1029 | * zone->managed_pages is set to an approximate value in |
@@ -1100,13 +1075,6 @@ int try_online_node(int nid) | |||
1100 | node_set_online(nid); | 1075 | node_set_online(nid); |
1101 | ret = register_one_node(nid); | 1076 | ret = register_one_node(nid); |
1102 | BUG_ON(ret); | 1077 | BUG_ON(ret); |
1103 | |||
1104 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { | ||
1105 | mutex_lock(&zonelists_mutex); | ||
1106 | build_all_zonelists(NULL, NULL); | ||
1107 | mutex_unlock(&zonelists_mutex); | ||
1108 | } | ||
1109 | |||
1110 | out: | 1078 | out: |
1111 | mem_hotplug_done(); | 1079 | mem_hotplug_done(); |
1112 | return ret; | 1080 | return ret; |
@@ -1722,9 +1690,7 @@ repeat: | |||
1722 | 1690 | ||
1723 | if (!populated_zone(zone)) { | 1691 | if (!populated_zone(zone)) { |
1724 | zone_pcp_reset(zone); | 1692 | zone_pcp_reset(zone); |
1725 | mutex_lock(&zonelists_mutex); | 1693 | build_all_zonelists(NULL); |
1726 | build_all_zonelists(NULL, NULL); | ||
1727 | mutex_unlock(&zonelists_mutex); | ||
1728 | } else | 1694 | } else |
1729 | zone_pcp_update(zone); | 1695 | zone_pcp_update(zone); |
1730 | 1696 | ||
@@ -1750,7 +1716,7 @@ failed_removal: | |||
1750 | return ret; | 1716 | return ret; |
1751 | } | 1717 | } |
1752 | 1718 | ||
1753 | /* Must be protected by mem_hotplug_begin() */ | 1719 | /* Must be protected by mem_hotplug_begin() or a device_lock */ |
1754 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | 1720 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) |
1755 | { | 1721 | { |
1756 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1722 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); |
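With allow_online_pfn_range() gone, the requested zone is resolved in one place (zone_for_pfn_range()) and onlining no longer fails because of zone ordering. From userspace the online_type still comes from the memory block's sysfs state file; a hedged sketch of driving it from C (the block id is arbitrary, and the sketch assumes the standard /sys/devices/system/memory layout):

#include <stdio.h>

/* Ask the kernel to online memory block "block" into ZONE_MOVABLE.
 * Writing "online" instead selects MMOP_ONLINE_KEEP (default zone),
 * and "online_kernel" selects MMOP_ONLINE_KERNEL. */
static int online_block_movable(int block)
{
    char path[128];
    FILE *f;

    snprintf(path, sizeof(path),
             "/sys/devices/system/memory/memory%d/state", block);
    f = fopen(path, "w");
    if (!f)
        return -1;
    if (fputs("online_movable", f) == EOF) {
        fclose(f);
        return -1;
    }
    return fclose(f);    /* the write is flushed (and its result checked) here */
}

int main(void)
{
    return online_block_movable(32) ? 1 : 0;    /* block id chosen arbitrarily */
}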
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/userfaultfd_k.h> | 44 | #include <linux/userfaultfd_k.h> |
45 | #include <linux/moduleparam.h> | 45 | #include <linux/moduleparam.h> |
46 | #include <linux/pkeys.h> | 46 | #include <linux/pkeys.h> |
47 | #include <linux/oom.h> | ||
47 | 48 | ||
48 | #include <linux/uaccess.h> | 49 | #include <linux/uaccess.h> |
49 | #include <asm/cacheflush.h> | 50 | #include <asm/cacheflush.h> |
@@ -2639,13 +2640,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, | |||
2639 | if (vma->vm_start >= end) | 2640 | if (vma->vm_start >= end) |
2640 | return 0; | 2641 | return 0; |
2641 | 2642 | ||
2642 | if (uf) { | ||
2643 | int error = userfaultfd_unmap_prep(vma, start, end, uf); | ||
2644 | |||
2645 | if (error) | ||
2646 | return error; | ||
2647 | } | ||
2648 | |||
2649 | /* | 2643 | /* |
2650 | * If we need to split any vma, do it now to save pain later. | 2644 | * If we need to split any vma, do it now to save pain later. |
2651 | * | 2645 | * |
@@ -2679,6 +2673,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, | |||
2679 | } | 2673 | } |
2680 | vma = prev ? prev->vm_next : mm->mmap; | 2674 | vma = prev ? prev->vm_next : mm->mmap; |
2681 | 2675 | ||
2676 | if (unlikely(uf)) { | ||
2677 | /* | ||
2678 | * If userfaultfd_unmap_prep returns an error the vmas | ||
2679 | * will remain splitted, but userland will get a | ||
2680 | * highly unexpected error anyway. This is no | ||
2681 | * different than the case where the first of the two | ||
2682 | * __split_vma fails, but we don't undo the first | ||
2683 | * split, despite we could. This is unlikely enough | ||
2684 | * failure that it's not worth optimizing it for. | ||
2685 | */ | ||
2686 | int error = userfaultfd_unmap_prep(vma, start, end, uf); | ||
2687 | if (error) | ||
2688 | return error; | ||
2689 | } | ||
2690 | |||
2682 | /* | 2691 | /* |
2683 | * unlock any mlock()ed ranges before detaching vmas | 2692 | * unlock any mlock()ed ranges before detaching vmas |
2684 | */ | 2693 | */ |
@@ -2993,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm) | |||
2993 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 3002 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2994 | unmap_vmas(&tlb, vma, 0, -1); | 3003 | unmap_vmas(&tlb, vma, 0, -1); |
2995 | 3004 | ||
3005 | set_bit(MMF_OOM_SKIP, &mm->flags); | ||
3006 | if (unlikely(tsk_is_oom_victim(current))) { | ||
3007 | /* | ||
3008 | * Wait for oom_reap_task() to stop working on this | ||
3009 | * mm. Because MMF_OOM_SKIP is already set before | ||
3010 | * calling down_read(), oom_reap_task() will not run | ||
3011 | * on this "mm" post up_write(). | ||
3012 | * | ||
3013 | * tsk_is_oom_victim() cannot be set from under us | ||
3014 | * either because current->mm is already set to NULL | ||
3015 | * under task_lock before calling mmput and oom_mm is | ||
3016 | * set not NULL by the OOM killer only if current->mm | ||
3017 | * is found not NULL while holding the task_lock. | ||
3018 | */ | ||
3019 | down_write(&mm->mmap_sem); | ||
3020 | up_write(&mm->mmap_sem); | ||
3021 | } | ||
2996 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); | 3022 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); |
2997 | tlb_finish_mmu(&tlb, 0, -1); | 3023 | tlb_finish_mmu(&tlb, 0, -1); |
2998 | 3024 | ||
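The empty down_write(); up_write() pair is a common kernel idiom: taking the lock exclusively and immediately dropping it acts as a barrier that waits out every reader that started before MMF_OOM_SKIP was set. A userspace analogue with POSIX rwlocks (purely illustrative, not the kernel's code; link with -lpthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int skip_flag;          /* plays the role of MMF_OOM_SKIP */

static void *reaper(void *arg)
{
    (void)arg;
    pthread_rwlock_rdlock(&lock);     /* like down_read(&mm->mmap_sem) */
    if (atomic_load(&skip_flag))
        puts("reaper: skip flag set, backing off");
    else
        puts("reaper: still allowed to work on the mm");
    usleep(300 * 1000);               /* pretend to reap for a while */
    pthread_rwlock_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, reaper, NULL);
    usleep(100 * 1000);               /* let the reaper grab the read lock */

    atomic_store(&skip_flag, 1);      /* set the flag first ... */
    pthread_rwlock_wrlock(&lock);     /* ... then wait out any older reader */
    pthread_rwlock_unlock(&lock);
    puts("exit path: no reaper touches the mm past this point");

    pthread_join(t, NULL);
    return 0;
}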
@@ -3514,7 +3540,7 @@ static int init_user_reserve(void) | |||
3514 | { | 3540 | { |
3515 | unsigned long free_kbytes; | 3541 | unsigned long free_kbytes; |
3516 | 3542 | ||
3517 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3543 | free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3518 | 3544 | ||
3519 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | 3545 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); |
3520 | return 0; | 3546 | return 0; |
@@ -3535,7 +3561,7 @@ static int init_admin_reserve(void) | |||
3535 | { | 3561 | { |
3536 | unsigned long free_kbytes; | 3562 | unsigned long free_kbytes; |
3537 | 3563 | ||
3538 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3564 | free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3539 | 3565 | ||
3540 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | 3566 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); |
3541 | return 0; | 3567 | return 0; |
@@ -3579,7 +3605,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, | |||
3579 | 3605 | ||
3580 | break; | 3606 | break; |
3581 | case MEM_OFFLINE: | 3607 | case MEM_OFFLINE: |
3582 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3608 | free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3583 | 3609 | ||
3584 | if (sysctl_user_reserve_kbytes > free_kbytes) { | 3610 | if (sysctl_user_reserve_kbytes > free_kbytes) { |
3585 | init_user_reserve(); | 3611 | init_user_reserve(); |
diff --git a/mm/mremap.c b/mm/mremap.c index 3f23715d3c69..7395564daa6c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -384,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
384 | if (!vma || vma->vm_start > addr) | 384 | if (!vma || vma->vm_start > addr) |
385 | return ERR_PTR(-EFAULT); | 385 | return ERR_PTR(-EFAULT); |
386 | 386 | ||
387 | /* | ||
388 | * !old_len is a special case where an attempt is made to 'duplicate' | ||
389 | * a mapping. This makes no sense for private mappings as it will | ||
390 | * instead create a fresh/new mapping unrelated to the original. This | ||
391 | * is contrary to the basic idea of mremap which creates new mappings | ||
392 | * based on the original. There are no known use cases for this | ||
393 | * behavior. As a result, fail such attempts. | ||
394 | */ | ||
395 | if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { | ||
396 | pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); | ||
397 | return ERR_PTR(-EINVAL); | ||
398 | } | ||
399 | |||
387 | if (is_vm_hugetlb_page(vma)) | 400 | if (is_vm_hugetlb_page(vma)) |
388 | return ERR_PTR(-EINVAL); | 401 | return ERR_PTR(-EINVAL); |
389 | 402 | ||
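The old_len == 0 form remains useful for shared mappings, where it creates a second mapping of the same pages at a new address; only the private case is now rejected. A small sketch of the still-supported case (illustrative; error handling trimmed):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;
    char *a, *b;

    a = mmap(NULL, len, PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (a == MAP_FAILED)
        return 1;
    strcpy(a, "hello");

    /* old_len == 0: ask for a second mapping of the same pages. */
    b = mremap(a, 0, len, MREMAP_MAYMOVE);
    if (b == MAP_FAILED) {
        perror("mremap");    /* a MAP_PRIVATE mapping now gets EINVAL here */
        return 1;
    }
    printf("alias at %p sees: %s\n", (void *)b, b);    /* same pages, new address */
    return 0;
}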
diff --git a/mm/nommu.c b/mm/nommu.c index fc184f597d59..53d5175a5c14 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void) | |||
1962 | { | 1962 | { |
1963 | unsigned long free_kbytes; | 1963 | unsigned long free_kbytes; |
1964 | 1964 | ||
1965 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 1965 | free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
1966 | 1966 | ||
1967 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | 1967 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); |
1968 | return 0; | 1968 | return 0; |
@@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void) | |||
1983 | { | 1983 | { |
1984 | unsigned long free_kbytes; | 1984 | unsigned long free_kbytes; |
1985 | 1985 | ||
1986 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 1986 | free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
1987 | 1987 | ||
1988 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | 1988 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); |
1989 | return 0; | 1989 | return 0; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9e8b4f030c1c..99736e026712 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) | |||
495 | } | 495 | } |
496 | 496 | ||
497 | /* | 497 | /* |
498 | * increase mm_users only after we know we will reap something so | 498 | * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't |
499 | * that the mmput_async is called only when we have reaped something | 499 | * work on the mm anymore. The check for MMF_OOM_SKIP must run |
500 | * and delayed __mmput doesn't matter that much | 500 | * under mmap_sem for reading because it serializes against the |
501 | * down_write();up_write() cycle in exit_mmap(). | ||
501 | */ | 502 | */ |
502 | if (!mmget_not_zero(mm)) { | 503 | if (test_bit(MMF_OOM_SKIP, &mm->flags)) { |
503 | up_read(&mm->mmap_sem); | 504 | up_read(&mm->mmap_sem); |
504 | trace_skip_task_reaping(tsk->pid); | 505 | trace_skip_task_reaping(tsk->pid); |
505 | goto unlock_oom; | 506 | goto unlock_oom; |
@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) | |||
542 | K(get_mm_counter(mm, MM_SHMEMPAGES))); | 543 | K(get_mm_counter(mm, MM_SHMEMPAGES))); |
543 | up_read(&mm->mmap_sem); | 544 | up_read(&mm->mmap_sem); |
544 | 545 | ||
545 | /* | ||
546 | * Drop our reference but make sure the mmput slow path is called from a | ||
547 | * different context because we shouldn't risk we get stuck there and | ||
548 | * put the oom_reaper out of the way. | ||
549 | */ | ||
550 | mmput_async(mm); | ||
551 | trace_finish_task_reaping(tsk->pid); | 546 | trace_finish_task_reaping(tsk->pid); |
552 | unlock_oom: | 547 | unlock_oom: |
553 | mutex_unlock(&oom_lock); | 548 | mutex_unlock(&oom_lock); |
@@ -824,7 +819,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message) | |||
824 | 819 | ||
825 | /* | 820 | /* |
826 | * If the task is already exiting, don't alarm the sysadmin or kill | 821 | * If the task is already exiting, don't alarm the sysadmin or kill |
827 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 822 | * its children or threads, just give it access to memory reserves |
823 | * so it can die quickly | ||
828 | */ | 824 | */ |
829 | task_lock(p); | 825 | task_lock(p); |
830 | if (task_will_free_mem(p)) { | 826 | if (task_will_free_mem(p)) { |
@@ -889,9 +885,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message) | |||
889 | count_memcg_event_mm(mm, OOM_KILL); | 885 | count_memcg_event_mm(mm, OOM_KILL); |
890 | 886 | ||
891 | /* | 887 | /* |
892 | * We should send SIGKILL before setting TIF_MEMDIE in order to prevent | 888 | * We should send SIGKILL before granting access to memory reserves |
893 | * the OOM victim from depleting the memory reserves from the user | 889 | * in order to prevent the OOM victim from depleting the memory |
894 | * space under its control. | 890 | * reserves from the user space under its control. |
895 | */ | 891 | */ |
896 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 892 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
897 | mark_oom_victim(victim); | 893 | mark_oom_victim(victim); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bf050ab025b7..0b9c5cbe8eba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void) | |||
363 | { | 363 | { |
364 | unsigned long x; | 364 | unsigned long x; |
365 | 365 | ||
366 | x = global_page_state(NR_FREE_PAGES); | 366 | x = global_zone_page_state(NR_FREE_PAGES); |
367 | /* | 367 | /* |
368 | * Pages reserved for the kernel should not be considered | 368 | * Pages reserved for the kernel should not be considered |
369 | * dirtyable, to prevent a situation where reclaim has to | 369 | * dirtyable, to prevent a situation where reclaim has to |
@@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) | |||
1405 | * will look to see if it needs to start dirty throttling. | 1405 | * will look to see if it needs to start dirty throttling. |
1406 | * | 1406 | * |
1407 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive | 1407 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive |
1408 | * global_page_state() too often. So scale it near-sqrt to the safety margin | 1408 | * global_zone_page_state() too often. So scale it near-sqrt to the safety margin |
1409 | * (the number of pages we may dirty without exceeding the dirty limits). | 1409 | * (the number of pages we may dirty without exceeding the dirty limits). |
1410 | */ | 1410 | */ |
1411 | static unsigned long dirty_poll_interval(unsigned long dirty, | 1411 | static unsigned long dirty_poll_interval(unsigned long dirty, |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9327a940e373..a9add06fe768 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, | |||
2951 | { | 2951 | { |
2952 | long min = mark; | 2952 | long min = mark; |
2953 | int o; | 2953 | int o; |
2954 | const bool alloc_harder = (alloc_flags & ALLOC_HARDER); | 2954 | const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); |
2955 | 2955 | ||
2956 | /* free_pages may go negative - that's OK */ | 2956 | /* free_pages may go negative - that's OK */ |
2957 | free_pages -= (1 << order) - 1; | 2957 | free_pages -= (1 << order) - 1; |
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, | |||
2964 | * the high-atomic reserves. This will over-estimate the size of the | 2964 | * the high-atomic reserves. This will over-estimate the size of the |
2965 | * atomic reserve but it avoids a search. | 2965 | * atomic reserve but it avoids a search. |
2966 | */ | 2966 | */ |
2967 | if (likely(!alloc_harder)) | 2967 | if (likely(!alloc_harder)) { |
2968 | free_pages -= z->nr_reserved_highatomic; | 2968 | free_pages -= z->nr_reserved_highatomic; |
2969 | else | 2969 | } else { |
2970 | min -= min / 4; | 2970 | /* |
2971 | * OOM victims can try even harder than normal ALLOC_HARDER | ||
2972 | * users on the grounds that it's definitely going to be in | ||
2973 | * the exit path shortly and free memory. Any allocation it | ||
2974 | * makes during the free path will be small and short-lived. | ||
2975 | */ | ||
2976 | if (alloc_flags & ALLOC_OOM) | ||
2977 | min -= min / 2; | ||
2978 | else | ||
2979 | min -= min / 4; | ||
2980 | } | ||
2981 | |||
2971 | 2982 | ||
2972 | #ifdef CONFIG_CMA | 2983 | #ifdef CONFIG_CMA |
2973 | /* If allocation can't use CMA areas don't use free CMA pages */ | 2984 | /* If allocation can't use CMA areas don't use free CMA pages */ |
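The size of the extra dip into the reserves is easiest to see with numbers. A toy calculation mirroring the hunk above (the 400-page watermark is made up):

#include <stdio.h>

int main(void)
{
    long min = 400;    /* hypothetical zone min watermark, in pages */

    printf("normal request       : watermark stays at %ld pages\n", min);          /* 400 */
    printf("ALLOC_HARDER request : watermark drops to %ld pages\n", min - min / 4); /* 300 */
    printf("ALLOC_OOM (victim)   : watermark drops to %ld pages\n", min - min / 2); /* 200 */
    return 0;
}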
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) | |||
3205 | * of allowed nodes. | 3216 | * of allowed nodes. |
3206 | */ | 3217 | */ |
3207 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | 3218 | if (!(gfp_mask & __GFP_NOMEMALLOC)) |
3208 | if (test_thread_flag(TIF_MEMDIE) || | 3219 | if (tsk_is_oom_victim(current) || |
3209 | (current->flags & (PF_MEMALLOC | PF_EXITING))) | 3220 | (current->flags & (PF_MEMALLOC | PF_EXITING))) |
3210 | filter &= ~SHOW_MEM_FILTER_NODES; | 3221 | filter &= ~SHOW_MEM_FILTER_NODES; |
3211 | if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) | 3222 | if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) |
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
3668 | return alloc_flags; | 3679 | return alloc_flags; |
3669 | } | 3680 | } |
3670 | 3681 | ||
3671 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | 3682 | static bool oom_reserves_allowed(struct task_struct *tsk) |
3672 | { | 3683 | { |
3673 | if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) | 3684 | if (!tsk_is_oom_victim(tsk)) |
3674 | return false; | 3685 | return false; |
3675 | 3686 | ||
3687 | /* | ||
3688 | * !MMU kernels don't have an OOM reaper, so give access to memory reserves | ||
3689 | * only to the thread with TIF_MEMDIE set | ||
3690 | */ | ||
3691 | if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) | ||
3692 | return false; | ||
3693 | |||
3694 | return true; | ||
3695 | } | ||
3696 | |||
3697 | /* | ||
3698 | * Distinguish requests which really need access to full memory | ||
3699 | * reserves from oom victims which can live with a portion of it | ||
3700 | */ | ||
3701 | static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) | ||
3702 | { | ||
3703 | if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) | ||
3704 | return 0; | ||
3676 | if (gfp_mask & __GFP_MEMALLOC) | 3705 | if (gfp_mask & __GFP_MEMALLOC) |
3677 | return true; | 3706 | return ALLOC_NO_WATERMARKS; |
3678 | if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) | 3707 | if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
3679 | return true; | 3708 | return ALLOC_NO_WATERMARKS; |
3680 | if (!in_interrupt() && | 3709 | if (!in_interrupt()) { |
3681 | ((current->flags & PF_MEMALLOC) || | 3710 | if (current->flags & PF_MEMALLOC) |
3682 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 3711 | return ALLOC_NO_WATERMARKS; |
3683 | return true; | 3712 | else if (oom_reserves_allowed(current)) |
3713 | return ALLOC_OOM; | ||
3714 | } | ||
3684 | 3715 | ||
3685 | return false; | 3716 | return 0; |
3717 | } | ||
3718 | |||
3719 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | ||
3720 | { | ||
3721 | return !!__gfp_pfmemalloc_flags(gfp_mask); | ||
3686 | } | 3722 | } |
3687 | 3723 | ||
3688 | /* | 3724 | /* |
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3835 | unsigned long alloc_start = jiffies; | 3871 | unsigned long alloc_start = jiffies; |
3836 | unsigned int stall_timeout = 10 * HZ; | 3872 | unsigned int stall_timeout = 10 * HZ; |
3837 | unsigned int cpuset_mems_cookie; | 3873 | unsigned int cpuset_mems_cookie; |
3874 | int reserve_flags; | ||
3838 | 3875 | ||
3839 | /* | 3876 | /* |
3840 | * In the slowpath, we sanity check order to avoid ever trying to | 3877 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -3940,15 +3977,16 @@ retry: | |||
3940 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | 3977 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
3941 | wake_all_kswapds(order, ac); | 3978 | wake_all_kswapds(order, ac); |
3942 | 3979 | ||
3943 | if (gfp_pfmemalloc_allowed(gfp_mask)) | 3980 | reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); |
3944 | alloc_flags = ALLOC_NO_WATERMARKS; | 3981 | if (reserve_flags) |
3982 | alloc_flags = reserve_flags; | ||
3945 | 3983 | ||
3946 | /* | 3984 | /* |
3947 | * Reset the zonelist iterators if memory policies can be ignored. | 3985 | * Reset the zonelist iterators if memory policies can be ignored. |
3948 | * These allocations are high priority and system rather than user | 3986 | * These allocations are high priority and system rather than user |
3949 | * orientated. | 3987 | * orientated. |
3950 | */ | 3988 | */ |
3951 | if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { | 3989 | if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { |
3952 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); | 3990 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
3953 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, | 3991 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
3954 | ac->high_zoneidx, ac->nodemask); | 3992 | ac->high_zoneidx, ac->nodemask); |
@@ -4025,8 +4063,8 @@ retry: | |||
4025 | goto got_pg; | 4063 | goto got_pg; |
4026 | 4064 | ||
4027 | /* Avoid allocations with no watermarks from looping endlessly */ | 4065 | /* Avoid allocations with no watermarks from looping endlessly */ |
4028 | if (test_thread_flag(TIF_MEMDIE) && | 4066 | if (tsk_is_oom_victim(current) && |
4029 | (alloc_flags == ALLOC_NO_WATERMARKS || | 4067 | (alloc_flags == ALLOC_OOM || |
4030 | (gfp_mask & __GFP_NOMEMALLOC))) | 4068 | (gfp_mask & __GFP_NOMEMALLOC))) |
4031 | goto nopage; | 4069 | goto nopage; |
4032 | 4070 | ||
@@ -4509,7 +4547,7 @@ long si_mem_available(void) | |||
4509 | * Estimate the amount of memory available for userspace allocations, | 4547 | * Estimate the amount of memory available for userspace allocations, |
4510 | * without causing swapping. | 4548 | * without causing swapping. |
4511 | */ | 4549 | */ |
4512 | available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; | 4550 | available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; |
4513 | 4551 | ||
4514 | /* | 4552 | /* |
4515 | * Not all the page cache can be freed, otherwise the system will | 4553 | * Not all the page cache can be freed, otherwise the system will |
@@ -4538,7 +4576,7 @@ void si_meminfo(struct sysinfo *val) | |||
4538 | { | 4576 | { |
4539 | val->totalram = totalram_pages; | 4577 | val->totalram = totalram_pages; |
4540 | val->sharedram = global_node_page_state(NR_SHMEM); | 4578 | val->sharedram = global_node_page_state(NR_SHMEM); |
4541 | val->freeram = global_page_state(NR_FREE_PAGES); | 4579 | val->freeram = global_zone_page_state(NR_FREE_PAGES); |
4542 | val->bufferram = nr_blockdev_pages(); | 4580 | val->bufferram = nr_blockdev_pages(); |
4543 | val->totalhigh = totalhigh_pages; | 4581 | val->totalhigh = totalhigh_pages; |
4544 | val->freehigh = nr_free_highpages(); | 4582 | val->freehigh = nr_free_highpages(); |
@@ -4673,11 +4711,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) | |||
4673 | global_node_page_state(NR_SLAB_UNRECLAIMABLE), | 4711 | global_node_page_state(NR_SLAB_UNRECLAIMABLE), |
4674 | global_node_page_state(NR_FILE_MAPPED), | 4712 | global_node_page_state(NR_FILE_MAPPED), |
4675 | global_node_page_state(NR_SHMEM), | 4713 | global_node_page_state(NR_SHMEM), |
4676 | global_page_state(NR_PAGETABLE), | 4714 | global_zone_page_state(NR_PAGETABLE), |
4677 | global_page_state(NR_BOUNCE), | 4715 | global_zone_page_state(NR_BOUNCE), |
4678 | global_page_state(NR_FREE_PAGES), | 4716 | global_zone_page_state(NR_FREE_PAGES), |
4679 | free_pcp, | 4717 | free_pcp, |
4680 | global_page_state(NR_FREE_CMA_PAGES)); | 4718 | global_zone_page_state(NR_FREE_CMA_PAGES)); |
4681 | 4719 | ||
4682 | for_each_online_pgdat(pgdat) { | 4720 | for_each_online_pgdat(pgdat) { |
4683 | if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) | 4721 | if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) |
@@ -4839,18 +4877,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | |||
4839 | * | 4877 | * |
4840 | * Add all populated zones of a node to the zonelist. | 4878 | * Add all populated zones of a node to the zonelist. |
4841 | */ | 4879 | */ |
4842 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | 4880 | static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) |
4843 | int nr_zones) | ||
4844 | { | 4881 | { |
4845 | struct zone *zone; | 4882 | struct zone *zone; |
4846 | enum zone_type zone_type = MAX_NR_ZONES; | 4883 | enum zone_type zone_type = MAX_NR_ZONES; |
4884 | int nr_zones = 0; | ||
4847 | 4885 | ||
4848 | do { | 4886 | do { |
4849 | zone_type--; | 4887 | zone_type--; |
4850 | zone = pgdat->node_zones + zone_type; | 4888 | zone = pgdat->node_zones + zone_type; |
4851 | if (managed_zone(zone)) { | 4889 | if (managed_zone(zone)) { |
4852 | zoneref_set_zone(zone, | 4890 | zoneref_set_zone(zone, &zonerefs[nr_zones++]); |
4853 | &zonelist->_zonerefs[nr_zones++]); | ||
4854 | check_highest_zone(zone_type); | 4891 | check_highest_zone(zone_type); |
4855 | } | 4892 | } |
4856 | } while (zone_type); | 4893 | } while (zone_type); |
@@ -4858,52 +4895,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | |||
4858 | return nr_zones; | 4895 | return nr_zones; |
4859 | } | 4896 | } |
4860 | 4897 | ||
4861 | |||
4862 | /* | ||
4863 | * zonelist_order: | ||
4864 | * 0 = automatic detection of better ordering. | ||
4865 | * 1 = order by ([node] distance, -zonetype) | ||
4866 | * 2 = order by (-zonetype, [node] distance) | ||
4867 | * | ||
4868 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create | ||
4869 | * the same zonelist. So only NUMA can configure this param. | ||
4870 | */ | ||
4871 | #define ZONELIST_ORDER_DEFAULT 0 | ||
4872 | #define ZONELIST_ORDER_NODE 1 | ||
4873 | #define ZONELIST_ORDER_ZONE 2 | ||
4874 | |||
4875 | /* zonelist order in the kernel. | ||
4876 | * set_zonelist_order() will set this to NODE or ZONE. | ||
4877 | */ | ||
4878 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
4879 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; | ||
4880 | |||
4881 | |||
4882 | #ifdef CONFIG_NUMA | 4898 | #ifdef CONFIG_NUMA |
4883 | /* The value user specified ....changed by config */ | ||
4884 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
4885 | /* string for sysctl */ | ||
4886 | #define NUMA_ZONELIST_ORDER_LEN 16 | ||
4887 | char numa_zonelist_order[16] = "default"; | ||
4888 | |||
4889 | /* | ||
4890 | * interface for configure zonelist ordering. | ||
4891 | * command line option "numa_zonelist_order" | ||
4892 | * = "[dD]efault - default, automatic configuration. | ||
4893 | * = "[nN]ode - order by node locality, then by zone within node | ||
4894 | * = "[zZ]one - order by zone, then by locality within zone | ||
4895 | */ | ||
4896 | 4899 | ||
4897 | static int __parse_numa_zonelist_order(char *s) | 4900 | static int __parse_numa_zonelist_order(char *s) |
4898 | { | 4901 | { |
4899 | if (*s == 'd' || *s == 'D') { | 4902 | /* |
4900 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 4903 | * We used to support different zonelist modes but they turned |
4901 | } else if (*s == 'n' || *s == 'N') { | 4904 | * out to be just not useful. Let's keep the warning in place |
4902 | user_zonelist_order = ZONELIST_ORDER_NODE; | 4905 | * if somebody still uses the cmd line parameter so that we do |
4903 | } else if (*s == 'z' || *s == 'Z') { | 4906 | * not fail it silently |
4904 | user_zonelist_order = ZONELIST_ORDER_ZONE; | 4907 | */ |
4905 | } else { | 4908 | if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { |
4906 | pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); | 4909 | pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); |
4907 | return -EINVAL; | 4910 | return -EINVAL; |
4908 | } | 4911 | } |
4909 | return 0; | 4912 | return 0; |
@@ -4911,19 +4914,15 @@ static int __parse_numa_zonelist_order(char *s) | |||
4911 | 4914 | ||
4912 | static __init int setup_numa_zonelist_order(char *s) | 4915 | static __init int setup_numa_zonelist_order(char *s) |
4913 | { | 4916 | { |
4914 | int ret; | ||
4915 | |||
4916 | if (!s) | 4917 | if (!s) |
4917 | return 0; | 4918 | return 0; |
4918 | 4919 | ||
4919 | ret = __parse_numa_zonelist_order(s); | 4920 | return __parse_numa_zonelist_order(s); |
4920 | if (ret == 0) | ||
4921 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | ||
4922 | |||
4923 | return ret; | ||
4924 | } | 4921 | } |
4925 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 4922 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
4926 | 4923 | ||
4924 | char numa_zonelist_order[] = "Node"; | ||
4925 | |||
4927 | /* | 4926 | /* |
4928 | * sysctl handler for numa_zonelist_order | 4927 | * sysctl handler for numa_zonelist_order |
4929 | */ | 4928 | */ |
@@ -4931,42 +4930,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, | |||
4931 | void __user *buffer, size_t *length, | 4930 | void __user *buffer, size_t *length, |
4932 | loff_t *ppos) | 4931 | loff_t *ppos) |
4933 | { | 4932 | { |
4934 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 4933 | char *str; |
4935 | int ret; | 4934 | int ret; |
4936 | static DEFINE_MUTEX(zl_order_mutex); | ||
4937 | 4935 | ||
4938 | mutex_lock(&zl_order_mutex); | 4936 | if (!write) |
4939 | if (write) { | 4937 | return proc_dostring(table, write, buffer, length, ppos); |
4940 | if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { | 4938 | str = memdup_user_nul(buffer, 16); |
4941 | ret = -EINVAL; | 4939 | if (IS_ERR(str)) |
4942 | goto out; | 4940 | return PTR_ERR(str); |
4943 | } | ||
4944 | strcpy(saved_string, (char *)table->data); | ||
4945 | } | ||
4946 | ret = proc_dostring(table, write, buffer, length, ppos); | ||
4947 | if (ret) | ||
4948 | goto out; | ||
4949 | if (write) { | ||
4950 | int oldval = user_zonelist_order; | ||
4951 | 4941 | ||
4952 | ret = __parse_numa_zonelist_order((char *)table->data); | 4942 | ret = __parse_numa_zonelist_order(str); |
4953 | if (ret) { | 4943 | kfree(str); |
4954 | /* | ||
4955 | * bogus value. restore saved string | ||
4956 | */ | ||
4957 | strncpy((char *)table->data, saved_string, | ||
4958 | NUMA_ZONELIST_ORDER_LEN); | ||
4959 | user_zonelist_order = oldval; | ||
4960 | } else if (oldval != user_zonelist_order) { | ||
4961 | mem_hotplug_begin(); | ||
4962 | mutex_lock(&zonelists_mutex); | ||
4963 | build_all_zonelists(NULL, NULL); | ||
4964 | mutex_unlock(&zonelists_mutex); | ||
4965 | mem_hotplug_done(); | ||
4966 | } | ||
4967 | } | ||
4968 | out: | ||
4969 | mutex_unlock(&zl_order_mutex); | ||
4970 | return ret; | 4944 | return ret; |
4971 | } | 4945 | } |
4972 | 4946 | ||
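With this change the sysctl only validates the written string (and keeps the old warning); the ordering itself is no longer tunable. A rough userspace sketch, not part of the patch, of how the new handler behaves from the outside; run as root to exercise the writes, and treat the expected values as assumptions based on the hunk above:

/* probe_numa_zonelist_order.c: poke /proc/sys/vm/numa_zonelist_order */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define SYSCTL_PATH "/proc/sys/vm/numa_zonelist_order"

static int try_write(const char *val)
{
	int fd = open(SYSCTL_PATH, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -errno;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -errno : 0;
}

int main(void)
{
	char buf[32] = "";
	int fd = open(SYSCTL_PATH, O_RDONLY);

	if (fd >= 0) {
		ssize_t n = read(fd, buf, sizeof(buf) - 1);

		if (n > 0)
			buf[n] = '\0';
		close(fd);
		printf("current: %s", buf);              /* likely "Node\n" after this patch */
	}
	printf("write \"node\" -> %d\n", try_write("node"));  /* accepted, returns 0 */
	printf("write \"zone\" -> %d\n", try_write("zone"));  /* rejected, -EINVAL (-22) */
	return 0;
}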
@@ -5040,17 +5014,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
5040 | * This results in maximum locality--normal zone overflows into local | 5014 | * This results in maximum locality--normal zone overflows into local |
5041 | * DMA zone, if any--but risks exhausting DMA zone. | 5015 | * DMA zone, if any--but risks exhausting DMA zone. |
5042 | */ | 5016 | */ |
5043 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | 5017 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, |
5018 | unsigned nr_nodes) | ||
5044 | { | 5019 | { |
5045 | int j; | 5020 | struct zoneref *zonerefs; |
5046 | struct zonelist *zonelist; | 5021 | int i; |
5022 | |||
5023 | zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; | ||
5024 | |||
5025 | for (i = 0; i < nr_nodes; i++) { | ||
5026 | int nr_zones; | ||
5047 | 5027 | ||
5048 | zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; | 5028 | pg_data_t *node = NODE_DATA(node_order[i]); |
5049 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) | 5029 | |
5050 | ; | 5030 | nr_zones = build_zonerefs_node(node, zonerefs); |
5051 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); | 5031 | zonerefs += nr_zones; |
5052 | zonelist->_zonerefs[j].zone = NULL; | 5032 | } |
5053 | zonelist->_zonerefs[j].zone_idx = 0; | 5033 | zonerefs->zone = NULL; |
5034 | zonerefs->zone_idx = 0; | ||
5054 | } | 5035 | } |
5055 | 5036 | ||
5056 | /* | 5037 | /* |
@@ -5058,13 +5039,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
5058 | */ | 5039 | */ |
5059 | static void build_thisnode_zonelists(pg_data_t *pgdat) | 5040 | static void build_thisnode_zonelists(pg_data_t *pgdat) |
5060 | { | 5041 | { |
5061 | int j; | 5042 | struct zoneref *zonerefs; |
5062 | struct zonelist *zonelist; | 5043 | int nr_zones; |
5063 | 5044 | ||
5064 | zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; | 5045 | zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; |
5065 | j = build_zonelists_node(pgdat, zonelist, 0); | 5046 | nr_zones = build_zonerefs_node(pgdat, zonerefs); |
5066 | zonelist->_zonerefs[j].zone = NULL; | 5047 | zonerefs += nr_zones; |
5067 | zonelist->_zonerefs[j].zone_idx = 0; | 5048 | zonerefs->zone = NULL; |
5049 | zonerefs->zone_idx = 0; | ||
5068 | } | 5050 | } |
5069 | 5051 | ||
5070 | /* | 5052 | /* |
@@ -5073,79 +5055,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) | |||
5073 | * exhausted, but results in overflowing to remote node while memory | 5055 | * exhausted, but results in overflowing to remote node while memory |
5074 | * may still exist in local DMA zone. | 5056 | * may still exist in local DMA zone. |
5075 | */ | 5057 | */ |
5076 | static int node_order[MAX_NUMNODES]; | ||
5077 | |||
5078 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | ||
5079 | { | ||
5080 | int pos, j, node; | ||
5081 | int zone_type; /* needs to be signed */ | ||
5082 | struct zone *z; | ||
5083 | struct zonelist *zonelist; | ||
5084 | |||
5085 | zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; | ||
5086 | pos = 0; | ||
5087 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { | ||
5088 | for (j = 0; j < nr_nodes; j++) { | ||
5089 | node = node_order[j]; | ||
5090 | z = &NODE_DATA(node)->node_zones[zone_type]; | ||
5091 | if (managed_zone(z)) { | ||
5092 | zoneref_set_zone(z, | ||
5093 | &zonelist->_zonerefs[pos++]); | ||
5094 | check_highest_zone(zone_type); | ||
5095 | } | ||
5096 | } | ||
5097 | } | ||
5098 | zonelist->_zonerefs[pos].zone = NULL; | ||
5099 | zonelist->_zonerefs[pos].zone_idx = 0; | ||
5100 | } | ||
5101 | |||
5102 | #if defined(CONFIG_64BIT) | ||
5103 | /* | ||
5104 | * Devices that require DMA32/DMA are relatively rare and do not justify a | ||
5105 | * penalty to every machine in case the specialised case applies. Default | ||
5106 | * to Node-ordering on 64-bit NUMA machines | ||
5107 | */ | ||
5108 | static int default_zonelist_order(void) | ||
5109 | { | ||
5110 | return ZONELIST_ORDER_NODE; | ||
5111 | } | ||
5112 | #else | ||
5113 | /* | ||
5114 | * On 32-bit, the Normal zone needs to be preserved for allocations accessible | ||
5115 | * by the kernel. If processes running on node 0 deplete the low memory zone | ||
5116 | * then reclaim will occur more frequency increasing stalls and potentially | ||
5117 | * be easier to OOM if a large percentage of the zone is under writeback or | ||
5118 | * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. | ||
5119 | * Hence, default to zone ordering on 32-bit. | ||
5120 | */ | ||
5121 | static int default_zonelist_order(void) | ||
5122 | { | ||
5123 | return ZONELIST_ORDER_ZONE; | ||
5124 | } | ||
5125 | #endif /* CONFIG_64BIT */ | ||
5126 | |||
5127 | static void set_zonelist_order(void) | ||
5128 | { | ||
5129 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) | ||
5130 | current_zonelist_order = default_zonelist_order(); | ||
5131 | else | ||
5132 | current_zonelist_order = user_zonelist_order; | ||
5133 | } | ||
5134 | 5058 | ||
5135 | static void build_zonelists(pg_data_t *pgdat) | 5059 | static void build_zonelists(pg_data_t *pgdat) |
5136 | { | 5060 | { |
5137 | int i, node, load; | 5061 | static int node_order[MAX_NUMNODES]; |
5062 | int node, load, nr_nodes = 0; | ||
5138 | nodemask_t used_mask; | 5063 | nodemask_t used_mask; |
5139 | int local_node, prev_node; | 5064 | int local_node, prev_node; |
5140 | struct zonelist *zonelist; | ||
5141 | unsigned int order = current_zonelist_order; | ||
5142 | |||
5143 | /* initialize zonelists */ | ||
5144 | for (i = 0; i < MAX_ZONELISTS; i++) { | ||
5145 | zonelist = pgdat->node_zonelists + i; | ||
5146 | zonelist->_zonerefs[0].zone = NULL; | ||
5147 | zonelist->_zonerefs[0].zone_idx = 0; | ||
5148 | } | ||
5149 | 5065 | ||
5150 | /* NUMA-aware ordering of nodes */ | 5066 | /* NUMA-aware ordering of nodes */ |
5151 | local_node = pgdat->node_id; | 5067 | local_node = pgdat->node_id; |
@@ -5154,8 +5070,6 @@ static void build_zonelists(pg_data_t *pgdat) | |||
5154 | nodes_clear(used_mask); | 5070 | nodes_clear(used_mask); |
5155 | 5071 | ||
5156 | memset(node_order, 0, sizeof(node_order)); | 5072 | memset(node_order, 0, sizeof(node_order)); |
5157 | i = 0; | ||
5158 | |||
5159 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 5073 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
5160 | /* | 5074 | /* |
5161 | * We don't want to pressure a particular node. | 5075 | * We don't want to pressure a particular node. |
@@ -5166,19 +5080,12 @@ static void build_zonelists(pg_data_t *pgdat) | |||
5166 | node_distance(local_node, prev_node)) | 5080 | node_distance(local_node, prev_node)) |
5167 | node_load[node] = load; | 5081 | node_load[node] = load; |
5168 | 5082 | ||
5083 | node_order[nr_nodes++] = node; | ||
5169 | prev_node = node; | 5084 | prev_node = node; |
5170 | load--; | 5085 | load--; |
5171 | if (order == ZONELIST_ORDER_NODE) | ||
5172 | build_zonelists_in_node_order(pgdat, node); | ||
5173 | else | ||
5174 | node_order[i++] = node; /* remember order */ | ||
5175 | } | ||
5176 | |||
5177 | if (order == ZONELIST_ORDER_ZONE) { | ||
5178 | /* calculate node order -- i.e., DMA last! */ | ||
5179 | build_zonelists_in_zone_order(pgdat, i); | ||
5180 | } | 5086 | } |
5181 | 5087 | ||
5088 | build_zonelists_in_node_order(pgdat, node_order, nr_nodes); | ||
5182 | build_thisnode_zonelists(pgdat); | 5089 | build_thisnode_zonelists(pgdat); |
5183 | } | 5090 | } |
5184 | 5091 | ||
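The rewritten build_zonelists_in_node_order() walks the precomputed node_order[] array once and appends each node's zones to a flat, NULL-terminated zoneref array, instead of re-scanning for the current terminator on every per-node call. A small userspace model of that append-then-terminate pattern; the struct and helper below are illustrative stand-ins, not the kernel types:

#include <stdio.h>
#include <stddef.h>

struct zoneref { int node; int zone_idx; };     /* toy stand-in for struct zoneref */

/* Append the (pretend) populated zones of one node; return how many were added. */
static int build_zonerefs_node(int node, struct zoneref *dst, int nr_zones)
{
	for (int z = nr_zones - 1; z >= 0; z--) {   /* highest zone first, as the kernel helper roughly does */
		dst->node = node;
		dst->zone_idx = z;
		dst++;
	}
	return nr_zones;
}

int main(void)
{
	struct zoneref zonelist[16];
	struct zoneref *zonerefs = zonelist;
	int node_order[] = { 0, 2, 1 };             /* e.g. local node first, then by distance */

	for (size_t i = 0; i < sizeof(node_order) / sizeof(node_order[0]); i++)
		zonerefs += build_zonerefs_node(node_order[i], zonerefs, 2);

	zonerefs->node = -1;                        /* terminator, like zone == NULL */
	zonerefs->zone_idx = 0;

	for (struct zoneref *z = zonelist; z->node >= 0; z++)
		printf("node %d zone %d\n", z->node, z->zone_idx);
	return 0;
}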
@@ -5204,21 +5111,17 @@ static void setup_min_unmapped_ratio(void); | |||
5204 | static void setup_min_slab_ratio(void); | 5111 | static void setup_min_slab_ratio(void); |
5205 | #else /* CONFIG_NUMA */ | 5112 | #else /* CONFIG_NUMA */ |
5206 | 5113 | ||
5207 | static void set_zonelist_order(void) | ||
5208 | { | ||
5209 | current_zonelist_order = ZONELIST_ORDER_ZONE; | ||
5210 | } | ||
5211 | |||
5212 | static void build_zonelists(pg_data_t *pgdat) | 5114 | static void build_zonelists(pg_data_t *pgdat) |
5213 | { | 5115 | { |
5214 | int node, local_node; | 5116 | int node, local_node; |
5215 | enum zone_type j; | 5117 | struct zoneref *zonerefs; |
5216 | struct zonelist *zonelist; | 5118 | int nr_zones; |
5217 | 5119 | ||
5218 | local_node = pgdat->node_id; | 5120 | local_node = pgdat->node_id; |
5219 | 5121 | ||
5220 | zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; | 5122 | zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; |
5221 | j = build_zonelists_node(pgdat, zonelist, 0); | 5123 | nr_zones = build_zonerefs_node(pgdat, zonerefs); |
5124 | zonerefs += nr_zones; | ||
5222 | 5125 | ||
5223 | /* | 5126 | /* |
5224 | * Now we build the zonelist so that it contains the zones | 5127 | * Now we build the zonelist so that it contains the zones |
@@ -5231,16 +5134,18 @@ static void build_zonelists(pg_data_t *pgdat) | |||
5231 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 5134 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
5232 | if (!node_online(node)) | 5135 | if (!node_online(node)) |
5233 | continue; | 5136 | continue; |
5234 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); | 5137 | nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); |
5138 | zonerefs += nr_zones; | ||
5235 | } | 5139 | } |
5236 | for (node = 0; node < local_node; node++) { | 5140 | for (node = 0; node < local_node; node++) { |
5237 | if (!node_online(node)) | 5141 | if (!node_online(node)) |
5238 | continue; | 5142 | continue; |
5239 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); | 5143 | nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); |
5144 | zonerefs += nr_zones; | ||
5240 | } | 5145 | } |
5241 | 5146 | ||
5242 | zonelist->_zonerefs[j].zone = NULL; | 5147 | zonerefs->zone = NULL; |
5243 | zonelist->_zonerefs[j].zone_idx = 0; | 5148 | zonerefs->zone_idx = 0; |
5244 | } | 5149 | } |
5245 | 5150 | ||
5246 | #endif /* CONFIG_NUMA */ | 5151 | #endif /* CONFIG_NUMA */ |
@@ -5263,50 +5168,32 @@ static void build_zonelists(pg_data_t *pgdat) | |||
5263 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | 5168 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
5264 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | 5169 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
5265 | static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); | 5170 | static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); |
5266 | static void setup_zone_pageset(struct zone *zone); | ||
5267 | |||
5268 | /* | ||
5269 | * Global mutex to protect against size modification of zonelists | ||
5270 | * as well as to serialize pageset setup for the new populated zone. | ||
5271 | */ | ||
5272 | DEFINE_MUTEX(zonelists_mutex); | ||
5273 | 5171 | ||
5274 | /* return values int ....just for stop_machine() */ | 5172 | static void __build_all_zonelists(void *data) |
5275 | static int __build_all_zonelists(void *data) | ||
5276 | { | 5173 | { |
5277 | int nid; | 5174 | int nid; |
5278 | int cpu; | 5175 | int __maybe_unused cpu; |
5279 | pg_data_t *self = data; | 5176 | pg_data_t *self = data; |
5177 | static DEFINE_SPINLOCK(lock); | ||
5178 | |||
5179 | spin_lock(&lock); | ||
5280 | 5180 | ||
5281 | #ifdef CONFIG_NUMA | 5181 | #ifdef CONFIG_NUMA |
5282 | memset(node_load, 0, sizeof(node_load)); | 5182 | memset(node_load, 0, sizeof(node_load)); |
5283 | #endif | 5183 | #endif |
5284 | 5184 | ||
5185 | /* | ||
5186 | * This node is hotadded and no memory is yet present. So just | ||
5187 | * building zonelists is fine - no need to touch other nodes. | ||
5188 | */ | ||
5285 | if (self && !node_online(self->node_id)) { | 5189 | if (self && !node_online(self->node_id)) { |
5286 | build_zonelists(self); | 5190 | build_zonelists(self); |
5287 | } | 5191 | } else { |
5288 | 5192 | for_each_online_node(nid) { | |
5289 | for_each_online_node(nid) { | 5193 | pg_data_t *pgdat = NODE_DATA(nid); |
5290 | pg_data_t *pgdat = NODE_DATA(nid); | ||
5291 | |||
5292 | build_zonelists(pgdat); | ||
5293 | } | ||
5294 | 5194 | ||
5295 | /* | 5195 | build_zonelists(pgdat); |
5296 | * Initialize the boot_pagesets that are going to be used | 5196 | } |
5297 | * for bootstrapping processors. The real pagesets for | ||
5298 | * each zone will be allocated later when the per cpu | ||
5299 | * allocator is available. | ||
5300 | * | ||
5301 | * boot_pagesets are used also for bootstrapping offline | ||
5302 | * cpus if the system is already booted because the pagesets | ||
5303 | * are needed to initialize allocators on a specific cpu too. | ||
5304 | * F.e. the percpu allocator needs the page allocator which | ||
5305 | * needs the percpu allocator in order to allocate its pagesets | ||
5306 | * (a chicken-egg dilemma). | ||
5307 | */ | ||
5308 | for_each_possible_cpu(cpu) { | ||
5309 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
5310 | 5197 | ||
5311 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 5198 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
5312 | /* | 5199 | /* |
@@ -5317,45 +5204,53 @@ static int __build_all_zonelists(void *data) | |||
5317 | * secondary cpus' numa_mem as they come on-line. During | 5204 | * secondary cpus' numa_mem as they come on-line. During |
5318 | * node/memory hotplug, we'll fixup all on-line cpus. | 5205 | * node/memory hotplug, we'll fixup all on-line cpus. |
5319 | */ | 5206 | */ |
5320 | if (cpu_online(cpu)) | 5207 | for_each_online_cpu(cpu) |
5321 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); | 5208 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); |
5322 | #endif | 5209 | #endif |
5323 | } | 5210 | } |
5324 | 5211 | ||
5325 | return 0; | 5212 | spin_unlock(&lock); |
5326 | } | 5213 | } |
5327 | 5214 | ||
5328 | static noinline void __init | 5215 | static noinline void __init |
5329 | build_all_zonelists_init(void) | 5216 | build_all_zonelists_init(void) |
5330 | { | 5217 | { |
5218 | int cpu; | ||
5219 | |||
5331 | __build_all_zonelists(NULL); | 5220 | __build_all_zonelists(NULL); |
5221 | |||
5222 | /* | ||
5223 | * Initialize the boot_pagesets that are going to be used | ||
5224 | * for bootstrapping processors. The real pagesets for | ||
5225 | * each zone will be allocated later when the per cpu | ||
5226 | * allocator is available. | ||
5227 | * | ||
5228 | * boot_pagesets are used also for bootstrapping offline | ||
5229 | * cpus if the system is already booted because the pagesets | ||
5230 | * are needed to initialize allocators on a specific cpu too. | ||
5231 | * F.e. the percpu allocator needs the page allocator which | ||
5232 | * needs the percpu allocator in order to allocate its pagesets | ||
5233 | * (a chicken-egg dilemma). | ||
5234 | */ | ||
5235 | for_each_possible_cpu(cpu) | ||
5236 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
5237 | |||
5332 | mminit_verify_zonelist(); | 5238 | mminit_verify_zonelist(); |
5333 | cpuset_init_current_mems_allowed(); | 5239 | cpuset_init_current_mems_allowed(); |
5334 | } | 5240 | } |
5335 | 5241 | ||
5336 | /* | 5242 | /* |
5337 | * Called with zonelists_mutex held always | ||
5338 | * unless system_state == SYSTEM_BOOTING. | 5243 | * unless system_state == SYSTEM_BOOTING. |
5339 | * | 5244 | * |
5340 | * __ref due to (1) call of __meminit annotated setup_zone_pageset | 5245 | * __ref due to call of __init annotated helper build_all_zonelists_init |
5341 | * [we're only called with non-NULL zone through __meminit paths] and | ||
5342 | * (2) call of __init annotated helper build_all_zonelists_init | ||
5343 | * [protected by SYSTEM_BOOTING]. | 5246 | * [protected by SYSTEM_BOOTING]. |
5344 | */ | 5247 | */ |
5345 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 5248 | void __ref build_all_zonelists(pg_data_t *pgdat) |
5346 | { | 5249 | { |
5347 | set_zonelist_order(); | ||
5348 | |||
5349 | if (system_state == SYSTEM_BOOTING) { | 5250 | if (system_state == SYSTEM_BOOTING) { |
5350 | build_all_zonelists_init(); | 5251 | build_all_zonelists_init(); |
5351 | } else { | 5252 | } else { |
5352 | #ifdef CONFIG_MEMORY_HOTPLUG | 5253 | __build_all_zonelists(pgdat); |
5353 | if (zone) | ||
5354 | setup_zone_pageset(zone); | ||
5355 | #endif | ||
5356 | /* we have to stop all cpus to guarantee there is no user | ||
5357 | of zonelist */ | ||
5358 | stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); | ||
5359 | /* cpuset refresh routine should be here */ | 5254 | /* cpuset refresh routine should be here */ |
5360 | } | 5255 | } |
5361 | vm_total_pages = nr_free_pagecache_pages(); | 5256 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -5371,9 +5266,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
5371 | else | 5266 | else |
5372 | page_group_by_mobility_disabled = 0; | 5267 | page_group_by_mobility_disabled = 0; |
5373 | 5268 | ||
5374 | pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", | 5269 | pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", |
5375 | nr_online_nodes, | 5270 | nr_online_nodes, |
5376 | zonelist_order_name[current_zonelist_order], | ||
5377 | page_group_by_mobility_disabled ? "off" : "on", | 5271 | page_group_by_mobility_disabled ? "off" : "on", |
5378 | vm_total_pages); | 5272 | vm_total_pages); |
5379 | #ifdef CONFIG_NUMA | 5273 | #ifdef CONFIG_NUMA |
@@ -5627,7 +5521,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) | |||
5627 | pageset_set_high_and_batch(zone, pcp); | 5521 | pageset_set_high_and_batch(zone, pcp); |
5628 | } | 5522 | } |
5629 | 5523 | ||
5630 | static void __meminit setup_zone_pageset(struct zone *zone) | 5524 | void __meminit setup_zone_pageset(struct zone *zone) |
5631 | { | 5525 | { |
5632 | int cpu; | 5526 | int cpu; |
5633 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 5527 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
@@ -7081,9 +6975,11 @@ static void __setup_per_zone_wmarks(void) | |||
7081 | */ | 6975 | */ |
7082 | void setup_per_zone_wmarks(void) | 6976 | void setup_per_zone_wmarks(void) |
7083 | { | 6977 | { |
7084 | mutex_lock(&zonelists_mutex); | 6978 | static DEFINE_SPINLOCK(lock); |
6979 | |||
6980 | spin_lock(&lock); | ||
7085 | __setup_per_zone_wmarks(); | 6981 | __setup_per_zone_wmarks(); |
7086 | mutex_unlock(&zonelists_mutex); | 6982 | spin_unlock(&lock); |
7087 | } | 6983 | } |
7088 | 6984 | ||
7089 | /* | 6985 | /* |
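With the zone-order mode gone, zonelist rebuilds no longer need stop_machine() or the global zonelists_mutex; __build_all_zonelists() and setup_per_zone_wmarks() each serialize on a function-local static spinlock instead. A minimal userspace analogue of that "private static lock per operation" pattern, using a pthread mutex rather than a kernel spinlock (build with -pthread):

#include <pthread.h>
#include <stdio.h>

static int rebuild_count;    /* toy stand-in for the zonelist state being rebuilt */

static void rebuild_all(void)
{
	/* Lock is private to this function, like static DEFINE_SPINLOCK(lock) in the patch. */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&lock);
	rebuild_count++;         /* the real code rebuilds per-node zonelists here */
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000; i++)
		rebuild_all();
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("rebuilds: %d\n", rebuild_count);   /* expected: 4000 */
	return 0;
}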
diff --git a/mm/page_ext.c b/mm/page_ext.c index 88ccc044b09a..32f18911deda 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c | |||
@@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid) | |||
222 | return addr; | 222 | return addr; |
223 | } | 223 | } |
224 | 224 | ||
225 | if (node_state(nid, N_HIGH_MEMORY)) | 225 | addr = vzalloc_node(size, nid); |
226 | addr = vzalloc_node(size, nid); | ||
227 | else | ||
228 | addr = vzalloc(size); | ||
229 | 226 | ||
230 | return addr; | 227 | return addr; |
231 | } | 228 | } |
@@ -409,6 +406,7 @@ void __init page_ext_init(void) | |||
409 | continue; | 406 | continue; |
410 | if (init_section_page_ext(pfn, nid)) | 407 | if (init_section_page_ext(pfn, nid)) |
411 | goto oom; | 408 | goto oom; |
409 | cond_resched(); | ||
412 | } | 410 | } |
413 | } | 411 | } |
414 | hotplug_memory_notifier(page_ext_callback, 0); | 412 | hotplug_memory_notifier(page_ext_callback, 0); |
diff --git a/mm/page_idle.c b/mm/page_idle.c index 1b0f48c62316..4bd03a8d809e 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c | |||
@@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = { | |||
204 | NULL, | 204 | NULL, |
205 | }; | 205 | }; |
206 | 206 | ||
207 | static struct attribute_group page_idle_attr_group = { | 207 | static const struct attribute_group page_idle_attr_group = { |
208 | .bin_attrs = page_idle_bin_attrs, | 208 | .bin_attrs = page_idle_bin_attrs, |
209 | .name = "page_idle", | 209 | .name = "page_idle", |
210 | }; | 210 | }; |
diff --git a/mm/page_io.c b/mm/page_io.c index 5f61b54ee1f3..20139b90125a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -28,16 +28,18 @@ | |||
28 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 28 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
29 | struct page *page, bio_end_io_t end_io) | 29 | struct page *page, bio_end_io_t end_io) |
30 | { | 30 | { |
31 | int i, nr = hpage_nr_pages(page); | ||
31 | struct bio *bio; | 32 | struct bio *bio; |
32 | 33 | ||
33 | bio = bio_alloc(gfp_flags, 1); | 34 | bio = bio_alloc(gfp_flags, nr); |
34 | if (bio) { | 35 | if (bio) { |
35 | bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); | 36 | bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); |
36 | bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; | 37 | bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; |
37 | bio->bi_end_io = end_io; | 38 | bio->bi_end_io = end_io; |
38 | 39 | ||
39 | bio_add_page(bio, page, PAGE_SIZE, 0); | 40 | for (i = 0; i < nr; i++) |
40 | BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); | 41 | bio_add_page(bio, page + i, PAGE_SIZE, 0); |
42 | VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr); | ||
41 | } | 43 | } |
42 | return bio; | 44 | return bio; |
43 | } | 45 | } |
@@ -262,6 +264,15 @@ static sector_t swap_page_sector(struct page *page) | |||
262 | return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); | 264 | return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); |
263 | } | 265 | } |
264 | 266 | ||
267 | static inline void count_swpout_vm_event(struct page *page) | ||
268 | { | ||
269 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
270 | if (unlikely(PageTransHuge(page))) | ||
271 | count_vm_event(THP_SWPOUT); | ||
272 | #endif | ||
273 | count_vm_events(PSWPOUT, hpage_nr_pages(page)); | ||
274 | } | ||
275 | |||
265 | int __swap_writepage(struct page *page, struct writeback_control *wbc, | 276 | int __swap_writepage(struct page *page, struct writeback_control *wbc, |
266 | bio_end_io_t end_write_func) | 277 | bio_end_io_t end_write_func) |
267 | { | 278 | { |
@@ -313,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
313 | 324 | ||
314 | ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); | 325 | ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); |
315 | if (!ret) { | 326 | if (!ret) { |
316 | count_vm_event(PSWPOUT); | 327 | count_swpout_vm_event(page); |
317 | return 0; | 328 | return 0; |
318 | } | 329 | } |
319 | 330 | ||
@@ -326,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
326 | goto out; | 337 | goto out; |
327 | } | 338 | } |
328 | bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); | 339 | bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); |
329 | count_vm_event(PSWPOUT); | 340 | count_swpout_vm_event(page); |
330 | set_page_writeback(page); | 341 | set_page_writeback(page); |
331 | unlock_page(page); | 342 | unlock_page(page); |
332 | submit_bio(bio); | 343 | submit_bio(bio); |
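get_swap_bio() now allocates the bio with room for hpage_nr_pages(page) segments so a THP can be written out as one multi-page bio, and count_swpout_vm_event() bumps THP_SWPOUT alongside PSWPOUT. A tiny arithmetic check of the invariant the new VM_BUG_ON asserts; the 4 KiB page and 512-subpage huge page are just the common x86-64 values, assumed for illustration:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE    4096UL
#define HPAGE_PMD_NR 512      /* 2 MiB huge page on x86-64, assumed for the demo */

int main(void)
{
	unsigned long bi_size = 0;
	int nr = HPAGE_PMD_NR;    /* what hpage_nr_pages() returns for a PMD-sized THP */

	for (int i = 0; i < nr; i++)
		bi_size += PAGE_SIZE;              /* one bio_add_page() per subpage */

	assert(bi_size == PAGE_SIZE * nr);         /* the VM_BUG_ON() condition */
	printf("bio carries %lu bytes (%d pages)\n", bi_size, nr);
	return 0;
}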
diff --git a/mm/page_owner.c b/mm/page_owner.c index 0fd9dcf2c5dc..8e2d7137510c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited); | |||
30 | 30 | ||
31 | static depot_stack_handle_t dummy_handle; | 31 | static depot_stack_handle_t dummy_handle; |
32 | static depot_stack_handle_t failure_handle; | 32 | static depot_stack_handle_t failure_handle; |
33 | static depot_stack_handle_t early_handle; | ||
33 | 34 | ||
34 | static void init_early_allocated_pages(void); | 35 | static void init_early_allocated_pages(void); |
35 | 36 | ||
@@ -53,7 +54,7 @@ static bool need_page_owner(void) | |||
53 | return true; | 54 | return true; |
54 | } | 55 | } |
55 | 56 | ||
56 | static noinline void register_dummy_stack(void) | 57 | static __always_inline depot_stack_handle_t create_dummy_stack(void) |
57 | { | 58 | { |
58 | unsigned long entries[4]; | 59 | unsigned long entries[4]; |
59 | struct stack_trace dummy; | 60 | struct stack_trace dummy; |
@@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void) | |||
64 | dummy.skip = 0; | 65 | dummy.skip = 0; |
65 | 66 | ||
66 | save_stack_trace(&dummy); | 67 | save_stack_trace(&dummy); |
67 | dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); | 68 | return depot_save_stack(&dummy, GFP_KERNEL); |
68 | } | 69 | } |
69 | 70 | ||
70 | static noinline void register_failure_stack(void) | 71 | static noinline void register_dummy_stack(void) |
71 | { | 72 | { |
72 | unsigned long entries[4]; | 73 | dummy_handle = create_dummy_stack(); |
73 | struct stack_trace failure; | 74 | } |
74 | 75 | ||
75 | failure.nr_entries = 0; | 76 | static noinline void register_failure_stack(void) |
76 | failure.max_entries = ARRAY_SIZE(entries); | 77 | { |
77 | failure.entries = &entries[0]; | 78 | failure_handle = create_dummy_stack(); |
78 | failure.skip = 0; | 79 | } |
79 | 80 | ||
80 | save_stack_trace(&failure); | 81 | static noinline void register_early_stack(void) |
81 | failure_handle = depot_save_stack(&failure, GFP_KERNEL); | 82 | { |
83 | early_handle = create_dummy_stack(); | ||
82 | } | 84 | } |
83 | 85 | ||
84 | static void init_page_owner(void) | 86 | static void init_page_owner(void) |
@@ -88,6 +90,7 @@ static void init_page_owner(void) | |||
88 | 90 | ||
89 | register_dummy_stack(); | 91 | register_dummy_stack(); |
90 | register_failure_stack(); | 92 | register_failure_stack(); |
93 | register_early_stack(); | ||
91 | static_branch_enable(&page_owner_inited); | 94 | static_branch_enable(&page_owner_inited); |
92 | init_early_allocated_pages(); | 95 | init_early_allocated_pages(); |
93 | } | 96 | } |
@@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) | |||
165 | return handle; | 168 | return handle; |
166 | } | 169 | } |
167 | 170 | ||
168 | noinline void __set_page_owner(struct page *page, unsigned int order, | 171 | static inline void __set_page_owner_handle(struct page_ext *page_ext, |
169 | gfp_t gfp_mask) | 172 | depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) |
170 | { | 173 | { |
171 | struct page_ext *page_ext = lookup_page_ext(page); | ||
172 | struct page_owner *page_owner; | 174 | struct page_owner *page_owner; |
173 | 175 | ||
174 | if (unlikely(!page_ext)) | ||
175 | return; | ||
176 | |||
177 | page_owner = get_page_owner(page_ext); | 176 | page_owner = get_page_owner(page_ext); |
178 | page_owner->handle = save_stack(gfp_mask); | 177 | page_owner->handle = handle; |
179 | page_owner->order = order; | 178 | page_owner->order = order; |
180 | page_owner->gfp_mask = gfp_mask; | 179 | page_owner->gfp_mask = gfp_mask; |
181 | page_owner->last_migrate_reason = -1; | 180 | page_owner->last_migrate_reason = -1; |
@@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order, | |||
183 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 182 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
184 | } | 183 | } |
185 | 184 | ||
185 | noinline void __set_page_owner(struct page *page, unsigned int order, | ||
186 | gfp_t gfp_mask) | ||
187 | { | ||
188 | struct page_ext *page_ext = lookup_page_ext(page); | ||
189 | depot_stack_handle_t handle; | ||
190 | |||
191 | if (unlikely(!page_ext)) | ||
192 | return; | ||
193 | |||
194 | handle = save_stack(gfp_mask); | ||
195 | __set_page_owner_handle(page_ext, handle, order, gfp_mask); | ||
196 | } | ||
197 | |||
186 | void __set_page_owner_migrate_reason(struct page *page, int reason) | 198 | void __set_page_owner_migrate_reason(struct page *page, int reason) |
187 | { | 199 | { |
188 | struct page_ext *page_ext = lookup_page_ext(page); | 200 | struct page_ext *page_ext = lookup_page_ext(page); |
@@ -550,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) | |||
550 | continue; | 562 | continue; |
551 | 563 | ||
552 | /* | 564 | /* |
553 | * We are safe to check buddy flag and order, because | 565 | * To avoid having to grab zone->lock, be a little |
554 | * this is init stage and only single thread runs. | 566 | * careful when reading buddy page order. The only |
567 | * danger is that we skip too much and potentially miss | ||
568 | * some early allocated pages, which is better than | ||
569 | * heavy lock contention. | ||
555 | */ | 570 | */ |
556 | if (PageBuddy(page)) { | 571 | if (PageBuddy(page)) { |
557 | pfn += (1UL << page_order(page)) - 1; | 572 | unsigned long order = page_order_unsafe(page); |
573 | |||
574 | if (order > 0 && order < MAX_ORDER) | ||
575 | pfn += (1UL << order) - 1; | ||
558 | continue; | 576 | continue; |
559 | } | 577 | } |
560 | 578 | ||
@@ -565,14 +583,15 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) | |||
565 | if (unlikely(!page_ext)) | 583 | if (unlikely(!page_ext)) |
566 | continue; | 584 | continue; |
567 | 585 | ||
568 | /* Maybe overraping zone */ | 586 | /* Maybe overlapping zone */ |
569 | if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | 587 | if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) |
570 | continue; | 588 | continue; |
571 | 589 | ||
572 | /* Found early allocated page */ | 590 | /* Found early allocated page */ |
573 | set_page_owner(page, 0, 0); | 591 | __set_page_owner_handle(page_ext, early_handle, 0, 0); |
574 | count++; | 592 | count++; |
575 | } | 593 | } |
594 | cond_resched(); | ||
576 | } | 595 | } |
577 | 596 | ||
578 | pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", | 597 | pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", |
@@ -583,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat) | |||
583 | { | 602 | { |
584 | struct zone *zone; | 603 | struct zone *zone; |
585 | struct zone *node_zones = pgdat->node_zones; | 604 | struct zone *node_zones = pgdat->node_zones; |
586 | unsigned long flags; | ||
587 | 605 | ||
588 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 606 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
589 | if (!populated_zone(zone)) | 607 | if (!populated_zone(zone)) |
590 | continue; | 608 | continue; |
591 | 609 | ||
592 | spin_lock_irqsave(&zone->lock, flags); | ||
593 | init_pages_in_zone(pgdat, zone); | 610 | init_pages_in_zone(pgdat, zone); |
594 | spin_unlock_irqrestore(&zone->lock, flags); | ||
595 | } | 611 | } |
596 | } | 612 | } |
597 | 613 | ||
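init_pages_in_zone() now scans without zone->lock, so the buddy order it reads may be stale; the new code only trusts the value after a range check (0 < order < MAX_ORDER) before skipping ahead, and cond_resched() keeps the long walk preemptible. A small userspace sketch of "clamp an untrusted skip value before using it"; constants and the fake order source are illustrative only:

#include <stdio.h>

#define MAX_ORDER 11   /* illustrative, matches the common kernel default */

/* Pretend this is a possibly-stale buddy order read without holding the zone lock. */
static unsigned long read_order_unsafe(unsigned long pfn)
{
	return (pfn % 7 == 0) ? 9 : 0;   /* arbitrary fake data */
}

int main(void)
{
	unsigned long pfn, end = 10000, visited = 0;

	for (pfn = 0; pfn < end; pfn++) {
		unsigned long order = read_order_unsafe(pfn);

		/* Only skip ahead if the value is plausible; otherwise fall through. */
		if (order > 0 && order < MAX_ORDER)
			pfn += (1UL << order) - 1;
		visited++;
	}
	printf("visited %lu pfns out of %lu\n", visited, end);
	return 0;
}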
diff --git a/mm/shmem.c b/mm/shmem.c index fbcb3c96a186..ace53a582be5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/uio.h> | 35 | #include <linux/uio.h> |
36 | #include <linux/khugepaged.h> | 36 | #include <linux/khugepaged.h> |
37 | #include <linux/hugetlb.h> | ||
37 | 38 | ||
38 | #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ | 39 | #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ |
39 | 40 | ||
@@ -188,6 +189,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) | |||
188 | vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); | 189 | vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); |
189 | } | 190 | } |
190 | 191 | ||
192 | static inline bool shmem_inode_acct_block(struct inode *inode, long pages) | ||
193 | { | ||
194 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
195 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
196 | |||
197 | if (shmem_acct_block(info->flags, pages)) | ||
198 | return false; | ||
199 | |||
200 | if (sbinfo->max_blocks) { | ||
201 | if (percpu_counter_compare(&sbinfo->used_blocks, | ||
202 | sbinfo->max_blocks - pages) > 0) | ||
203 | goto unacct; | ||
204 | percpu_counter_add(&sbinfo->used_blocks, pages); | ||
205 | } | ||
206 | |||
207 | return true; | ||
208 | |||
209 | unacct: | ||
210 | shmem_unacct_blocks(info->flags, pages); | ||
211 | return false; | ||
212 | } | ||
213 | |||
214 | static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) | ||
215 | { | ||
216 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
217 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
218 | |||
219 | if (sbinfo->max_blocks) | ||
220 | percpu_counter_sub(&sbinfo->used_blocks, pages); | ||
221 | shmem_unacct_blocks(info->flags, pages); | ||
222 | } | ||
223 | |||
191 | static const struct super_operations shmem_ops; | 224 | static const struct super_operations shmem_ops; |
192 | static const struct address_space_operations shmem_aops; | 225 | static const struct address_space_operations shmem_aops; |
193 | static const struct file_operations shmem_file_operations; | 226 | static const struct file_operations shmem_file_operations; |
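shmem_inode_acct_block()/shmem_inode_unacct_blocks() fold the "charge used_blocks against max_blocks, and roll back the per-flags accounting on failure" sequence into one helper pair, which the later hunks then call from shmem_charge(), shmem_alloc_and_acct_page() and the userfaultfd path. A compact userspace model of the reserve-with-rollback idea; plain integers stand in for the percpu counter and the names are illustrative:

#include <stdbool.h>
#include <stdio.h>

static long max_blocks = 4;    /* 0 would mean "no limit", as for an unsized tmpfs */
static long used_blocks;
static long acct_reserved;     /* stand-in for the shmem_acct_block() side */

static bool inode_acct_block(long pages)
{
	acct_reserved += pages;                  /* first stage of accounting */

	if (max_blocks) {
		if (used_blocks + pages > max_blocks) {
			acct_reserved -= pages;  /* roll back stage one on failure */
			return false;
		}
		used_blocks += pages;
	}
	return true;
}

static void inode_unacct_blocks(long pages)
{
	if (max_blocks)
		used_blocks -= pages;
	acct_reserved -= pages;
}

int main(void)
{
	printf("acct 3 -> %d\n", inode_acct_block(3));   /* 1: fits under the limit */
	printf("acct 2 -> %d\n", inode_acct_block(2));   /* 0: would exceed max_blocks */
	inode_unacct_blocks(3);
	printf("used=%ld reserved=%ld\n", used_blocks, acct_reserved);  /* 0 0 */
	return 0;
}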
@@ -249,23 +282,20 @@ static void shmem_recalc_inode(struct inode *inode) | |||
249 | 282 | ||
250 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; | 283 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; |
251 | if (freed > 0) { | 284 | if (freed > 0) { |
252 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
253 | if (sbinfo->max_blocks) | ||
254 | percpu_counter_add(&sbinfo->used_blocks, -freed); | ||
255 | info->alloced -= freed; | 285 | info->alloced -= freed; |
256 | inode->i_blocks -= freed * BLOCKS_PER_PAGE; | 286 | inode->i_blocks -= freed * BLOCKS_PER_PAGE; |
257 | shmem_unacct_blocks(info->flags, freed); | 287 | shmem_inode_unacct_blocks(inode, freed); |
258 | } | 288 | } |
259 | } | 289 | } |
260 | 290 | ||
261 | bool shmem_charge(struct inode *inode, long pages) | 291 | bool shmem_charge(struct inode *inode, long pages) |
262 | { | 292 | { |
263 | struct shmem_inode_info *info = SHMEM_I(inode); | 293 | struct shmem_inode_info *info = SHMEM_I(inode); |
264 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
265 | unsigned long flags; | 294 | unsigned long flags; |
266 | 295 | ||
267 | if (shmem_acct_block(info->flags, pages)) | 296 | if (!shmem_inode_acct_block(inode, pages)) |
268 | return false; | 297 | return false; |
298 | |||
269 | spin_lock_irqsave(&info->lock, flags); | 299 | spin_lock_irqsave(&info->lock, flags); |
270 | info->alloced += pages; | 300 | info->alloced += pages; |
271 | inode->i_blocks += pages * BLOCKS_PER_PAGE; | 301 | inode->i_blocks += pages * BLOCKS_PER_PAGE; |
@@ -273,26 +303,12 @@ bool shmem_charge(struct inode *inode, long pages) | |||
273 | spin_unlock_irqrestore(&info->lock, flags); | 303 | spin_unlock_irqrestore(&info->lock, flags); |
274 | inode->i_mapping->nrpages += pages; | 304 | inode->i_mapping->nrpages += pages; |
275 | 305 | ||
276 | if (!sbinfo->max_blocks) | ||
277 | return true; | ||
278 | if (percpu_counter_compare(&sbinfo->used_blocks, | ||
279 | sbinfo->max_blocks - pages) > 0) { | ||
280 | inode->i_mapping->nrpages -= pages; | ||
281 | spin_lock_irqsave(&info->lock, flags); | ||
282 | info->alloced -= pages; | ||
283 | shmem_recalc_inode(inode); | ||
284 | spin_unlock_irqrestore(&info->lock, flags); | ||
285 | shmem_unacct_blocks(info->flags, pages); | ||
286 | return false; | ||
287 | } | ||
288 | percpu_counter_add(&sbinfo->used_blocks, pages); | ||
289 | return true; | 306 | return true; |
290 | } | 307 | } |
291 | 308 | ||
292 | void shmem_uncharge(struct inode *inode, long pages) | 309 | void shmem_uncharge(struct inode *inode, long pages) |
293 | { | 310 | { |
294 | struct shmem_inode_info *info = SHMEM_I(inode); | 311 | struct shmem_inode_info *info = SHMEM_I(inode); |
295 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
296 | unsigned long flags; | 312 | unsigned long flags; |
297 | 313 | ||
298 | spin_lock_irqsave(&info->lock, flags); | 314 | spin_lock_irqsave(&info->lock, flags); |
@@ -301,9 +317,7 @@ void shmem_uncharge(struct inode *inode, long pages) | |||
301 | shmem_recalc_inode(inode); | 317 | shmem_recalc_inode(inode); |
302 | spin_unlock_irqrestore(&info->lock, flags); | 318 | spin_unlock_irqrestore(&info->lock, flags); |
303 | 319 | ||
304 | if (sbinfo->max_blocks) | 320 | shmem_inode_unacct_blocks(inode, pages); |
305 | percpu_counter_sub(&sbinfo->used_blocks, pages); | ||
306 | shmem_unacct_blocks(info->flags, pages); | ||
307 | } | 321 | } |
308 | 322 | ||
309 | /* | 323 | /* |
@@ -1452,9 +1466,10 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1452 | } | 1466 | } |
1453 | 1467 | ||
1454 | static struct page *shmem_alloc_and_acct_page(gfp_t gfp, | 1468 | static struct page *shmem_alloc_and_acct_page(gfp_t gfp, |
1455 | struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, | 1469 | struct inode *inode, |
1456 | pgoff_t index, bool huge) | 1470 | pgoff_t index, bool huge) |
1457 | { | 1471 | { |
1472 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
1458 | struct page *page; | 1473 | struct page *page; |
1459 | int nr; | 1474 | int nr; |
1460 | int err = -ENOSPC; | 1475 | int err = -ENOSPC; |
@@ -1463,14 +1478,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, | |||
1463 | huge = false; | 1478 | huge = false; |
1464 | nr = huge ? HPAGE_PMD_NR : 1; | 1479 | nr = huge ? HPAGE_PMD_NR : 1; |
1465 | 1480 | ||
1466 | if (shmem_acct_block(info->flags, nr)) | 1481 | if (!shmem_inode_acct_block(inode, nr)) |
1467 | goto failed; | 1482 | goto failed; |
1468 | if (sbinfo->max_blocks) { | ||
1469 | if (percpu_counter_compare(&sbinfo->used_blocks, | ||
1470 | sbinfo->max_blocks - nr) > 0) | ||
1471 | goto unacct; | ||
1472 | percpu_counter_add(&sbinfo->used_blocks, nr); | ||
1473 | } | ||
1474 | 1483 | ||
1475 | if (huge) | 1484 | if (huge) |
1476 | page = shmem_alloc_hugepage(gfp, info, index); | 1485 | page = shmem_alloc_hugepage(gfp, info, index); |
@@ -1483,10 +1492,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, | |||
1483 | } | 1492 | } |
1484 | 1493 | ||
1485 | err = -ENOMEM; | 1494 | err = -ENOMEM; |
1486 | if (sbinfo->max_blocks) | 1495 | shmem_inode_unacct_blocks(inode, nr); |
1487 | percpu_counter_add(&sbinfo->used_blocks, -nr); | ||
1488 | unacct: | ||
1489 | shmem_unacct_blocks(info->flags, nr); | ||
1490 | failed: | 1496 | failed: |
1491 | return ERR_PTR(err); | 1497 | return ERR_PTR(err); |
1492 | } | 1498 | } |
@@ -1644,7 +1650,7 @@ repeat: | |||
1644 | 1650 | ||
1645 | if (swap.val) { | 1651 | if (swap.val) { |
1646 | /* Look it up and read it in.. */ | 1652 | /* Look it up and read it in.. */ |
1647 | page = lookup_swap_cache(swap); | 1653 | page = lookup_swap_cache(swap, NULL, 0); |
1648 | if (!page) { | 1654 | if (!page) { |
1649 | /* Or update major stats only when swapin succeeds?? */ | 1655 | /* Or update major stats only when swapin succeeds?? */ |
1650 | if (fault_type) { | 1656 | if (fault_type) { |
@@ -1751,10 +1757,9 @@ repeat: | |||
1751 | } | 1757 | } |
1752 | 1758 | ||
1753 | alloc_huge: | 1759 | alloc_huge: |
1754 | page = shmem_alloc_and_acct_page(gfp, info, sbinfo, | 1760 | page = shmem_alloc_and_acct_page(gfp, inode, index, true); |
1755 | index, true); | ||
1756 | if (IS_ERR(page)) { | 1761 | if (IS_ERR(page)) { |
1757 | alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, | 1762 | alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, |
1758 | index, false); | 1763 | index, false); |
1759 | } | 1764 | } |
1760 | if (IS_ERR(page)) { | 1765 | if (IS_ERR(page)) { |
@@ -1876,10 +1881,7 @@ clear: | |||
1876 | * Error recovery. | 1881 | * Error recovery. |
1877 | */ | 1882 | */ |
1878 | unacct: | 1883 | unacct: |
1879 | if (sbinfo->max_blocks) | 1884 | shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); |
1880 | percpu_counter_sub(&sbinfo->used_blocks, | ||
1881 | 1 << compound_order(page)); | ||
1882 | shmem_unacct_blocks(info->flags, 1 << compound_order(page)); | ||
1883 | 1885 | ||
1884 | if (PageTransHuge(page)) { | 1886 | if (PageTransHuge(page)) { |
1885 | unlock_page(page); | 1887 | unlock_page(page); |
@@ -2206,16 +2208,16 @@ bool shmem_mapping(struct address_space *mapping) | |||
2206 | return mapping->a_ops == &shmem_aops; | 2208 | return mapping->a_ops == &shmem_aops; |
2207 | } | 2209 | } |
2208 | 2210 | ||
2209 | int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, | 2211 | static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, |
2210 | pmd_t *dst_pmd, | 2212 | pmd_t *dst_pmd, |
2211 | struct vm_area_struct *dst_vma, | 2213 | struct vm_area_struct *dst_vma, |
2212 | unsigned long dst_addr, | 2214 | unsigned long dst_addr, |
2213 | unsigned long src_addr, | 2215 | unsigned long src_addr, |
2214 | struct page **pagep) | 2216 | bool zeropage, |
2217 | struct page **pagep) | ||
2215 | { | 2218 | { |
2216 | struct inode *inode = file_inode(dst_vma->vm_file); | 2219 | struct inode *inode = file_inode(dst_vma->vm_file); |
2217 | struct shmem_inode_info *info = SHMEM_I(inode); | 2220 | struct shmem_inode_info *info = SHMEM_I(inode); |
2218 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
2219 | struct address_space *mapping = inode->i_mapping; | 2221 | struct address_space *mapping = inode->i_mapping; |
2220 | gfp_t gfp = mapping_gfp_mask(mapping); | 2222 | gfp_t gfp = mapping_gfp_mask(mapping); |
2221 | pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); | 2223 | pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); |
@@ -2227,33 +2229,30 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, | |||
2227 | int ret; | 2229 | int ret; |
2228 | 2230 | ||
2229 | ret = -ENOMEM; | 2231 | ret = -ENOMEM; |
2230 | if (shmem_acct_block(info->flags, 1)) | 2232 | if (!shmem_inode_acct_block(inode, 1)) |
2231 | goto out; | 2233 | goto out; |
2232 | if (sbinfo->max_blocks) { | ||
2233 | if (percpu_counter_compare(&sbinfo->used_blocks, | ||
2234 | sbinfo->max_blocks) >= 0) | ||
2235 | goto out_unacct_blocks; | ||
2236 | percpu_counter_inc(&sbinfo->used_blocks); | ||
2237 | } | ||
2238 | 2234 | ||
2239 | if (!*pagep) { | 2235 | if (!*pagep) { |
2240 | page = shmem_alloc_page(gfp, info, pgoff); | 2236 | page = shmem_alloc_page(gfp, info, pgoff); |
2241 | if (!page) | 2237 | if (!page) |
2242 | goto out_dec_used_blocks; | 2238 | goto out_unacct_blocks; |
2243 | 2239 | ||
2244 | page_kaddr = kmap_atomic(page); | 2240 | if (!zeropage) { /* mcopy_atomic */ |
2245 | ret = copy_from_user(page_kaddr, (const void __user *)src_addr, | 2241 | page_kaddr = kmap_atomic(page); |
2246 | PAGE_SIZE); | 2242 | ret = copy_from_user(page_kaddr, |
2247 | kunmap_atomic(page_kaddr); | 2243 | (const void __user *)src_addr, |
2248 | 2244 | PAGE_SIZE); | |
2249 | /* fallback to copy_from_user outside mmap_sem */ | 2245 | kunmap_atomic(page_kaddr); |
2250 | if (unlikely(ret)) { | 2246 | |
2251 | *pagep = page; | 2247 | /* fallback to copy_from_user outside mmap_sem */ |
2252 | if (sbinfo->max_blocks) | 2248 | if (unlikely(ret)) { |
2253 | percpu_counter_add(&sbinfo->used_blocks, -1); | 2249 | *pagep = page; |
2254 | shmem_unacct_blocks(info->flags, 1); | 2250 | shmem_inode_unacct_blocks(inode, 1); |
2255 | /* don't free the page */ | 2251 | /* don't free the page */ |
2256 | return -EFAULT; | 2252 | return -EFAULT; |
2253 | } | ||
2254 | } else { /* mfill_zeropage_atomic */ | ||
2255 | clear_highpage(page); | ||
2257 | } | 2256 | } |
2258 | } else { | 2257 | } else { |
2259 | page = *pagep; | 2258 | page = *pagep; |
@@ -2314,14 +2313,33 @@ out_release_uncharge: | |||
2314 | out_release: | 2313 | out_release: |
2315 | unlock_page(page); | 2314 | unlock_page(page); |
2316 | put_page(page); | 2315 | put_page(page); |
2317 | out_dec_used_blocks: | ||
2318 | if (sbinfo->max_blocks) | ||
2319 | percpu_counter_add(&sbinfo->used_blocks, -1); | ||
2320 | out_unacct_blocks: | 2316 | out_unacct_blocks: |
2321 | shmem_unacct_blocks(info->flags, 1); | 2317 | shmem_inode_unacct_blocks(inode, 1); |
2322 | goto out; | 2318 | goto out; |
2323 | } | 2319 | } |
2324 | 2320 | ||
2321 | int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, | ||
2322 | pmd_t *dst_pmd, | ||
2323 | struct vm_area_struct *dst_vma, | ||
2324 | unsigned long dst_addr, | ||
2325 | unsigned long src_addr, | ||
2326 | struct page **pagep) | ||
2327 | { | ||
2328 | return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, | ||
2329 | dst_addr, src_addr, false, pagep); | ||
2330 | } | ||
2331 | |||
2332 | int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, | ||
2333 | pmd_t *dst_pmd, | ||
2334 | struct vm_area_struct *dst_vma, | ||
2335 | unsigned long dst_addr) | ||
2336 | { | ||
2337 | struct page *page = NULL; | ||
2338 | |||
2339 | return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, | ||
2340 | dst_addr, 0, true, &page); | ||
2341 | } | ||
2342 | |||
2325 | #ifdef CONFIG_TMPFS | 2343 | #ifdef CONFIG_TMPFS |
2326 | static const struct inode_operations shmem_symlink_inode_operations; | 2344 | static const struct inode_operations shmem_symlink_inode_operations; |
2327 | static const struct inode_operations shmem_short_symlink_operations; | 2345 | static const struct inode_operations shmem_short_symlink_operations; |
@@ -3635,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) | |||
3635 | #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) | 3653 | #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) |
3636 | #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) | 3654 | #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) |
3637 | 3655 | ||
3638 | #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) | 3656 | #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) |
3639 | 3657 | ||
3640 | SYSCALL_DEFINE2(memfd_create, | 3658 | SYSCALL_DEFINE2(memfd_create, |
3641 | const char __user *, uname, | 3659 | const char __user *, uname, |
@@ -3647,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create, | |||
3647 | char *name; | 3665 | char *name; |
3648 | long len; | 3666 | long len; |
3649 | 3667 | ||
3650 | if (flags & ~(unsigned int)MFD_ALL_FLAGS) | 3668 | if (!(flags & MFD_HUGETLB)) { |
3651 | return -EINVAL; | 3669 | if (flags & ~(unsigned int)MFD_ALL_FLAGS) |
3670 | return -EINVAL; | ||
3671 | } else { | ||
3672 | /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ | ||
3673 | if (flags & MFD_ALLOW_SEALING) | ||
3674 | return -EINVAL; | ||
3675 | /* Allow huge page size encoding in flags. */ | ||
3676 | if (flags & ~(unsigned int)(MFD_ALL_FLAGS | | ||
3677 | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) | ||
3678 | return -EINVAL; | ||
3679 | } | ||
3652 | 3680 | ||
3653 | /* length includes terminating zero */ | 3681 | /* length includes terminating zero */ |
3654 | len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); | 3682 | len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); |
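The MFD_HUGETLB path rejects sealing and allows a huge page size to be encoded in the upper flag bits (MFD_HUGE_MASK << MFD_HUGE_SHIFT). A standalone sketch of just that validation step; the numeric flag values are restated from the uapi memfd header for a self-contained demo and should be treated as illustrative:

#include <errno.h>
#include <stdio.h>

#define MFD_CLOEXEC       0x0001U
#define MFD_ALLOW_SEALING 0x0002U
#define MFD_HUGETLB       0x0004U
#define MFD_HUGE_SHIFT    26
#define MFD_HUGE_MASK     0x3f
#define MFD_ALL_FLAGS     (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)

static int check_memfd_flags(unsigned int flags)
{
	if (!(flags & MFD_HUGETLB)) {
		if (flags & ~MFD_ALL_FLAGS)
			return -EINVAL;
	} else {
		/* Sealing not supported on hugetlbfs-backed memfds. */
		if (flags & MFD_ALLOW_SEALING)
			return -EINVAL;
		/* Allow a huge page size encoded in the high bits. */
		if (flags & ~(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
			return -EINVAL;
	}
	return 0;
}

int main(void)
{
	unsigned int huge_2mb = 21U << MFD_HUGE_SHIFT;   /* log2(2 MiB), i.e. a 2 MB hint */

	printf("%d\n", check_memfd_flags(MFD_CLOEXEC | MFD_ALLOW_SEALING));   /* 0 */
	printf("%d\n", check_memfd_flags(MFD_HUGETLB | huge_2mb));            /* 0 */
	printf("%d\n", check_memfd_flags(MFD_HUGETLB | MFD_ALLOW_SEALING));   /* -EINVAL */
	return 0;
}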
@@ -3679,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create, | |||
3679 | goto err_name; | 3707 | goto err_name; |
3680 | } | 3708 | } |
3681 | 3709 | ||
3682 | file = shmem_file_setup(name, 0, VM_NORESERVE); | 3710 | if (flags & MFD_HUGETLB) { |
3711 | struct user_struct *user = NULL; | ||
3712 | |||
3713 | file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, | ||
3714 | HUGETLB_ANONHUGE_INODE, | ||
3715 | (flags >> MFD_HUGE_SHIFT) & | ||
3716 | MFD_HUGE_MASK); | ||
3717 | } else | ||
3718 | file = shmem_file_setup(name, 0, VM_NORESERVE); | ||
3683 | if (IS_ERR(file)) { | 3719 | if (IS_ERR(file)) { |
3684 | error = PTR_ERR(file); | 3720 | error = PTR_ERR(file); |
3685 | goto err_fd; | 3721 | goto err_fd; |
3686 | } | 3722 | } |
3687 | info = SHMEM_I(file_inode(file)); | ||
3688 | file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; | 3723 | file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; |
3689 | file->f_flags |= O_RDWR | O_LARGEFILE; | 3724 | file->f_flags |= O_RDWR | O_LARGEFILE; |
3690 | if (flags & MFD_ALLOW_SEALING) | 3725 | |
3726 | if (flags & MFD_ALLOW_SEALING) { | ||
3727 | /* | ||
3728 | * flags check at beginning of function ensures | ||
3729 | * this is not a hugetlbfs (MFD_HUGETLB) file. | ||
3730 | */ | ||
3731 | info = SHMEM_I(file_inode(file)); | ||
3691 | info->seals &= ~F_SEAL_SEAL; | 3732 | info->seals &= ~F_SEAL_SEAL; |
3733 | } | ||
3692 | 3734 | ||
3693 | fd_install(fd, file); | 3735 | fd_install(fd, file); |
3694 | kfree(name); | 3736 | kfree(name); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/stacktrace.h> | 34 | #include <linux/stacktrace.h> |
35 | #include <linux/prefetch.h> | 35 | #include <linux/prefetch.h> |
36 | #include <linux/memcontrol.h> | 36 | #include <linux/memcontrol.h> |
37 | #include <linux/random.h> | ||
37 | 38 | ||
38 | #include <trace/events/kmem.h> | 39 | #include <trace/events/kmem.h> |
39 | 40 | ||
@@ -238,30 +239,62 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) | |||
238 | * Core slab cache functions | 239 | * Core slab cache functions |
239 | *******************************************************************/ | 240 | *******************************************************************/ |
240 | 241 | ||
242 | /* | ||
243 | * Returns freelist pointer (ptr). With hardening, this is obfuscated | ||
244 | * with an XOR of the address where the pointer is held and a per-cache | ||
245 | * random number. | ||
246 | */ | ||
247 | static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, | ||
248 | unsigned long ptr_addr) | ||
249 | { | ||
250 | #ifdef CONFIG_SLAB_FREELIST_HARDENED | ||
251 | return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); | ||
252 | #else | ||
253 | return ptr; | ||
254 | #endif | ||
255 | } | ||
256 | |||
257 | /* Returns the freelist pointer recorded at location ptr_addr. */ | ||
258 | static inline void *freelist_dereference(const struct kmem_cache *s, | ||
259 | void *ptr_addr) | ||
260 | { | ||
261 | return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), | ||
262 | (unsigned long)ptr_addr); | ||
263 | } | ||
264 | |||
241 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | 265 | static inline void *get_freepointer(struct kmem_cache *s, void *object) |
242 | { | 266 | { |
243 | return *(void **)(object + s->offset); | 267 | return freelist_dereference(s, object + s->offset); |
244 | } | 268 | } |
245 | 269 | ||
246 | static void prefetch_freepointer(const struct kmem_cache *s, void *object) | 270 | static void prefetch_freepointer(const struct kmem_cache *s, void *object) |
247 | { | 271 | { |
248 | prefetch(object + s->offset); | 272 | if (object) |
273 | prefetch(freelist_dereference(s, object + s->offset)); | ||
249 | } | 274 | } |
250 | 275 | ||
251 | static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) | 276 | static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) |
252 | { | 277 | { |
278 | unsigned long freepointer_addr; | ||
253 | void *p; | 279 | void *p; |
254 | 280 | ||
255 | if (!debug_pagealloc_enabled()) | 281 | if (!debug_pagealloc_enabled()) |
256 | return get_freepointer(s, object); | 282 | return get_freepointer(s, object); |
257 | 283 | ||
258 | probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); | 284 | freepointer_addr = (unsigned long)object + s->offset; |
259 | return p; | 285 | probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); |
286 | return freelist_ptr(s, p, freepointer_addr); | ||
260 | } | 287 | } |
261 | 288 | ||
262 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | 289 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) |
263 | { | 290 | { |
264 | *(void **)(object + s->offset) = fp; | 291 | unsigned long freeptr_addr = (unsigned long)object + s->offset; |
292 | |||
293 | #ifdef CONFIG_SLAB_FREELIST_HARDENED | ||
294 | BUG_ON(object == fp); /* naive detection of double free or corruption */ | ||
295 | #endif | ||
296 | |||
297 | *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); | ||
265 | } | 298 | } |
266 | 299 | ||
267 | /* Loop over all objects in a slab */ | 300 | /* Loop over all objects in a slab */ |
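CONFIG_SLAB_FREELIST_HARDENED stores each free-list link XORed with a per-cache random value and with the address the link lives at, so a leaked or overwritten freelist entry no longer exposes or forges a usable pointer directly. A minimal userspace demonstration of the same obfuscation round trip; this is not the kernel code, and the fixed key below merely stands in for s->random:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Per-"cache" random key, analogous to s->random. */
static uintptr_t cache_random = 0x9e3779b97f4a7c15ULL;

/* Obfuscate or deobfuscate a freelist pointer stored at ptr_addr (XOR is its own inverse). */
static void *freelist_ptr(void *ptr, uintptr_t ptr_addr)
{
	return (void *)((uintptr_t)ptr ^ cache_random ^ ptr_addr);
}

int main(void)
{
	void *slot[2];                       /* pretend free objects */
	void **link = &slot[0];              /* location the next pointer is stored at */

	/* set_freepointer(): store the obfuscated link. */
	*link = freelist_ptr(&slot[1], (uintptr_t)link);
	printf("raw value in memory:  %p\n", *link);     /* not a valid pointer */

	/* get_freepointer(): XOR again with the same key and address to recover it. */
	void *next = freelist_ptr(*link, (uintptr_t)link);
	printf("decoded next object: %p (expected %p)\n", next, (void *)&slot[1]);

	return next == &slot[1] ? EXIT_SUCCESS : EXIT_FAILURE;
}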
@@ -3358,8 +3391,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
3358 | struct kmem_cache_node *n; | 3391 | struct kmem_cache_node *n; |
3359 | 3392 | ||
3360 | for_each_kmem_cache_node(s, node, n) { | 3393 | for_each_kmem_cache_node(s, node, n) { |
3361 | kmem_cache_free(kmem_cache_node, n); | ||
3362 | s->node[node] = NULL; | 3394 | s->node[node] = NULL; |
3395 | kmem_cache_free(kmem_cache_node, n); | ||
3363 | } | 3396 | } |
3364 | } | 3397 | } |
3365 | 3398 | ||
@@ -3389,8 +3422,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) | |||
3389 | return 0; | 3422 | return 0; |
3390 | } | 3423 | } |
3391 | 3424 | ||
3392 | s->node[node] = n; | ||
3393 | init_kmem_cache_node(n); | 3425 | init_kmem_cache_node(n); |
3426 | s->node[node] = n; | ||
3394 | } | 3427 | } |
3395 | return 1; | 3428 | return 1; |
3396 | } | 3429 | } |
@@ -3563,6 +3596,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | |||
3563 | { | 3596 | { |
3564 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); | 3597 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); |
3565 | s->reserved = 0; | 3598 | s->reserved = 0; |
3599 | #ifdef CONFIG_SLAB_FREELIST_HARDENED | ||
3600 | s->random = get_random_long(); | ||
3601 | #endif | ||
3566 | 3602 | ||
3567 | if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) | 3603 | if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) |
3568 | s->reserved = sizeof(struct rcu_head); | 3604 | s->reserved = sizeof(struct rcu_head); |
@@ -5423,7 +5459,7 @@ static struct attribute *slab_attrs[] = { | |||
5423 | NULL | 5459 | NULL |
5424 | }; | 5460 | }; |
5425 | 5461 | ||
5426 | static struct attribute_group slab_attr_group = { | 5462 | static const struct attribute_group slab_attr_group = { |
5427 | .attrs = slab_attrs, | 5463 | .attrs = slab_attrs, |
5428 | }; | 5464 | }; |
5429 | 5465 | ||
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c50b1a14d55e..d1a39b8051e0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
54 | if (slab_is_available()) { | 54 | if (slab_is_available()) { |
55 | struct page *page; | 55 | struct page *page; |
56 | 56 | ||
57 | if (node_state(node, N_HIGH_MEMORY)) | 57 | page = alloc_pages_node(node, |
58 | page = alloc_pages_node( | 58 | GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, |
59 | node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, | 59 | get_order(size)); |
60 | get_order(size)); | ||
61 | else | ||
62 | page = alloc_pages( | ||
63 | GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, | ||
64 | get_order(size)); | ||
65 | if (page) | 60 | if (page) |
66 | return page_address(page); | 61 | return page_address(page); |
67 | return NULL; | 62 | return NULL; |
diff --git a/mm/sparse.c b/mm/sparse.c index 7b4be3fd5cac..a9783acf2bb9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) | |||
65 | unsigned long array_size = SECTIONS_PER_ROOT * | 65 | unsigned long array_size = SECTIONS_PER_ROOT * |
66 | sizeof(struct mem_section); | 66 | sizeof(struct mem_section); |
67 | 67 | ||
68 | if (slab_is_available()) { | 68 | if (slab_is_available()) |
69 | if (node_state(nid, N_HIGH_MEMORY)) | 69 | section = kzalloc_node(array_size, GFP_KERNEL, nid); |
70 | section = kzalloc_node(array_size, GFP_KERNEL, nid); | 70 | else |
71 | else | ||
72 | section = kzalloc(array_size, GFP_KERNEL); | ||
73 | } else { | ||
74 | section = memblock_virt_alloc_node(array_size, nid); | 71 | section = memblock_virt_alloc_node(array_size, nid); |
75 | } | ||
76 | 72 | ||
77 | return section; | 73 | return section; |
78 | } | 74 | } |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -946,28 +946,34 @@ | |||
946 | } | 946 | } |
947 | 947 | ||
948 | /** | 948 | /** |
949 | * pagevec_lookup - gang pagecache lookup | 949 | * pagevec_lookup_range - gang pagecache lookup |
950 | * @pvec: Where the resulting pages are placed | 950 | * @pvec: Where the resulting pages are placed |
951 | * @mapping: The address_space to search | 951 | * @mapping: The address_space to search |
952 | * @start: The starting page index | 952 | * @start: The starting page index |
953 | * @end: The final page index | ||
953 | * @nr_pages: The maximum number of pages | 954 | * @nr_pages: The maximum number of pages |
954 | * | 955 | * |
955 | * pagevec_lookup() will search for and return a group of up to @nr_pages pages | 956 | * pagevec_lookup_range() will search for and return a group of up to @nr_pages |
956 | * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a | 957 | * pages in the mapping starting from index @start and upto index @end |
958 | * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a | ||
957 | * reference against the pages in @pvec. | 959 | * reference against the pages in @pvec. |
958 | * | 960 | * |
959 | * The search returns a group of mapping-contiguous pages with ascending | 961 | * The search returns a group of mapping-contiguous pages with ascending |
960 | * indexes. There may be holes in the indices due to not-present pages. | 962 | * indexes. There may be holes in the indices due to not-present pages. We |
963 | * also update @start to index the next page for the traversal. | ||
961 | * | 964 | * |
962 | * pagevec_lookup() returns the number of pages which were found. | 965 | * pagevec_lookup_range() returns the number of pages which were found. If this |
966 | * number is smaller than @nr_pages, the end of specified range has been | ||
967 | * reached. | ||
963 | */ | 968 | */ |
964 | unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, | 969 | unsigned pagevec_lookup_range(struct pagevec *pvec, |
965 | pgoff_t start, unsigned nr_pages) | 970 | struct address_space *mapping, pgoff_t *start, pgoff_t end) |
966 | { | 971 | { |
967 | pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); | 972 | pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE, |
973 | pvec->pages); | ||
968 | return pagevec_count(pvec); | 974 | return pagevec_count(pvec); |
969 | } | 975 | } |
970 | EXPORT_SYMBOL(pagevec_lookup); | 976 | EXPORT_SYMBOL(pagevec_lookup_range); |
971 | 977 | ||
972 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | 978 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, |
973 | pgoff_t *index, int tag, unsigned nr_pages) | 979 | pgoff_t *index, int tag, unsigned nr_pages) |
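With the new signature, pagevec_lookup_range() advances *start past the last page it returned, so callers no longer maintain the index themselves and can terminate on @end plus the zero return. A hedged sketch of the resulting caller loop (mapping, start and end are assumed to be supplied by the caller):

	struct pagevec pvec;
	pgoff_t index = start;
	unsigned i;

	pagevec_init(&pvec, 0);	/* second argument is the "cold" hint in this kernel */
	while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			/* ... inspect or process page ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}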
diff --git a/mm/swap_state.c b/mm/swap_state.c index b68c93014f50..71ce2d1ccbf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = { | |||
37 | 37 | ||
38 | struct address_space *swapper_spaces[MAX_SWAPFILES]; | 38 | struct address_space *swapper_spaces[MAX_SWAPFILES]; |
39 | static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; | 39 | static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; |
40 | bool swap_vma_readahead = true; | ||
41 | |||
42 | #define SWAP_RA_MAX_ORDER_DEFAULT 3 | ||
43 | |||
44 | static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT; | ||
45 | |||
46 | #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) | ||
47 | #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) | ||
48 | #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK | ||
49 | #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) | ||
50 | |||
51 | #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) | ||
52 | #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) | ||
53 | #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) | ||
54 | |||
55 | #define SWAP_RA_VAL(addr, win, hits) \ | ||
56 | (((addr) & PAGE_MASK) | \ | ||
57 | (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ | ||
58 | ((hits) & SWAP_RA_HITS_MASK)) | ||
59 | |||
60 | /* Initial readahead hits is 4 to start up with a small window */ | ||
61 | #define GET_SWAP_RA_VAL(vma) \ | ||
62 | (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) | ||
40 | 63 | ||
41 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) | 64 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) |
42 | #define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) | 65 | #define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) |
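The SWAP_RA_* macros pack the last faulting address, the current readahead window and the hit count into a single long so that vma->swap_readahead_info can be read and updated with plain atomic_long operations, without a lock. A small userspace round-trip of the encoding (illustrative only; assumes 4 KB pages, so the window and hit fields get 6 bits each in the low 12 bits):

	#include <assert.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(~(PAGE_SIZE - 1))

	#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
	#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
	#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

	#define SWAP_RA_HITS(v)	((v) & SWAP_RA_HITS_MASK)
	#define SWAP_RA_WIN(v)	(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
	#define SWAP_RA_ADDR(v)	((v) & PAGE_MASK)
	#define SWAP_RA_VAL(addr, win, hits)				\
		(((addr) & PAGE_MASK) |					\
		 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
		 ((hits) & SWAP_RA_HITS_MASK))

	int main(void)
	{
		unsigned long v = SWAP_RA_VAL(0x12345000UL, 8, 3);

		assert(SWAP_RA_ADDR(v) == 0x12345000UL);	/* page-aligned address */
		assert(SWAP_RA_WIN(v) == 8);			/* window: bits 6-11 */
		assert(SWAP_RA_HITS(v) == 3);			/* hits:   bits 0-5  */
		printf("packed %#lx -> addr %#lx win %lu hits %lu\n",
		       v, SWAP_RA_ADDR(v), SWAP_RA_WIN(v), SWAP_RA_HITS(v));
		return 0;
	}

Each field gets PAGE_SHIFT/2 bits, which bounds both the representable window and the saturating hit counter.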
@@ -297,19 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr) | |||
297 | * lock getting page table operations atomic even if we drop the page | 320 | * lock getting page table operations atomic even if we drop the page |
298 | * lock before returning. | 321 | * lock before returning. |
299 | */ | 322 | */ |
300 | struct page * lookup_swap_cache(swp_entry_t entry) | 323 | struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, |
324 | unsigned long addr) | ||
301 | { | 325 | { |
302 | struct page *page; | 326 | struct page *page; |
327 | unsigned long ra_info; | ||
328 | int win, hits, readahead; | ||
303 | 329 | ||
304 | page = find_get_page(swap_address_space(entry), swp_offset(entry)); | 330 | page = find_get_page(swap_address_space(entry), swp_offset(entry)); |
305 | 331 | ||
306 | if (page && likely(!PageTransCompound(page))) { | 332 | INC_CACHE_INFO(find_total); |
333 | if (page) { | ||
307 | INC_CACHE_INFO(find_success); | 334 | INC_CACHE_INFO(find_success); |
308 | if (TestClearPageReadahead(page)) | 335 | if (unlikely(PageTransCompound(page))) |
309 | atomic_inc(&swapin_readahead_hits); | 336 | return page; |
337 | readahead = TestClearPageReadahead(page); | ||
338 | if (vma) { | ||
339 | ra_info = GET_SWAP_RA_VAL(vma); | ||
340 | win = SWAP_RA_WIN(ra_info); | ||
341 | hits = SWAP_RA_HITS(ra_info); | ||
342 | if (readahead) | ||
343 | hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); | ||
344 | atomic_long_set(&vma->swap_readahead_info, | ||
345 | SWAP_RA_VAL(addr, win, hits)); | ||
346 | } | ||
347 | if (readahead) { | ||
348 | count_vm_event(SWAP_RA_HIT); | ||
349 | if (!vma) | ||
350 | atomic_inc(&swapin_readahead_hits); | ||
351 | } | ||
310 | } | 352 | } |
311 | |||
312 | INC_CACHE_INFO(find_total); | ||
313 | return page; | 353 | return page; |
314 | } | 354 | } |
315 | 355 | ||
@@ -424,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
424 | return retpage; | 464 | return retpage; |
425 | } | 465 | } |
426 | 466 | ||
427 | static unsigned long swapin_nr_pages(unsigned long offset) | 467 | static unsigned int __swapin_nr_pages(unsigned long prev_offset, |
468 | unsigned long offset, | ||
469 | int hits, | ||
470 | int max_pages, | ||
471 | int prev_win) | ||
428 | { | 472 | { |
429 | static unsigned long prev_offset; | 473 | unsigned int pages, last_ra; |
430 | unsigned int pages, max_pages, last_ra; | ||
431 | static atomic_t last_readahead_pages; | ||
432 | |||
433 | max_pages = 1 << READ_ONCE(page_cluster); | ||
434 | if (max_pages <= 1) | ||
435 | return 1; | ||
436 | 474 | ||
437 | /* | 475 | /* |
438 | * This heuristic has been found to work well on both sequential and | 476 | * This heuristic has been found to work well on both sequential and |
439 | * random loads, swapping to hard disk or to SSD: please don't ask | 477 | * random loads, swapping to hard disk or to SSD: please don't ask |
440 | * what the "+ 2" means, it just happens to work well, that's all. | 478 | * what the "+ 2" means, it just happens to work well, that's all. |
441 | */ | 479 | */ |
442 | pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; | 480 | pages = hits + 2; |
443 | if (pages == 2) { | 481 | if (pages == 2) { |
444 | /* | 482 | /* |
445 | * We can have no readahead hits to judge by: but must not get | 483 | * We can have no readahead hits to judge by: but must not get |
@@ -448,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset) | |||
448 | */ | 486 | */ |
449 | if (offset != prev_offset + 1 && offset != prev_offset - 1) | 487 | if (offset != prev_offset + 1 && offset != prev_offset - 1) |
450 | pages = 1; | 488 | pages = 1; |
451 | prev_offset = offset; | ||
452 | } else { | 489 | } else { |
453 | unsigned int roundup = 4; | 490 | unsigned int roundup = 4; |
454 | while (roundup < pages) | 491 | while (roundup < pages) |
@@ -460,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset) | |||
460 | pages = max_pages; | 497 | pages = max_pages; |
461 | 498 | ||
462 | /* Don't shrink readahead too fast */ | 499 | /* Don't shrink readahead too fast */ |
463 | last_ra = atomic_read(&last_readahead_pages) / 2; | 500 | last_ra = prev_win / 2; |
464 | if (pages < last_ra) | 501 | if (pages < last_ra) |
465 | pages = last_ra; | 502 | pages = last_ra; |
503 | |||
504 | return pages; | ||
505 | } | ||
506 | |||
507 | static unsigned long swapin_nr_pages(unsigned long offset) | ||
508 | { | ||
509 | static unsigned long prev_offset; | ||
510 | unsigned int hits, pages, max_pages; | ||
511 | static atomic_t last_readahead_pages; | ||
512 | |||
513 | max_pages = 1 << READ_ONCE(page_cluster); | ||
514 | if (max_pages <= 1) | ||
515 | return 1; | ||
516 | |||
517 | hits = atomic_xchg(&swapin_readahead_hits, 0); | ||
518 | pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, | ||
519 | atomic_read(&last_readahead_pages)); | ||
520 | if (!hits) | ||
521 | prev_offset = offset; | ||
466 | atomic_set(&last_readahead_pages, pages); | 522 | atomic_set(&last_readahead_pages, pages); |
467 | 523 | ||
468 | return pages; | 524 | return pages; |
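Splitting the heuristic out as __swapin_nr_pages() changes no behaviour; it only lets the VMA-based path below feed in its own prev_offset, hits and previous window instead of the global statics. Worked through with page_cluster = 3 (max_pages = 8): three recent readahead hits give pages = 3 + 2 = 5, rounded up to the next power of two (8) and capped at max_pages, so the window grows to 8; with no hits and a fault that is not adjacent to prev_offset, pages drops to 1, but the "don't shrink readahead too fast" rule lifts it back to prev_win / 2, so a previous window of 8 decays to 4 rather than collapsing immediately.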
@@ -496,7 +552,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
496 | unsigned long start_offset, end_offset; | 552 | unsigned long start_offset, end_offset; |
497 | unsigned long mask; | 553 | unsigned long mask; |
498 | struct blk_plug plug; | 554 | struct blk_plug plug; |
499 | bool do_poll = true; | 555 | bool do_poll = true, page_allocated; |
500 | 556 | ||
501 | mask = swapin_nr_pages(offset) - 1; | 557 | mask = swapin_nr_pages(offset) - 1; |
502 | if (!mask) | 558 | if (!mask) |
@@ -512,12 +568,19 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
512 | blk_start_plug(&plug); | 568 | blk_start_plug(&plug); |
513 | for (offset = start_offset; offset <= end_offset ; offset++) { | 569 | for (offset = start_offset; offset <= end_offset ; offset++) { |
514 | /* Ok, do the async read-ahead now */ | 570 | /* Ok, do the async read-ahead now */ |
515 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 571 | page = __read_swap_cache_async( |
516 | gfp_mask, vma, addr, false); | 572 | swp_entry(swp_type(entry), offset), |
573 | gfp_mask, vma, addr, &page_allocated); | ||
517 | if (!page) | 574 | if (!page) |
518 | continue; | 575 | continue; |
519 | if (offset != entry_offset && likely(!PageTransCompound(page))) | 576 | if (page_allocated) { |
520 | SetPageReadahead(page); | 577 | swap_readpage(page, false); |
578 | if (offset != entry_offset && | ||
579 | likely(!PageTransCompound(page))) { | ||
580 | SetPageReadahead(page); | ||
581 | count_vm_event(SWAP_RA); | ||
582 | } | ||
583 | } | ||
521 | put_page(page); | 584 | put_page(page); |
522 | } | 585 | } |
523 | blk_finish_plug(&plug); | 586 | blk_finish_plug(&plug); |
@@ -561,3 +624,210 @@ void exit_swap_address_space(unsigned int type) | |||
561 | synchronize_rcu(); | 624 | synchronize_rcu(); |
562 | kvfree(spaces); | 625 | kvfree(spaces); |
563 | } | 626 | } |
627 | |||
628 | static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, | ||
629 | unsigned long faddr, | ||
630 | unsigned long lpfn, | ||
631 | unsigned long rpfn, | ||
632 | unsigned long *start, | ||
633 | unsigned long *end) | ||
634 | { | ||
635 | *start = max3(lpfn, PFN_DOWN(vma->vm_start), | ||
636 | PFN_DOWN(faddr & PMD_MASK)); | ||
637 | *end = min3(rpfn, PFN_DOWN(vma->vm_end), | ||
638 | PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); | ||
639 | } | ||
640 | |||
641 | struct page *swap_readahead_detect(struct vm_fault *vmf, | ||
642 | struct vma_swap_readahead *swap_ra) | ||
643 | { | ||
644 | struct vm_area_struct *vma = vmf->vma; | ||
645 | unsigned long swap_ra_info; | ||
646 | struct page *page; | ||
647 | swp_entry_t entry; | ||
648 | unsigned long faddr, pfn, fpfn; | ||
649 | unsigned long start, end; | ||
650 | pte_t *pte; | ||
651 | unsigned int max_win, hits, prev_win, win, left; | ||
652 | #ifndef CONFIG_64BIT | ||
653 | pte_t *tpte; | ||
654 | #endif | ||
655 | |||
656 | faddr = vmf->address; | ||
657 | entry = pte_to_swp_entry(vmf->orig_pte); | ||
658 | if ((unlikely(non_swap_entry(entry)))) | ||
659 | return NULL; | ||
660 | page = lookup_swap_cache(entry, vma, faddr); | ||
661 | if (page) | ||
662 | return page; | ||
663 | |||
664 | max_win = 1 << READ_ONCE(swap_ra_max_order); | ||
665 | if (max_win == 1) { | ||
666 | swap_ra->win = 1; | ||
667 | return NULL; | ||
668 | } | ||
669 | |||
670 | fpfn = PFN_DOWN(faddr); | ||
671 | swap_ra_info = GET_SWAP_RA_VAL(vma); | ||
672 | pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); | ||
673 | prev_win = SWAP_RA_WIN(swap_ra_info); | ||
674 | hits = SWAP_RA_HITS(swap_ra_info); | ||
675 | swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, | ||
676 | max_win, prev_win); | ||
677 | atomic_long_set(&vma->swap_readahead_info, | ||
678 | SWAP_RA_VAL(faddr, win, 0)); | ||
679 | |||
680 | if (win == 1) | ||
681 | return NULL; | ||
682 | |||
683 | /* Copy the PTEs because the page table may be unmapped */ | ||
684 | if (fpfn == pfn + 1) | ||
685 | swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); | ||
686 | else if (pfn == fpfn + 1) | ||
687 | swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, | ||
688 | &start, &end); | ||
689 | else { | ||
690 | left = (win - 1) / 2; | ||
691 | swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, | ||
692 | &start, &end); | ||
693 | } | ||
694 | swap_ra->nr_pte = end - start; | ||
695 | swap_ra->offset = fpfn - start; | ||
696 | pte = vmf->pte - swap_ra->offset; | ||
697 | #ifdef CONFIG_64BIT | ||
698 | swap_ra->ptes = pte; | ||
699 | #else | ||
700 | tpte = swap_ra->ptes; | ||
701 | for (pfn = start; pfn != end; pfn++) | ||
702 | *tpte++ = *pte++; | ||
703 | #endif | ||
704 | |||
705 | return NULL; | ||
706 | } | ||
707 | |||
708 | struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, | ||
709 | struct vm_fault *vmf, | ||
710 | struct vma_swap_readahead *swap_ra) | ||
711 | { | ||
712 | struct blk_plug plug; | ||
713 | struct vm_area_struct *vma = vmf->vma; | ||
714 | struct page *page; | ||
715 | pte_t *pte, pentry; | ||
716 | swp_entry_t entry; | ||
717 | unsigned int i; | ||
718 | bool page_allocated; | ||
719 | |||
720 | if (swap_ra->win == 1) | ||
721 | goto skip; | ||
722 | |||
723 | blk_start_plug(&plug); | ||
724 | for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; | ||
725 | i++, pte++) { | ||
726 | pentry = *pte; | ||
727 | if (pte_none(pentry)) | ||
728 | continue; | ||
729 | if (pte_present(pentry)) | ||
730 | continue; | ||
731 | entry = pte_to_swp_entry(pentry); | ||
732 | if (unlikely(non_swap_entry(entry))) | ||
733 | continue; | ||
734 | page = __read_swap_cache_async(entry, gfp_mask, vma, | ||
735 | vmf->address, &page_allocated); | ||
736 | if (!page) | ||
737 | continue; | ||
738 | if (page_allocated) { | ||
739 | swap_readpage(page, false); | ||
740 | if (i != swap_ra->offset && | ||
741 | likely(!PageTransCompound(page))) { | ||
742 | SetPageReadahead(page); | ||
743 | count_vm_event(SWAP_RA); | ||
744 | } | ||
745 | } | ||
746 | put_page(page); | ||
747 | } | ||
748 | blk_finish_plug(&plug); | ||
749 | lru_add_drain(); | ||
750 | skip: | ||
751 | return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, | ||
752 | swap_ra->win == 1); | ||
753 | } | ||
754 | |||
755 | #ifdef CONFIG_SYSFS | ||
756 | static ssize_t vma_ra_enabled_show(struct kobject *kobj, | ||
757 | struct kobj_attribute *attr, char *buf) | ||
758 | { | ||
759 | return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); | ||
760 | } | ||
761 | static ssize_t vma_ra_enabled_store(struct kobject *kobj, | ||
762 | struct kobj_attribute *attr, | ||
763 | const char *buf, size_t count) | ||
764 | { | ||
765 | if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) | ||
766 | swap_vma_readahead = true; | ||
767 | else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) | ||
768 | swap_vma_readahead = false; | ||
769 | else | ||
770 | return -EINVAL; | ||
771 | |||
772 | return count; | ||
773 | } | ||
774 | static struct kobj_attribute vma_ra_enabled_attr = | ||
775 | __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, | ||
776 | vma_ra_enabled_store); | ||
777 | |||
778 | static ssize_t vma_ra_max_order_show(struct kobject *kobj, | ||
779 | struct kobj_attribute *attr, char *buf) | ||
780 | { | ||
781 | return sprintf(buf, "%d\n", swap_ra_max_order); | ||
782 | } | ||
783 | static ssize_t vma_ra_max_order_store(struct kobject *kobj, | ||
784 | struct kobj_attribute *attr, | ||
785 | const char *buf, size_t count) | ||
786 | { | ||
787 | int err, v; | ||
788 | |||
789 | err = kstrtoint(buf, 10, &v); | ||
790 | if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) | ||
791 | return -EINVAL; | ||
792 | |||
793 | swap_ra_max_order = v; | ||
794 | |||
795 | return count; | ||
796 | } | ||
797 | static struct kobj_attribute vma_ra_max_order_attr = | ||
798 | __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, | ||
799 | vma_ra_max_order_store); | ||
800 | |||
801 | static struct attribute *swap_attrs[] = { | ||
802 | &vma_ra_enabled_attr.attr, | ||
803 | &vma_ra_max_order_attr.attr, | ||
804 | NULL, | ||
805 | }; | ||
806 | |||
807 | static struct attribute_group swap_attr_group = { | ||
808 | .attrs = swap_attrs, | ||
809 | }; | ||
810 | |||
811 | static int __init swap_init_sysfs(void) | ||
812 | { | ||
813 | int err; | ||
814 | struct kobject *swap_kobj; | ||
815 | |||
816 | swap_kobj = kobject_create_and_add("swap", mm_kobj); | ||
817 | if (!swap_kobj) { | ||
818 | pr_err("failed to create swap kobject\n"); | ||
819 | return -ENOMEM; | ||
820 | } | ||
821 | err = sysfs_create_group(swap_kobj, &swap_attr_group); | ||
822 | if (err) { | ||
823 | pr_err("failed to register swap group\n"); | ||
824 | goto delete_obj; | ||
825 | } | ||
826 | return 0; | ||
827 | |||
828 | delete_obj: | ||
829 | kobject_put(swap_kobj); | ||
830 | return err; | ||
831 | } | ||
832 | subsys_initcall(swap_init_sysfs); | ||
833 | #endif | ||
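swap_readahead_detect() sizes the window with the same heuristic as above, then places it around the faulting pfn: entirely ahead when the fault follows the previously recorded address, entirely behind when it precedes it, and roughly centred otherwise. swap_ra_clamp_pfn() then trims the result to both the VMA and the PMD covering the fault, so the copied PTE snapshot never crosses a page-table page. The two attributes registered at the bottom should appear as vma_ra_enabled and vma_ra_max_order under /sys/kernel/mm/swap/ (mm_kobj). A userspace restatement of the clamp, assuming 4 KB pages and 2 MB PMDs (512 pages); names here are illustrative, not the kernel's:

	#define PMD_NR_PAGES	512UL

	static void clamp_window(unsigned long fault_pfn,
				 unsigned long lpfn, unsigned long rpfn,
				 unsigned long vma_start_pfn,
				 unsigned long vma_end_pfn,
				 unsigned long *start, unsigned long *end)
	{
		unsigned long pmd_first = fault_pfn & ~(PMD_NR_PAGES - 1);

		*start = lpfn;
		if (*start < vma_start_pfn)
			*start = vma_start_pfn;
		if (*start < pmd_first)
			*start = pmd_first;		/* max3() in the kernel */

		*end = rpfn;
		if (*end > vma_end_pfn)
			*end = vma_end_pfn;
		if (*end > pmd_first + PMD_NR_PAGES)
			*end = pmd_first + PMD_NR_PAGES;	/* min3() */
	}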
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ba4aab2db0b..d483278ee35b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages; | |||
60 | EXPORT_SYMBOL_GPL(nr_swap_pages); | 60 | EXPORT_SYMBOL_GPL(nr_swap_pages); |
61 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | 61 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
62 | long total_swap_pages; | 62 | long total_swap_pages; |
63 | static int least_priority; | 63 | static int least_priority = -1; |
64 | 64 | ||
65 | static const char Bad_file[] = "Bad swap file entry "; | 65 | static const char Bad_file[] = "Bad swap file entry "; |
66 | static const char Unused_file[] = "Unused swap file entry "; | 66 | static const char Unused_file[] = "Unused swap file entry "; |
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); | |||
85 | * is held and the locking order requires swap_lock to be taken | 85 | * is held and the locking order requires swap_lock to be taken |
86 | * before any swap_info_struct->lock. | 86 | * before any swap_info_struct->lock. |
87 | */ | 87 | */ |
88 | static PLIST_HEAD(swap_avail_head); | 88 | struct plist_head *swap_avail_heads; |
89 | static DEFINE_SPINLOCK(swap_avail_lock); | 89 | static DEFINE_SPINLOCK(swap_avail_lock); |
90 | 90 | ||
91 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 91 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); | |||
96 | /* Activity counter to indicate that a swapon or swapoff has occurred */ | 96 | /* Activity counter to indicate that a swapon or swapoff has occurred */ |
97 | static atomic_t proc_poll_event = ATOMIC_INIT(0); | 97 | static atomic_t proc_poll_event = ATOMIC_INIT(0); |
98 | 98 | ||
99 | atomic_t nr_rotate_swap = ATOMIC_INIT(0); | ||
100 | |||
99 | static inline unsigned char swap_count(unsigned char ent) | 101 | static inline unsigned char swap_count(unsigned char ent) |
100 | { | 102 | { |
101 | return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ | 103 | return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ |
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info) | |||
265 | info->data = 0; | 267 | info->data = 0; |
266 | } | 268 | } |
267 | 269 | ||
270 | static inline bool cluster_is_huge(struct swap_cluster_info *info) | ||
271 | { | ||
272 | return info->flags & CLUSTER_FLAG_HUGE; | ||
273 | } | ||
274 | |||
275 | static inline void cluster_clear_huge(struct swap_cluster_info *info) | ||
276 | { | ||
277 | info->flags &= ~CLUSTER_FLAG_HUGE; | ||
278 | } | ||
279 | |||
268 | static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, | 280 | static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, |
269 | unsigned long offset) | 281 | unsigned long offset) |
270 | { | 282 | { |
@@ -580,6 +592,21 @@ new_cluster: | |||
580 | return found_free; | 592 | return found_free; |
581 | } | 593 | } |
582 | 594 | ||
595 | static void __del_from_avail_list(struct swap_info_struct *p) | ||
596 | { | ||
597 | int nid; | ||
598 | |||
599 | for_each_node(nid) | ||
600 | plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); | ||
601 | } | ||
602 | |||
603 | static void del_from_avail_list(struct swap_info_struct *p) | ||
604 | { | ||
605 | spin_lock(&swap_avail_lock); | ||
606 | __del_from_avail_list(p); | ||
607 | spin_unlock(&swap_avail_lock); | ||
608 | } | ||
609 | |||
583 | static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, | 610 | static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, |
584 | unsigned int nr_entries) | 611 | unsigned int nr_entries) |
585 | { | 612 | { |
@@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, | |||
593 | if (si->inuse_pages == si->pages) { | 620 | if (si->inuse_pages == si->pages) { |
594 | si->lowest_bit = si->max; | 621 | si->lowest_bit = si->max; |
595 | si->highest_bit = 0; | 622 | si->highest_bit = 0; |
596 | spin_lock(&swap_avail_lock); | 623 | del_from_avail_list(si); |
597 | plist_del(&si->avail_list, &swap_avail_head); | 624 | } |
598 | spin_unlock(&swap_avail_lock); | 625 | } |
626 | |||
627 | static void add_to_avail_list(struct swap_info_struct *p) | ||
628 | { | ||
629 | int nid; | ||
630 | |||
631 | spin_lock(&swap_avail_lock); | ||
632 | for_each_node(nid) { | ||
633 | WARN_ON(!plist_node_empty(&p->avail_lists[nid])); | ||
634 | plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); | ||
599 | } | 635 | } |
636 | spin_unlock(&swap_avail_lock); | ||
600 | } | 637 | } |
601 | 638 | ||
602 | static void swap_range_free(struct swap_info_struct *si, unsigned long offset, | 639 | static void swap_range_free(struct swap_info_struct *si, unsigned long offset, |
@@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, | |||
611 | bool was_full = !si->highest_bit; | 648 | bool was_full = !si->highest_bit; |
612 | 649 | ||
613 | si->highest_bit = end; | 650 | si->highest_bit = end; |
614 | if (was_full && (si->flags & SWP_WRITEOK)) { | 651 | if (was_full && (si->flags & SWP_WRITEOK)) |
615 | spin_lock(&swap_avail_lock); | 652 | add_to_avail_list(si); |
616 | WARN_ON(!plist_node_empty(&si->avail_list)); | ||
617 | if (plist_node_empty(&si->avail_list)) | ||
618 | plist_add(&si->avail_list, &swap_avail_head); | ||
619 | spin_unlock(&swap_avail_lock); | ||
620 | } | ||
621 | } | 653 | } |
622 | atomic_long_add(nr_entries, &nr_swap_pages); | 654 | atomic_long_add(nr_entries, &nr_swap_pages); |
623 | si->inuse_pages -= nr_entries; | 655 | si->inuse_pages -= nr_entries; |
@@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) | |||
846 | offset = idx * SWAPFILE_CLUSTER; | 878 | offset = idx * SWAPFILE_CLUSTER; |
847 | ci = lock_cluster(si, offset); | 879 | ci = lock_cluster(si, offset); |
848 | alloc_cluster(si, idx); | 880 | alloc_cluster(si, idx); |
849 | cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); | 881 | cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); |
850 | 882 | ||
851 | map = si->swap_map + offset; | 883 | map = si->swap_map + offset; |
852 | for (i = 0; i < SWAPFILE_CLUSTER; i++) | 884 | for (i = 0; i < SWAPFILE_CLUSTER; i++) |
@@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) | |||
898 | struct swap_info_struct *si, *next; | 930 | struct swap_info_struct *si, *next; |
899 | long avail_pgs; | 931 | long avail_pgs; |
900 | int n_ret = 0; | 932 | int n_ret = 0; |
933 | int node; | ||
901 | 934 | ||
902 | /* Only single cluster request supported */ | 935 | /* Only single cluster request supported */ |
903 | WARN_ON_ONCE(n_goal > 1 && cluster); | 936 | WARN_ON_ONCE(n_goal > 1 && cluster); |
@@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) | |||
917 | spin_lock(&swap_avail_lock); | 950 | spin_lock(&swap_avail_lock); |
918 | 951 | ||
919 | start_over: | 952 | start_over: |
920 | plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | 953 | node = numa_node_id(); |
954 | plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { | ||
921 | /* requeue si to after same-priority siblings */ | 955 | /* requeue si to after same-priority siblings */ |
922 | plist_requeue(&si->avail_list, &swap_avail_head); | 956 | plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); |
923 | spin_unlock(&swap_avail_lock); | 957 | spin_unlock(&swap_avail_lock); |
924 | spin_lock(&si->lock); | 958 | spin_lock(&si->lock); |
925 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { | 959 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
926 | spin_lock(&swap_avail_lock); | 960 | spin_lock(&swap_avail_lock); |
927 | if (plist_node_empty(&si->avail_list)) { | 961 | if (plist_node_empty(&si->avail_lists[node])) { |
928 | spin_unlock(&si->lock); | 962 | spin_unlock(&si->lock); |
929 | goto nextsi; | 963 | goto nextsi; |
930 | } | 964 | } |
@@ -934,13 +968,14 @@ start_over: | |||
934 | WARN(!(si->flags & SWP_WRITEOK), | 968 | WARN(!(si->flags & SWP_WRITEOK), |
935 | "swap_info %d in list but !SWP_WRITEOK\n", | 969 | "swap_info %d in list but !SWP_WRITEOK\n", |
936 | si->type); | 970 | si->type); |
937 | plist_del(&si->avail_list, &swap_avail_head); | 971 | __del_from_avail_list(si); |
938 | spin_unlock(&si->lock); | 972 | spin_unlock(&si->lock); |
939 | goto nextsi; | 973 | goto nextsi; |
940 | } | 974 | } |
941 | if (cluster) | 975 | if (cluster) { |
942 | n_ret = swap_alloc_cluster(si, swp_entries); | 976 | if (!(si->flags & SWP_FILE)) |
943 | else | 977 | n_ret = swap_alloc_cluster(si, swp_entries); |
978 | } else | ||
944 | n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, | 979 | n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, |
945 | n_goal, swp_entries); | 980 | n_goal, swp_entries); |
946 | spin_unlock(&si->lock); | 981 | spin_unlock(&si->lock); |
@@ -962,7 +997,7 @@ nextsi: | |||
962 | * swap_avail_head list then try it, otherwise start over | 997 | * swap_avail_head list then try it, otherwise start over |
963 | * if we have not gotten any slots. | 998 | * if we have not gotten any slots. |
964 | */ | 999 | */ |
965 | if (plist_node_empty(&next->avail_list)) | 1000 | if (plist_node_empty(&next->avail_lists[node])) |
966 | goto start_over; | 1001 | goto start_over; |
967 | } | 1002 | } |
968 | 1003 | ||
@@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry) | |||
1168 | struct swap_cluster_info *ci; | 1203 | struct swap_cluster_info *ci; |
1169 | struct swap_info_struct *si; | 1204 | struct swap_info_struct *si; |
1170 | unsigned char *map; | 1205 | unsigned char *map; |
1171 | unsigned int i; | 1206 | unsigned int i, free_entries = 0; |
1207 | unsigned char val; | ||
1172 | 1208 | ||
1173 | si = swap_info_get(entry); | 1209 | si = _swap_info_get(entry); |
1174 | if (!si) | 1210 | if (!si) |
1175 | return; | 1211 | return; |
1176 | 1212 | ||
1177 | ci = lock_cluster(si, offset); | 1213 | ci = lock_cluster(si, offset); |
1214 | VM_BUG_ON(!cluster_is_huge(ci)); | ||
1178 | map = si->swap_map + offset; | 1215 | map = si->swap_map + offset; |
1179 | for (i = 0; i < SWAPFILE_CLUSTER; i++) { | 1216 | for (i = 0; i < SWAPFILE_CLUSTER; i++) { |
1180 | VM_BUG_ON(map[i] != SWAP_HAS_CACHE); | 1217 | val = map[i]; |
1181 | map[i] = 0; | 1218 | VM_BUG_ON(!(val & SWAP_HAS_CACHE)); |
1219 | if (val == SWAP_HAS_CACHE) | ||
1220 | free_entries++; | ||
1221 | } | ||
1222 | if (!free_entries) { | ||
1223 | for (i = 0; i < SWAPFILE_CLUSTER; i++) | ||
1224 | map[i] &= ~SWAP_HAS_CACHE; | ||
1182 | } | 1225 | } |
1226 | cluster_clear_huge(ci); | ||
1183 | unlock_cluster(ci); | 1227 | unlock_cluster(ci); |
1184 | mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); | 1228 | if (free_entries == SWAPFILE_CLUSTER) { |
1185 | swap_free_cluster(si, idx); | 1229 | spin_lock(&si->lock); |
1186 | spin_unlock(&si->lock); | 1230 | ci = lock_cluster(si, offset); |
1231 | memset(map, 0, SWAPFILE_CLUSTER); | ||
1232 | unlock_cluster(ci); | ||
1233 | mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); | ||
1234 | swap_free_cluster(si, idx); | ||
1235 | spin_unlock(&si->lock); | ||
1236 | } else if (free_entries) { | ||
1237 | for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { | ||
1238 | if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) | ||
1239 | free_swap_slot(entry); | ||
1240 | } | ||
1241 | } | ||
1242 | } | ||
1243 | |||
1244 | int split_swap_cluster(swp_entry_t entry) | ||
1245 | { | ||
1246 | struct swap_info_struct *si; | ||
1247 | struct swap_cluster_info *ci; | ||
1248 | unsigned long offset = swp_offset(entry); | ||
1249 | |||
1250 | si = _swap_info_get(entry); | ||
1251 | if (!si) | ||
1252 | return -EBUSY; | ||
1253 | ci = lock_cluster(si, offset); | ||
1254 | cluster_clear_huge(ci); | ||
1255 | unlock_cluster(ci); | ||
1256 | return 0; | ||
1187 | } | 1257 | } |
1188 | #else | 1258 | #else |
1189 | static inline void swapcache_free_cluster(swp_entry_t entry) | 1259 | static inline void swapcache_free_cluster(swp_entry_t entry) |
@@ -1332,29 +1402,161 @@ out: | |||
1332 | return count; | 1402 | return count; |
1333 | } | 1403 | } |
1334 | 1404 | ||
1405 | #ifdef CONFIG_THP_SWAP | ||
1406 | static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, | ||
1407 | swp_entry_t entry) | ||
1408 | { | ||
1409 | struct swap_cluster_info *ci; | ||
1410 | unsigned char *map = si->swap_map; | ||
1411 | unsigned long roffset = swp_offset(entry); | ||
1412 | unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); | ||
1413 | int i; | ||
1414 | bool ret = false; | ||
1415 | |||
1416 | ci = lock_cluster_or_swap_info(si, offset); | ||
1417 | if (!ci || !cluster_is_huge(ci)) { | ||
1418 | if (map[roffset] != SWAP_HAS_CACHE) | ||
1419 | ret = true; | ||
1420 | goto unlock_out; | ||
1421 | } | ||
1422 | for (i = 0; i < SWAPFILE_CLUSTER; i++) { | ||
1423 | if (map[offset + i] != SWAP_HAS_CACHE) { | ||
1424 | ret = true; | ||
1425 | break; | ||
1426 | } | ||
1427 | } | ||
1428 | unlock_out: | ||
1429 | unlock_cluster_or_swap_info(si, ci); | ||
1430 | return ret; | ||
1431 | } | ||
1432 | |||
1433 | static bool page_swapped(struct page *page) | ||
1434 | { | ||
1435 | swp_entry_t entry; | ||
1436 | struct swap_info_struct *si; | ||
1437 | |||
1438 | if (likely(!PageTransCompound(page))) | ||
1439 | return page_swapcount(page) != 0; | ||
1440 | |||
1441 | page = compound_head(page); | ||
1442 | entry.val = page_private(page); | ||
1443 | si = _swap_info_get(entry); | ||
1444 | if (si) | ||
1445 | return swap_page_trans_huge_swapped(si, entry); | ||
1446 | return false; | ||
1447 | } | ||
1448 | |||
1449 | static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, | ||
1450 | int *total_swapcount) | ||
1451 | { | ||
1452 | int i, map_swapcount, _total_mapcount, _total_swapcount; | ||
1453 | unsigned long offset = 0; | ||
1454 | struct swap_info_struct *si; | ||
1455 | struct swap_cluster_info *ci = NULL; | ||
1456 | unsigned char *map = NULL; | ||
1457 | int mapcount, swapcount = 0; | ||
1458 | |||
1459 | /* hugetlbfs shouldn't call it */ | ||
1460 | VM_BUG_ON_PAGE(PageHuge(page), page); | ||
1461 | |||
1462 | if (likely(!PageTransCompound(page))) { | ||
1463 | mapcount = atomic_read(&page->_mapcount) + 1; | ||
1464 | if (total_mapcount) | ||
1465 | *total_mapcount = mapcount; | ||
1466 | if (PageSwapCache(page)) | ||
1467 | swapcount = page_swapcount(page); | ||
1468 | if (total_swapcount) | ||
1469 | *total_swapcount = swapcount; | ||
1470 | return mapcount + swapcount; | ||
1471 | } | ||
1472 | |||
1473 | page = compound_head(page); | ||
1474 | |||
1475 | _total_mapcount = _total_swapcount = map_swapcount = 0; | ||
1476 | if (PageSwapCache(page)) { | ||
1477 | swp_entry_t entry; | ||
1478 | |||
1479 | entry.val = page_private(page); | ||
1480 | si = _swap_info_get(entry); | ||
1481 | if (si) { | ||
1482 | map = si->swap_map; | ||
1483 | offset = swp_offset(entry); | ||
1484 | } | ||
1485 | } | ||
1486 | if (map) | ||
1487 | ci = lock_cluster(si, offset); | ||
1488 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
1489 | mapcount = atomic_read(&page[i]._mapcount) + 1; | ||
1490 | _total_mapcount += mapcount; | ||
1491 | if (map) { | ||
1492 | swapcount = swap_count(map[offset + i]); | ||
1493 | _total_swapcount += swapcount; | ||
1494 | } | ||
1495 | map_swapcount = max(map_swapcount, mapcount + swapcount); | ||
1496 | } | ||
1497 | unlock_cluster(ci); | ||
1498 | if (PageDoubleMap(page)) { | ||
1499 | map_swapcount -= 1; | ||
1500 | _total_mapcount -= HPAGE_PMD_NR; | ||
1501 | } | ||
1502 | mapcount = compound_mapcount(page); | ||
1503 | map_swapcount += mapcount; | ||
1504 | _total_mapcount += mapcount; | ||
1505 | if (total_mapcount) | ||
1506 | *total_mapcount = _total_mapcount; | ||
1507 | if (total_swapcount) | ||
1508 | *total_swapcount = _total_swapcount; | ||
1509 | |||
1510 | return map_swapcount; | ||
1511 | } | ||
1512 | #else | ||
1513 | #define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) | ||
1514 | #define page_swapped(page) (page_swapcount(page) != 0) | ||
1515 | |||
1516 | static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, | ||
1517 | int *total_swapcount) | ||
1518 | { | ||
1519 | int mapcount, swapcount = 0; | ||
1520 | |||
1521 | /* hugetlbfs shouldn't call it */ | ||
1522 | VM_BUG_ON_PAGE(PageHuge(page), page); | ||
1523 | |||
1524 | mapcount = page_trans_huge_mapcount(page, total_mapcount); | ||
1525 | if (PageSwapCache(page)) | ||
1526 | swapcount = page_swapcount(page); | ||
1527 | if (total_swapcount) | ||
1528 | *total_swapcount = swapcount; | ||
1529 | return mapcount + swapcount; | ||
1530 | } | ||
1531 | #endif | ||
1532 | |||
1335 | /* | 1533 | /* |
1336 | * We can write to an anon page without COW if there are no other references | 1534 | * We can write to an anon page without COW if there are no other references |
1337 | * to it. And as a side-effect, free up its swap: because the old content | 1535 | * to it. And as a side-effect, free up its swap: because the old content |
1338 | * on disk will never be read, and seeking back there to write new content | 1536 | * on disk will never be read, and seeking back there to write new content |
1339 | * later would only waste time away from clustering. | 1537 | * later would only waste time away from clustering. |
1340 | * | 1538 | * |
1341 | * NOTE: total_mapcount should not be relied upon by the caller if | 1539 | * NOTE: total_map_swapcount should not be relied upon by the caller if |
1342 | * reuse_swap_page() returns false, but it may be always overwritten | 1540 | * reuse_swap_page() returns false, but it may be always overwritten |
1343 | * (see the other implementation for CONFIG_SWAP=n). | 1541 | * (see the other implementation for CONFIG_SWAP=n). |
1344 | */ | 1542 | */ |
1345 | bool reuse_swap_page(struct page *page, int *total_mapcount) | 1543 | bool reuse_swap_page(struct page *page, int *total_map_swapcount) |
1346 | { | 1544 | { |
1347 | int count; | 1545 | int count, total_mapcount, total_swapcount; |
1348 | 1546 | ||
1349 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1547 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1350 | if (unlikely(PageKsm(page))) | 1548 | if (unlikely(PageKsm(page))) |
1351 | return false; | 1549 | return false; |
1352 | count = page_trans_huge_mapcount(page, total_mapcount); | 1550 | count = page_trans_huge_map_swapcount(page, &total_mapcount, |
1353 | if (count <= 1 && PageSwapCache(page)) { | 1551 | &total_swapcount); |
1354 | count += page_swapcount(page); | 1552 | if (total_map_swapcount) |
1355 | if (count != 1) | 1553 | *total_map_swapcount = total_mapcount + total_swapcount; |
1356 | goto out; | 1554 | if (count == 1 && PageSwapCache(page) && |
1555 | (likely(!PageTransCompound(page)) || | ||
1556 | /* The remaining swap count will be freed soon */ | ||
1557 | total_swapcount == page_swapcount(page))) { | ||
1357 | if (!PageWriteback(page)) { | 1558 | if (!PageWriteback(page)) { |
1559 | page = compound_head(page); | ||
1358 | delete_from_swap_cache(page); | 1560 | delete_from_swap_cache(page); |
1359 | SetPageDirty(page); | 1561 | SetPageDirty(page); |
1360 | } else { | 1562 | } else { |
@@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) | |||
1370 | spin_unlock(&p->lock); | 1572 | spin_unlock(&p->lock); |
1371 | } | 1573 | } |
1372 | } | 1574 | } |
1373 | out: | 1575 | |
1374 | return count <= 1; | 1576 | return count <= 1; |
1375 | } | 1577 | } |
1376 | 1578 | ||
@@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page) | |||
1386 | return 0; | 1588 | return 0; |
1387 | if (PageWriteback(page)) | 1589 | if (PageWriteback(page)) |
1388 | return 0; | 1590 | return 0; |
1389 | if (page_swapcount(page)) | 1591 | if (page_swapped(page)) |
1390 | return 0; | 1592 | return 0; |
1391 | 1593 | ||
1392 | /* | 1594 | /* |
@@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page) | |||
1407 | if (pm_suspended_storage()) | 1609 | if (pm_suspended_storage()) |
1408 | return 0; | 1610 | return 0; |
1409 | 1611 | ||
1612 | page = compound_head(page); | ||
1410 | delete_from_swap_cache(page); | 1613 | delete_from_swap_cache(page); |
1411 | SetPageDirty(page); | 1614 | SetPageDirty(page); |
1412 | return 1; | 1615 | return 1; |
@@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry) | |||
1428 | p = _swap_info_get(entry); | 1631 | p = _swap_info_get(entry); |
1429 | if (p) { | 1632 | if (p) { |
1430 | count = __swap_entry_free(p, entry, 1); | 1633 | count = __swap_entry_free(p, entry, 1); |
1431 | if (count == SWAP_HAS_CACHE) { | 1634 | if (count == SWAP_HAS_CACHE && |
1635 | !swap_page_trans_huge_swapped(p, entry)) { | ||
1432 | page = find_get_page(swap_address_space(entry), | 1636 | page = find_get_page(swap_address_space(entry), |
1433 | swp_offset(entry)); | 1637 | swp_offset(entry)); |
1434 | if (page && !trylock_page(page)) { | 1638 | if (page && !trylock_page(page)) { |
@@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry) | |||
1445 | */ | 1649 | */ |
1446 | if (PageSwapCache(page) && !PageWriteback(page) && | 1650 | if (PageSwapCache(page) && !PageWriteback(page) && |
1447 | (!page_mapped(page) || mem_cgroup_swap_full(page)) && | 1651 | (!page_mapped(page) || mem_cgroup_swap_full(page)) && |
1448 | !swap_swapcount(p, entry)) { | 1652 | !swap_page_trans_huge_swapped(p, entry)) { |
1653 | page = compound_head(page); | ||
1449 | delete_from_swap_cache(page); | 1654 | delete_from_swap_cache(page); |
1450 | SetPageDirty(page); | 1655 | SetPageDirty(page); |
1451 | } | 1656 | } |
@@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap, | |||
1999 | .sync_mode = WB_SYNC_NONE, | 2204 | .sync_mode = WB_SYNC_NONE, |
2000 | }; | 2205 | }; |
2001 | 2206 | ||
2002 | swap_writepage(page, &wbc); | 2207 | swap_writepage(compound_head(page), &wbc); |
2003 | lock_page(page); | 2208 | lock_page(page); |
2004 | wait_on_page_writeback(page); | 2209 | wait_on_page_writeback(page); |
2005 | } | 2210 | } |
@@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap, | |||
2012 | * delete, since it may not have been written out to swap yet. | 2217 | * delete, since it may not have been written out to swap yet. |
2013 | */ | 2218 | */ |
2014 | if (PageSwapCache(page) && | 2219 | if (PageSwapCache(page) && |
2015 | likely(page_private(page) == entry.val)) | 2220 | likely(page_private(page) == entry.val) && |
2016 | delete_from_swap_cache(page); | 2221 | !page_swapped(page)) |
2222 | delete_from_swap_cache(compound_head(page)); | ||
2017 | 2223 | ||
2018 | /* | 2224 | /* |
2019 | * So we could skip searching mms once swap count went | 2225 | * So we could skip searching mms once swap count went |
@@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
2226 | return generic_swapfile_activate(sis, swap_file, span); | 2432 | return generic_swapfile_activate(sis, swap_file, span); |
2227 | } | 2433 | } |
2228 | 2434 | ||
2435 | static int swap_node(struct swap_info_struct *p) | ||
2436 | { | ||
2437 | struct block_device *bdev; | ||
2438 | |||
2439 | if (p->bdev) | ||
2440 | bdev = p->bdev; | ||
2441 | else | ||
2442 | bdev = p->swap_file->f_inode->i_sb->s_bdev; | ||
2443 | |||
2444 | return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; | ||
2445 | } | ||
2446 | |||
2229 | static void _enable_swap_info(struct swap_info_struct *p, int prio, | 2447 | static void _enable_swap_info(struct swap_info_struct *p, int prio, |
2230 | unsigned char *swap_map, | 2448 | unsigned char *swap_map, |
2231 | struct swap_cluster_info *cluster_info) | 2449 | struct swap_cluster_info *cluster_info) |
2232 | { | 2450 | { |
2451 | int i; | ||
2452 | |||
2233 | if (prio >= 0) | 2453 | if (prio >= 0) |
2234 | p->prio = prio; | 2454 | p->prio = prio; |
2235 | else | 2455 | else |
@@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
2239 | * low-to-high, while swap ordering is high-to-low | 2459 | * low-to-high, while swap ordering is high-to-low |
2240 | */ | 2460 | */ |
2241 | p->list.prio = -p->prio; | 2461 | p->list.prio = -p->prio; |
2242 | p->avail_list.prio = -p->prio; | 2462 | for_each_node(i) { |
2463 | if (p->prio >= 0) | ||
2464 | p->avail_lists[i].prio = -p->prio; | ||
2465 | else { | ||
2466 | if (swap_node(p) == i) | ||
2467 | p->avail_lists[i].prio = 1; | ||
2468 | else | ||
2469 | p->avail_lists[i].prio = -p->prio; | ||
2470 | } | ||
2471 | } | ||
2243 | p->swap_map = swap_map; | 2472 | p->swap_map = swap_map; |
2244 | p->cluster_info = cluster_info; | 2473 | p->cluster_info = cluster_info; |
2245 | p->flags |= SWP_WRITEOK; | 2474 | p->flags |= SWP_WRITEOK; |
@@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
2258 | * swap_info_struct. | 2487 | * swap_info_struct. |
2259 | */ | 2488 | */ |
2260 | plist_add(&p->list, &swap_active_head); | 2489 | plist_add(&p->list, &swap_active_head); |
2261 | spin_lock(&swap_avail_lock); | 2490 | add_to_avail_list(p); |
2262 | plist_add(&p->avail_list, &swap_avail_head); | ||
2263 | spin_unlock(&swap_avail_lock); | ||
2264 | } | 2491 | } |
2265 | 2492 | ||
2266 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 2493 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
2345 | spin_unlock(&swap_lock); | 2572 | spin_unlock(&swap_lock); |
2346 | goto out_dput; | 2573 | goto out_dput; |
2347 | } | 2574 | } |
2348 | spin_lock(&swap_avail_lock); | 2575 | del_from_avail_list(p); |
2349 | plist_del(&p->avail_list, &swap_avail_head); | ||
2350 | spin_unlock(&swap_avail_lock); | ||
2351 | spin_lock(&p->lock); | 2576 | spin_lock(&p->lock); |
2352 | if (p->prio < 0) { | 2577 | if (p->prio < 0) { |
2353 | struct swap_info_struct *si = p; | 2578 | struct swap_info_struct *si = p; |
2579 | int nid; | ||
2354 | 2580 | ||
2355 | plist_for_each_entry_continue(si, &swap_active_head, list) { | 2581 | plist_for_each_entry_continue(si, &swap_active_head, list) { |
2356 | si->prio++; | 2582 | si->prio++; |
2357 | si->list.prio--; | 2583 | si->list.prio--; |
2358 | si->avail_list.prio--; | 2584 | for_each_node(nid) { |
2585 | if (si->avail_lists[nid].prio != 1) | ||
2586 | si->avail_lists[nid].prio--; | ||
2587 | } | ||
2359 | } | 2588 | } |
2360 | least_priority++; | 2589 | least_priority++; |
2361 | } | 2590 | } |
@@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
2387 | if (p->flags & SWP_CONTINUED) | 2616 | if (p->flags & SWP_CONTINUED) |
2388 | free_swap_count_continuations(p); | 2617 | free_swap_count_continuations(p); |
2389 | 2618 | ||
2619 | if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) | ||
2620 | atomic_dec(&nr_rotate_swap); | ||
2621 | |||
2390 | mutex_lock(&swapon_mutex); | 2622 | mutex_lock(&swapon_mutex); |
2391 | spin_lock(&swap_lock); | 2623 | spin_lock(&swap_lock); |
2392 | spin_lock(&p->lock); | 2624 | spin_lock(&p->lock); |
@@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2596 | { | 2828 | { |
2597 | struct swap_info_struct *p; | 2829 | struct swap_info_struct *p; |
2598 | unsigned int type; | 2830 | unsigned int type; |
2831 | int i; | ||
2599 | 2832 | ||
2600 | p = kzalloc(sizeof(*p), GFP_KERNEL); | 2833 | p = kzalloc(sizeof(*p), GFP_KERNEL); |
2601 | if (!p) | 2834 | if (!p) |
@@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2631 | } | 2864 | } |
2632 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2865 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2633 | plist_node_init(&p->list, 0); | 2866 | plist_node_init(&p->list, 0); |
2634 | plist_node_init(&p->avail_list, 0); | 2867 | for_each_node(i) |
2868 | plist_node_init(&p->avail_lists[i], 0); | ||
2635 | p->flags = SWP_USED; | 2869 | p->flags = SWP_USED; |
2636 | spin_unlock(&swap_lock); | 2870 | spin_unlock(&swap_lock); |
2637 | spin_lock_init(&p->lock); | 2871 | spin_lock_init(&p->lock); |
@@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2873 | if (!capable(CAP_SYS_ADMIN)) | 3107 | if (!capable(CAP_SYS_ADMIN)) |
2874 | return -EPERM; | 3108 | return -EPERM; |
2875 | 3109 | ||
3110 | if (!swap_avail_heads) | ||
3111 | return -ENOMEM; | ||
3112 | |||
2876 | p = alloc_swap_info(); | 3113 | p = alloc_swap_info(); |
2877 | if (IS_ERR(p)) | 3114 | if (IS_ERR(p)) |
2878 | return PTR_ERR(p); | 3115 | return PTR_ERR(p); |
@@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2963 | cluster = per_cpu_ptr(p->percpu_cluster, cpu); | 3200 | cluster = per_cpu_ptr(p->percpu_cluster, cpu); |
2964 | cluster_set_null(&cluster->index); | 3201 | cluster_set_null(&cluster->index); |
2965 | } | 3202 | } |
2966 | } | 3203 | } else |
3204 | atomic_inc(&nr_rotate_swap); | ||
2967 | 3205 | ||
2968 | error = swap_cgroup_swapon(p->type, maxpages); | 3206 | error = swap_cgroup_swapon(p->type, maxpages); |
2969 | if (error) | 3207 | if (error) |
@@ -3457,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si) | |||
3457 | } | 3695 | } |
3458 | } | 3696 | } |
3459 | } | 3697 | } |
3698 | |||
3699 | static int __init swapfile_init(void) | ||
3700 | { | ||
3701 | int nid; | ||
3702 | |||
3703 | swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), | ||
3704 | GFP_KERNEL); | ||
3705 | if (!swap_avail_heads) { | ||
3706 | pr_emerg("Not enough memory for swap heads, swap is disabled\n"); | ||
3707 | return -ENOMEM; | ||
3708 | } | ||
3709 | |||
3710 | for_each_node(nid) | ||
3711 | plist_head_init(&swap_avail_heads[nid]); | ||
3712 | |||
3713 | return 0; | ||
3714 | } | ||
3715 | subsys_initcall(swapfile_init); | ||
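The swapfile.c changes replace the single swap_avail_head with one plist per NUMA node, allocated in swapfile_init(). A device added with automatic (negative) priority is pinned at prio 1 on the node its block device sits on and takes the usual -p->prio everywhere else, which is also why least_priority now starts at -1: the first automatic device gets p->prio = -2 (assuming the usual p->prio = --least_priority assignment, which is not shown in this hunk), so its non-local list priority is 2 and can never beat a local device's 1. Worked example with two devices swapon'ed without explicit priority, one backed by node 0 and one by node 1: node 0's list orders them (local = 1, remote = 3) and node 1's list orders them (local = 1, remote = 2), so get_swap_pages() on each node fills its local device first and only spills to the remote one once the local one is full. The prio != 1 check added to the swapoff path keeps those pinned node-local entries from being renumbered along with the rest.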
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 8bcb501bce60..81192701964d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c | |||
@@ -371,6 +371,36 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, | |||
371 | bool zeropage); | 371 | bool zeropage); |
372 | #endif /* CONFIG_HUGETLB_PAGE */ | 372 | #endif /* CONFIG_HUGETLB_PAGE */ |
373 | 373 | ||
374 | static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, | ||
375 | pmd_t *dst_pmd, | ||
376 | struct vm_area_struct *dst_vma, | ||
377 | unsigned long dst_addr, | ||
378 | unsigned long src_addr, | ||
379 | struct page **page, | ||
380 | bool zeropage) | ||
381 | { | ||
382 | ssize_t err; | ||
383 | |||
384 | if (vma_is_anonymous(dst_vma)) { | ||
385 | if (!zeropage) | ||
386 | err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, | ||
387 | dst_addr, src_addr, page); | ||
388 | else | ||
389 | err = mfill_zeropage_pte(dst_mm, dst_pmd, | ||
390 | dst_vma, dst_addr); | ||
391 | } else { | ||
392 | if (!zeropage) | ||
393 | err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, | ||
394 | dst_vma, dst_addr, | ||
395 | src_addr, page); | ||
396 | else | ||
397 | err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, | ||
398 | dst_vma, dst_addr); | ||
399 | } | ||
400 | |||
401 | return err; | ||
402 | } | ||
403 | |||
374 | static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, | 404 | static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, |
375 | unsigned long dst_start, | 405 | unsigned long dst_start, |
376 | unsigned long src_start, | 406 | unsigned long src_start, |
@@ -487,22 +517,8 @@ retry: | |||
487 | BUG_ON(pmd_none(*dst_pmd)); | 517 | BUG_ON(pmd_none(*dst_pmd)); |
488 | BUG_ON(pmd_trans_huge(*dst_pmd)); | 518 | BUG_ON(pmd_trans_huge(*dst_pmd)); |
489 | 519 | ||
490 | if (vma_is_anonymous(dst_vma)) { | 520 | err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, |
491 | if (!zeropage) | 521 | src_addr, &page, zeropage); |
492 | err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, | ||
493 | dst_addr, src_addr, | ||
494 | &page); | ||
495 | else | ||
496 | err = mfill_zeropage_pte(dst_mm, dst_pmd, | ||
497 | dst_vma, dst_addr); | ||
498 | } else { | ||
499 | err = -EINVAL; /* if zeropage is true return -EINVAL */ | ||
500 | if (likely(!zeropage)) | ||
501 | err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, | ||
502 | dst_vma, dst_addr, | ||
503 | src_addr, &page); | ||
504 | } | ||
505 | |||
506 | cond_resched(); | 522 | cond_resched(); |
507 | 523 | ||
508 | if (unlikely(err == -EFAULT)) { | 524 | if (unlikely(err == -EFAULT)) { |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
614 | return 0; | 614 | return 0; |
615 | 615 | ||
616 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 616 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
617 | free = global_page_state(NR_FREE_PAGES); | 617 | free = global_zone_page_state(NR_FREE_PAGES); |
618 | free += global_node_page_state(NR_FILE_PAGES); | 618 | free += global_node_page_state(NR_FILE_PAGES); |
619 | 619 | ||
620 | /* | 620 | /* |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a47e3894c775..8a43db6284eb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -49,12 +49,10 @@ static void __vunmap(const void *, int); | |||
49 | static void free_work(struct work_struct *w) | 49 | static void free_work(struct work_struct *w) |
50 | { | 50 | { |
51 | struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); | 51 | struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); |
52 | struct llist_node *llnode = llist_del_all(&p->list); | 52 | struct llist_node *t, *llnode; |
53 | while (llnode) { | 53 | |
54 | void *p = llnode; | 54 | llist_for_each_safe(llnode, t, llist_del_all(&p->list)) |
55 | llnode = llist_next(llnode); | 55 | __vunmap((void *)llnode, 1); |
56 | __vunmap(p, 1); | ||
57 | } | ||
58 | } | 56 | } |
59 | 57 | ||
60 | /*** Page table manipulation functions ***/ | 58 | /*** Page table manipulation functions ***/ |
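free_work() is a pure cleanup: llist_del_all() already detaches the whole lock-free list in one atomic exchange, and llist_for_each_safe() is the stock iterator for the case where each node is freed from inside the loop body (here the node lives at the start of the vmalloc'd area being released). The same pattern in a generic, hypothetical deferred-free helper (deferred_item, pending and drain_pending are made-up names):

	struct deferred_item {
		struct llist_node node;
		/* kmalloc'ed payload ... */
	};

	static LLIST_HEAD(pending);	/* producers call llist_add(&item->node, &pending) */

	static void drain_pending(void)
	{
		struct llist_node *n, *t;

		llist_for_each_safe(n, t, llist_del_all(&pending))
			kfree(llist_entry(n, struct deferred_item, node));
	}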
@@ -2482,7 +2480,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, | |||
2482 | * matching slot. While scanning, if any of the areas overlaps with | 2480 | * matching slot. While scanning, if any of the areas overlaps with |
2483 | * existing vmap_area, the base address is pulled down to fit the | 2481 | * existing vmap_area, the base address is pulled down to fit the |
2484 | * area. Scanning is repeated till all the areas fit and then all | 2482 | * area. Scanning is repeated till all the areas fit and then all |
2485 | * necessary data structres are inserted and the result is returned. | 2483 | * necessary data structures are inserted and the result is returned. |
2486 | */ | 2484 | */ |
2487 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | 2485 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, |
2488 | const size_t *sizes, int nr_vms, | 2486 | const size_t *sizes, int nr_vms, |
@@ -2510,15 +2508,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2510 | if (start > offsets[last_area]) | 2508 | if (start > offsets[last_area]) |
2511 | last_area = area; | 2509 | last_area = area; |
2512 | 2510 | ||
2513 | for (area2 = 0; area2 < nr_vms; area2++) { | 2511 | for (area2 = area + 1; area2 < nr_vms; area2++) { |
2514 | unsigned long start2 = offsets[area2]; | 2512 | unsigned long start2 = offsets[area2]; |
2515 | unsigned long end2 = start2 + sizes[area2]; | 2513 | unsigned long end2 = start2 + sizes[area2]; |
2516 | 2514 | ||
2517 | if (area2 == area) | 2515 | BUG_ON(start2 < end && start < end2); |
2518 | continue; | ||
2519 | |||
2520 | BUG_ON(start2 >= start && start2 < end); | ||
2521 | BUG_ON(end2 <= end && end2 > start); | ||
2522 | } | 2516 | } |
2523 | } | 2517 | } |
2524 | last_end = offsets[last_area] + sizes[last_area]; | 2518 | last_end = offsets[last_area] + sizes[last_area]; |
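The rewritten overlap check relies on the standard interval test: two half-open ranges [s1, e1) and [s2, e2) intersect exactly when s1 < e2 && s2 < e1, so one BUG_ON per pair is enough, and starting the inner scan at area + 1 visits each unordered pair once instead of twice. As a standalone helper the test would read:

	/* Illustrative: half-open range overlap test equivalent to the BUG_ON above. */
	static inline bool ranges_overlap(unsigned long s1, unsigned long e1,
					  unsigned long s2, unsigned long e2)
	{
		return s1 < e2 && s2 < e1;
	}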
diff --git a/mm/vmscan.c b/mm/vmscan.c index f957afe900ec..13d711dd8776 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, | |||
393 | unsigned long nr_to_scan = min(batch_size, total_scan); | 393 | unsigned long nr_to_scan = min(batch_size, total_scan); |
394 | 394 | ||
395 | shrinkctl->nr_to_scan = nr_to_scan; | 395 | shrinkctl->nr_to_scan = nr_to_scan; |
396 | shrinkctl->nr_scanned = nr_to_scan; | ||
396 | ret = shrinker->scan_objects(shrinker, shrinkctl); | 397 | ret = shrinker->scan_objects(shrinker, shrinkctl); |
397 | if (ret == SHRINK_STOP) | 398 | if (ret == SHRINK_STOP) |
398 | break; | 399 | break; |
399 | freed += ret; | 400 | freed += ret; |
400 | 401 | ||
401 | count_vm_events(SLABS_SCANNED, nr_to_scan); | 402 | count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); |
402 | total_scan -= nr_to_scan; | 403 | total_scan -= shrinkctl->nr_scanned; |
403 | scanned += nr_to_scan; | 404 | scanned += shrinkctl->nr_scanned; |
404 | 405 | ||
405 | cond_resched(); | 406 | cond_resched(); |
406 | } | 407 | } |
@@ -535,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page) | |||
535 | * that isolated the page, the page cache radix tree and | 536 | * that isolated the page, the page cache radix tree and |
536 | * optional buffer heads at page->private. | 537 | * optional buffer heads at page->private. |
537 | */ | 538 | */ |
538 | return page_count(page) - page_has_private(page) == 2; | 539 | int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? |
540 | HPAGE_PMD_NR : 1; | ||
541 | return page_count(page) - page_has_private(page) == 1 + radix_pins; | ||
539 | } | 542 | } |
540 | 543 | ||
541 | static int may_write_to_inode(struct inode *inode, struct scan_control *sc) | 544 | static int may_write_to_inode(struct inode *inode, struct scan_control *sc) |
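With 4 KB base pages and 2 MB THPs, HPAGE_PMD_NR is 512, so a huge page sitting in the swap cache is pinned by 512 radix-tree references rather than one. is_page_cache_freeable() therefore expects page_count() == 513 for a clean, private-less THP under reclaim (512 cache pins plus the isolating caller's reference), and __remove_mapping() below freezes the refcount against the same 1 + HPAGE_PMD_NR value before tearing the page out of the swap cache.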
@@ -665,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, | |||
665 | bool reclaimed) | 668 | bool reclaimed) |
666 | { | 669 | { |
667 | unsigned long flags; | 670 | unsigned long flags; |
671 | int refcount; | ||
668 | 672 | ||
669 | BUG_ON(!PageLocked(page)); | 673 | BUG_ON(!PageLocked(page)); |
670 | BUG_ON(mapping != page_mapping(page)); | 674 | BUG_ON(mapping != page_mapping(page)); |
@@ -695,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, | |||
695 | * Note that if SetPageDirty is always performed via set_page_dirty, | 699 | * Note that if SetPageDirty is always performed via set_page_dirty, |
696 | * and thus under tree_lock, then this ordering is not required. | 700 | * and thus under tree_lock, then this ordering is not required. |
697 | */ | 701 | */ |
698 | if (!page_ref_freeze(page, 2)) | 702 | if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) |
703 | refcount = 1 + HPAGE_PMD_NR; | ||
704 | else | ||
705 | refcount = 2; | ||
706 | if (!page_ref_freeze(page, refcount)) | ||
699 | goto cannot_free; | 707 | goto cannot_free; |
700 | /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ | 708 | /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ |
701 | if (unlikely(PageDirty(page))) { | 709 | if (unlikely(PageDirty(page))) { |
702 | page_ref_unfreeze(page, 2); | 710 | page_ref_unfreeze(page, refcount); |
703 | goto cannot_free; | 711 | goto cannot_free; |
704 | } | 712 | } |
705 | 713 | ||
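
Both hunks above account for the extra page-cache pins a transparent huge page holds while it sits in the swap cache: one radix-tree reference per base page instead of a single one. A minimal userspace sketch of that arithmetic, assuming 4 KiB base pages and 2 MiB THPs (so HPAGE_PMD_NR == 512); the helper is hypothetical and only mirrors the counts used by is_page_cache_freeable() and the page_ref_freeze() call:

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512   /* assumed: 2 MiB THP / 4 KiB base pages */

/*
 * Hypothetical helper mirroring the kernel's accounting: a freeable
 * page is held by the isolating caller (1), by the page cache radix
 * tree (one pin per base page for a THP in swap cache), and
 * optionally by buffer heads at page->private.
 */
static int expected_freeable_count(bool thp_in_swapcache, bool has_private)
{
    int radix_pins = thp_in_swapcache ? HPAGE_PMD_NR : 1;

    return 1 /* isolating caller */ + radix_pins + (has_private ? 1 : 0);
}

int main(void)
{
    printf("base page, no private: %d\n", expected_freeable_count(false, false)); /* 2 */
    printf("THP in swap cache:     %d\n", expected_freeable_count(true, false));  /* 513 */
    return 0;
}
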
@@ -1121,58 +1129,59 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1121 | * Try to allocate it some swap space here. | 1129 | * Try to allocate it some swap space here. |
1122 | * Lazyfree page could be freed directly | 1130 | * Lazyfree page could be freed directly |
1123 | */ | 1131 | */ |
1124 | if (PageAnon(page) && PageSwapBacked(page) && | 1132 | if (PageAnon(page) && PageSwapBacked(page)) { |
1125 | !PageSwapCache(page)) { | 1133 | if (!PageSwapCache(page)) { |
1126 | if (!(sc->gfp_mask & __GFP_IO)) | 1134 | if (!(sc->gfp_mask & __GFP_IO)) |
1127 | goto keep_locked; | 1135 | goto keep_locked; |
1128 | if (PageTransHuge(page)) { | 1136 | if (PageTransHuge(page)) { |
1129 | /* cannot split THP, skip it */ | 1137 | /* cannot split THP, skip it */ |
1130 | if (!can_split_huge_page(page, NULL)) | 1138 | if (!can_split_huge_page(page, NULL)) |
1131 | goto activate_locked; | 1139 | goto activate_locked; |
1132 | /* | 1140 | /* |
1133 | * Split pages without a PMD map right | 1141 | * Split pages without a PMD map right |
1134 | * away. Chances are some or all of the | 1142 | * away. Chances are some or all of the |
1135 | * tail pages can be freed without IO. | 1143 | * tail pages can be freed without IO. |
1136 | */ | 1144 | */ |
1137 | if (!compound_mapcount(page) && | 1145 | if (!compound_mapcount(page) && |
1138 | split_huge_page_to_list(page, page_list)) | 1146 | split_huge_page_to_list(page, |
1139 | goto activate_locked; | 1147 | page_list)) |
1140 | } | 1148 | goto activate_locked; |
1141 | if (!add_to_swap(page)) { | 1149 | } |
1142 | if (!PageTransHuge(page)) | 1150 | if (!add_to_swap(page)) { |
1143 | goto activate_locked; | 1151 | if (!PageTransHuge(page)) |
1144 | /* Split THP and swap individual base pages */ | 1152 | goto activate_locked; |
1145 | if (split_huge_page_to_list(page, page_list)) | 1153 | /* Fallback to swap normal pages */ |
1146 | goto activate_locked; | 1154 | if (split_huge_page_to_list(page, |
1147 | if (!add_to_swap(page)) | 1155 | page_list)) |
1148 | goto activate_locked; | 1156 | goto activate_locked; |
1149 | } | 1157 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1150 | 1158 | count_vm_event(THP_SWPOUT_FALLBACK); | |
1151 | /* XXX: We don't support THP writes */ | 1159 | #endif |
1152 | if (PageTransHuge(page) && | 1160 | if (!add_to_swap(page)) |
1153 | split_huge_page_to_list(page, page_list)) { | 1161 | goto activate_locked; |
1154 | delete_from_swap_cache(page); | 1162 | } |
1155 | goto activate_locked; | ||
1156 | } | ||
1157 | 1163 | ||
1158 | may_enter_fs = 1; | 1164 | may_enter_fs = 1; |
1159 | 1165 | ||
1160 | /* Adding to swap updated mapping */ | 1166 | /* Adding to swap updated mapping */ |
1161 | mapping = page_mapping(page); | 1167 | mapping = page_mapping(page); |
1168 | } | ||
1162 | } else if (unlikely(PageTransHuge(page))) { | 1169 | } else if (unlikely(PageTransHuge(page))) { |
1163 | /* Split file THP */ | 1170 | /* Split file THP */ |
1164 | if (split_huge_page_to_list(page, page_list)) | 1171 | if (split_huge_page_to_list(page, page_list)) |
1165 | goto keep_locked; | 1172 | goto keep_locked; |
1166 | } | 1173 | } |
1167 | 1174 | ||
1168 | VM_BUG_ON_PAGE(PageTransHuge(page), page); | ||
1169 | |||
1170 | /* | 1175 | /* |
1171 | * The page is mapped into the page tables of one or more | 1176 | * The page is mapped into the page tables of one or more |
1172 | * processes. Try to unmap it here. | 1177 | * processes. Try to unmap it here. |
1173 | */ | 1178 | */ |
1174 | if (page_mapped(page)) { | 1179 | if (page_mapped(page)) { |
1175 | if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { | 1180 | enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; |
1181 | |||
1182 | if (unlikely(PageTransHuge(page))) | ||
1183 | flags |= TTU_SPLIT_HUGE_PMD; | ||
1184 | if (!try_to_unmap(page, flags)) { | ||
1176 | nr_unmap_fail++; | 1185 | nr_unmap_fail++; |
1177 | goto activate_locked; | 1186 | goto activate_locked; |
1178 | } | 1187 | } |
@@ -1312,7 +1321,11 @@ free_it: | |||
1312 | * Is there need to periodically free_page_list? It would | 1321 | * Is there need to periodically free_page_list? It would |
1313 | * appear not as the counts should be low | 1322 | * appear not as the counts should be low |
1314 | */ | 1323 | */ |
1315 | list_add(&page->lru, &free_pages); | 1324 | if (unlikely(PageTransHuge(page))) { |
1325 | mem_cgroup_uncharge(page); | ||
1326 | (*get_compound_page_dtor(page))(page); | ||
1327 | } else | ||
1328 | list_add(&page->lru, &free_pages); | ||
1316 | continue; | 1329 | continue; |
1317 | 1330 | ||
1318 | activate_locked: | 1331 | activate_locked: |
@@ -1742,9 +1755,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1742 | int file = is_file_lru(lru); | 1755 | int file = is_file_lru(lru); |
1743 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); | 1756 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
1744 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1757 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1758 | bool stalled = false; | ||
1745 | 1759 | ||
1746 | while (unlikely(too_many_isolated(pgdat, file, sc))) { | 1760 | while (unlikely(too_many_isolated(pgdat, file, sc))) { |
1747 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1761 | if (stalled) |
1762 | return 0; | ||
1763 | |||
1764 | /* wait a bit for the reclaimer. */ | ||
1765 | msleep(100); | ||
1766 | stalled = true; | ||
1748 | 1767 | ||
1749 | /* We are about to die and free our memory. Return now. */ | 1768 | /* We are about to die and free our memory. Return now. */ |
1750 | if (fatal_signal_pending(current)) | 1769 | if (fatal_signal_pending(current)) |
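
The hunk above replaces the open-ended congestion_wait() loop with a single bounded stall: a direct reclaimer that still sees too many isolated pages after one 100 ms sleep returns 0 rather than spinning. A rough userspace analogue of that control flow; the predicates below are illustrative stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* illustrative stand-ins for the kernel predicates */
static int isolated_pressure = 3;

static bool too_many_isolated(void) { return isolated_pressure-- > 0; }
static bool fatal_signal_pending(void) { return false; }

/* returns 0 when the caller should bail out instead of reclaiming */
static int try_enter_reclaim(void)
{
    bool stalled = false;

    while (too_many_isolated()) {
        if (stalled)
            return 0;          /* already stalled once: give up, don't loop */

        usleep(100 * 1000);    /* wait a bit for the other reclaimers */
        stalled = true;

        if (fatal_signal_pending())
            return 0;          /* about to die and free our memory anyway */
    }
    return 1;                  /* safe to isolate and shrink the LRU */
}

int main(void)
{
    printf("enter reclaim: %s\n", try_enter_reclaim() ? "yes" : "no");
    return 0;
}
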
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9a4441bbeef2..c7e4b8458023 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in | |||
870 | { | 870 | { |
871 | unsigned long requested = 1UL << order; | 871 | unsigned long requested = 1UL << order; |
872 | 872 | ||
873 | if (WARN_ON_ONCE(order >= MAX_ORDER)) | ||
874 | return 0; | ||
875 | |||
873 | if (!info->free_blocks_total) | 876 | if (!info->free_blocks_total) |
874 | return 0; | 877 | return 0; |
875 | 878 | ||
@@ -1071,6 +1074,8 @@ const char * const vmstat_text[] = { | |||
1071 | #endif | 1074 | #endif |
1072 | "thp_zero_page_alloc", | 1075 | "thp_zero_page_alloc", |
1073 | "thp_zero_page_alloc_failed", | 1076 | "thp_zero_page_alloc_failed", |
1077 | "thp_swpout", | ||
1078 | "thp_swpout_fallback", | ||
1074 | #endif | 1079 | #endif |
1075 | #ifdef CONFIG_MEMORY_BALLOON | 1080 | #ifdef CONFIG_MEMORY_BALLOON |
1076 | "balloon_inflate", | 1081 | "balloon_inflate", |
@@ -1093,6 +1098,10 @@ const char * const vmstat_text[] = { | |||
1093 | "vmacache_find_hits", | 1098 | "vmacache_find_hits", |
1094 | "vmacache_full_flushes", | 1099 | "vmacache_full_flushes", |
1095 | #endif | 1100 | #endif |
1101 | #ifdef CONFIG_SWAP | ||
1102 | "swap_ra", | ||
1103 | "swap_ra_hit", | ||
1104 | #endif | ||
1096 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 1105 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
1097 | }; | 1106 | }; |
1098 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 1107 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
@@ -1250,7 +1259,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
1250 | seq_putc(m, '\n'); | 1259 | seq_putc(m, '\n'); |
1251 | } | 1260 | } |
1252 | 1261 | ||
1253 | /* Print out the free pages at each order for each migratetype */ | 1262 | /* Print out the number of pageblocks for each migratetype */ |
1254 | static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | 1263 | static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) |
1255 | { | 1264 | { |
1256 | int mtype; | 1265 | int mtype; |
@@ -1500,7 +1509,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) | |||
1500 | if (!v) | 1509 | if (!v) |
1501 | return ERR_PTR(-ENOMEM); | 1510 | return ERR_PTR(-ENOMEM); |
1502 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 1511 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
1503 | v[i] = global_page_state(i); | 1512 | v[i] = global_zone_page_state(i); |
1504 | v += NR_VM_ZONE_STAT_ITEMS; | 1513 | v += NR_VM_ZONE_STAT_ITEMS; |
1505 | 1514 | ||
1506 | for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) | 1515 | for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) |
@@ -1589,7 +1598,7 @@ int vmstat_refresh(struct ctl_table *table, int write, | |||
1589 | * which can equally be echo'ed to or cat'ted from (by root), | 1598 | * which can equally be echo'ed to or cat'ted from (by root), |
1590 | * can be used to update the stats just before reading them. | 1599 | * can be used to update the stats just before reading them. |
1591 | * | 1600 | * |
1592 | * Oh, and since global_page_state() etc. are so careful to hide | 1601 | * Oh, and since global_zone_page_state() etc. are so careful to hide |
1593 | * transiently negative values, report an error here if any of | 1602 | * transiently negative values, report an error here if any of |
1594 | * the stats is negative, so we know to go looking for imbalance. | 1603 | * the stats is negative, so we know to go looking for imbalance. |
1595 | */ | 1604 | */ |
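
The vmstat changes above add four event counters (thp_swpout, thp_swpout_fallback, swap_ra, swap_ra_hit) that appear in /proc/vmstat when the corresponding config options are enabled. A small userspace reader for them, assuming a kernel that exposes these names; counters that are absent are simply skipped:

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *wanted[] = { "thp_swpout", "thp_swpout_fallback",
                             "swap_ra", "swap_ra_hit" };
    char name[64];
    unsigned long long val;
    FILE *f = fopen("/proc/vmstat", "r");

    if (!f) {
        perror("fopen /proc/vmstat");
        return 1;
    }
    /* /proc/vmstat is "name value" per line */
    while (fscanf(f, "%63s %llu", name, &val) == 2) {
        for (size_t i = 0; i < sizeof(wanted) / sizeof(wanted[0]); i++)
            if (!strcmp(name, wanted[i]))
                printf("%-20s %llu\n", name, val);
    }
    fclose(f);
    return 0;
}
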
diff --git a/mm/z3fold.c b/mm/z3fold.c index 54f63c4a809a..486550df32be 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c | |||
@@ -23,10 +23,13 @@ | |||
23 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 23 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
24 | 24 | ||
25 | #include <linux/atomic.h> | 25 | #include <linux/atomic.h> |
26 | #include <linux/sched.h> | ||
26 | #include <linux/list.h> | 27 | #include <linux/list.h> |
27 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
28 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/percpu.h> | ||
29 | #include <linux/preempt.h> | 31 | #include <linux/preempt.h> |
32 | #include <linux/workqueue.h> | ||
30 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
31 | #include <linux/spinlock.h> | 34 | #include <linux/spinlock.h> |
32 | #include <linux/zpool.h> | 35 | #include <linux/zpool.h> |
@@ -48,11 +51,15 @@ enum buddy { | |||
48 | }; | 51 | }; |
49 | 52 | ||
50 | /* | 53 | /* |
51 | * struct z3fold_header - z3fold page metadata occupying the first chunk of each | 54 | * struct z3fold_header - z3fold page metadata occupying first chunks of each |
52 | * z3fold page, except for HEADLESS pages | 55 | * z3fold page, except for HEADLESS pages |
53 | * @buddy: links the z3fold page into the relevant list in the pool | 56 | * @buddy: links the z3fold page into the relevant list in the |
57 | * pool | ||
54 | * @page_lock: per-page lock | 58 | * @page_lock: per-page lock |
55 | * @refcount: reference cound for the z3fold page | 59 | * @refcount: reference count for the z3fold page |
60 | * @work: work_struct for page layout optimization | ||
61 | * @pool: pointer to the pool which this page belongs to | ||
62 | * @cpu: CPU which this page "belongs" to | ||
56 | * @first_chunks: the size of the first buddy in chunks, 0 if free | 63 | * @first_chunks: the size of the first buddy in chunks, 0 if free |
57 | * @middle_chunks: the size of the middle buddy in chunks, 0 if free | 64 | * @middle_chunks: the size of the middle buddy in chunks, 0 if free |
58 | * @last_chunks: the size of the last buddy in chunks, 0 if free | 65 | * @last_chunks: the size of the last buddy in chunks, 0 if free |
@@ -62,6 +69,9 @@ struct z3fold_header { | |||
62 | struct list_head buddy; | 69 | struct list_head buddy; |
63 | spinlock_t page_lock; | 70 | spinlock_t page_lock; |
64 | struct kref refcount; | 71 | struct kref refcount; |
72 | struct work_struct work; | ||
73 | struct z3fold_pool *pool; | ||
74 | short cpu; | ||
65 | unsigned short first_chunks; | 75 | unsigned short first_chunks; |
66 | unsigned short middle_chunks; | 76 | unsigned short middle_chunks; |
67 | unsigned short last_chunks; | 77 | unsigned short last_chunks; |
@@ -92,28 +102,39 @@ struct z3fold_header { | |||
92 | 102 | ||
93 | /** | 103 | /** |
94 | * struct z3fold_pool - stores metadata for each z3fold pool | 104 | * struct z3fold_pool - stores metadata for each z3fold pool |
95 | * @lock: protects all pool fields and first|last_chunk fields of any | 105 | * @name: pool name |
96 | * z3fold page in the pool | 106 | * @lock: protects pool unbuddied/lru lists |
97 | * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; | 107 | * @stale_lock: protects pool stale page list |
98 | * the lists each z3fold page is added to depends on the size of | 108 | * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2- |
99 | * its free region. | 109 | * buddies; the list each z3fold page is added to depends on |
110 | * the size of its free region. | ||
100 | * @lru: list tracking the z3fold pages in LRU order by most recently | 111 | * @lru: list tracking the z3fold pages in LRU order by most recently |
101 | * added buddy. | 112 | * added buddy. |
113 | * @stale: list of pages marked for freeing | ||
102 | * @pages_nr: number of z3fold pages in the pool. | 114 | * @pages_nr: number of z3fold pages in the pool. |
103 | * @ops: pointer to a structure of user defined operations specified at | 115 | * @ops: pointer to a structure of user defined operations specified at |
104 | * pool creation time. | 116 | * pool creation time. |
117 | * @compact_wq: workqueue for page layout background optimization | ||
118 | * @release_wq: workqueue for safe page release | ||
119 | * @work: work_struct for safe page release | ||
105 | * | 120 | * |
106 | * This structure is allocated at pool creation time and maintains metadata | 121 | * This structure is allocated at pool creation time and maintains metadata |
107 | * pertaining to a particular z3fold pool. | 122 | * pertaining to a particular z3fold pool. |
108 | */ | 123 | */ |
109 | struct z3fold_pool { | 124 | struct z3fold_pool { |
125 | const char *name; | ||
110 | spinlock_t lock; | 126 | spinlock_t lock; |
111 | struct list_head unbuddied[NCHUNKS]; | 127 | spinlock_t stale_lock; |
128 | struct list_head *unbuddied; | ||
112 | struct list_head lru; | 129 | struct list_head lru; |
130 | struct list_head stale; | ||
113 | atomic64_t pages_nr; | 131 | atomic64_t pages_nr; |
114 | const struct z3fold_ops *ops; | 132 | const struct z3fold_ops *ops; |
115 | struct zpool *zpool; | 133 | struct zpool *zpool; |
116 | const struct zpool_ops *zpool_ops; | 134 | const struct zpool_ops *zpool_ops; |
135 | struct workqueue_struct *compact_wq; | ||
136 | struct workqueue_struct *release_wq; | ||
137 | struct work_struct work; | ||
117 | }; | 138 | }; |
118 | 139 | ||
119 | /* | 140 | /* |
@@ -122,9 +143,10 @@ struct z3fold_pool { | |||
122 | enum z3fold_page_flags { | 143 | enum z3fold_page_flags { |
123 | PAGE_HEADLESS = 0, | 144 | PAGE_HEADLESS = 0, |
124 | MIDDLE_CHUNK_MAPPED, | 145 | MIDDLE_CHUNK_MAPPED, |
146 | NEEDS_COMPACTING, | ||
147 | PAGE_STALE | ||
125 | }; | 148 | }; |
126 | 149 | ||
127 | |||
128 | /***************** | 150 | /***************** |
129 | * Helpers | 151 | * Helpers |
130 | *****************/ | 152 | *****************/ |
@@ -138,14 +160,19 @@ static int size_to_chunks(size_t size) | |||
138 | #define for_each_unbuddied_list(_iter, _begin) \ | 160 | #define for_each_unbuddied_list(_iter, _begin) \ |
139 | for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) | 161 | for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) |
140 | 162 | ||
163 | static void compact_page_work(struct work_struct *w); | ||
164 | |||
141 | /* Initializes the z3fold header of a newly allocated z3fold page */ | 165 | /* Initializes the z3fold header of a newly allocated z3fold page */ |
142 | static struct z3fold_header *init_z3fold_page(struct page *page) | 166 | static struct z3fold_header *init_z3fold_page(struct page *page, |
167 | struct z3fold_pool *pool) | ||
143 | { | 168 | { |
144 | struct z3fold_header *zhdr = page_address(page); | 169 | struct z3fold_header *zhdr = page_address(page); |
145 | 170 | ||
146 | INIT_LIST_HEAD(&page->lru); | 171 | INIT_LIST_HEAD(&page->lru); |
147 | clear_bit(PAGE_HEADLESS, &page->private); | 172 | clear_bit(PAGE_HEADLESS, &page->private); |
148 | clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); | 173 | clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); |
174 | clear_bit(NEEDS_COMPACTING, &page->private); | ||
175 | clear_bit(PAGE_STALE, &page->private); | ||
149 | 176 | ||
150 | spin_lock_init(&zhdr->page_lock); | 177 | spin_lock_init(&zhdr->page_lock); |
151 | kref_init(&zhdr->refcount); | 178 | kref_init(&zhdr->refcount); |
@@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page) | |||
154 | zhdr->last_chunks = 0; | 181 | zhdr->last_chunks = 0; |
155 | zhdr->first_num = 0; | 182 | zhdr->first_num = 0; |
156 | zhdr->start_middle = 0; | 183 | zhdr->start_middle = 0; |
184 | zhdr->cpu = -1; | ||
185 | zhdr->pool = pool; | ||
157 | INIT_LIST_HEAD(&zhdr->buddy); | 186 | INIT_LIST_HEAD(&zhdr->buddy); |
187 | INIT_WORK(&zhdr->work, compact_page_work); | ||
158 | return zhdr; | 188 | return zhdr; |
159 | } | 189 | } |
160 | 190 | ||
@@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page) | |||
164 | __free_page(page); | 194 | __free_page(page); |
165 | } | 195 | } |
166 | 196 | ||
167 | static void release_z3fold_page(struct kref *ref) | ||
168 | { | ||
169 | struct z3fold_header *zhdr; | ||
170 | struct page *page; | ||
171 | |||
172 | zhdr = container_of(ref, struct z3fold_header, refcount); | ||
173 | page = virt_to_page(zhdr); | ||
174 | |||
175 | if (!list_empty(&zhdr->buddy)) | ||
176 | list_del(&zhdr->buddy); | ||
177 | if (!list_empty(&page->lru)) | ||
178 | list_del(&page->lru); | ||
179 | free_z3fold_page(page); | ||
180 | } | ||
181 | |||
182 | /* Lock a z3fold page */ | 197 | /* Lock a z3fold page */ |
183 | static inline void z3fold_page_lock(struct z3fold_header *zhdr) | 198 | static inline void z3fold_page_lock(struct z3fold_header *zhdr) |
184 | { | 199 | { |
@@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle) | |||
228 | return (handle - zhdr->first_num) & BUDDY_MASK; | 243 | return (handle - zhdr->first_num) & BUDDY_MASK; |
229 | } | 244 | } |
230 | 245 | ||
246 | static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) | ||
247 | { | ||
248 | struct page *page = virt_to_page(zhdr); | ||
249 | struct z3fold_pool *pool = zhdr->pool; | ||
250 | |||
251 | WARN_ON(!list_empty(&zhdr->buddy)); | ||
252 | set_bit(PAGE_STALE, &page->private); | ||
253 | spin_lock(&pool->lock); | ||
254 | if (!list_empty(&page->lru)) | ||
255 | list_del(&page->lru); | ||
256 | spin_unlock(&pool->lock); | ||
257 | if (locked) | ||
258 | z3fold_page_unlock(zhdr); | ||
259 | spin_lock(&pool->stale_lock); | ||
260 | list_add(&zhdr->buddy, &pool->stale); | ||
261 | queue_work(pool->release_wq, &pool->work); | ||
262 | spin_unlock(&pool->stale_lock); | ||
263 | } | ||
264 | |||
265 | static void __attribute__((__unused__)) | ||
266 | release_z3fold_page(struct kref *ref) | ||
267 | { | ||
268 | struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, | ||
269 | refcount); | ||
270 | __release_z3fold_page(zhdr, false); | ||
271 | } | ||
272 | |||
273 | static void release_z3fold_page_locked(struct kref *ref) | ||
274 | { | ||
275 | struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, | ||
276 | refcount); | ||
277 | WARN_ON(z3fold_page_trylock(zhdr)); | ||
278 | __release_z3fold_page(zhdr, true); | ||
279 | } | ||
280 | |||
281 | static void release_z3fold_page_locked_list(struct kref *ref) | ||
282 | { | ||
283 | struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, | ||
284 | refcount); | ||
285 | spin_lock(&zhdr->pool->lock); | ||
286 | list_del_init(&zhdr->buddy); | ||
287 | spin_unlock(&zhdr->pool->lock); | ||
288 | |||
289 | WARN_ON(z3fold_page_trylock(zhdr)); | ||
290 | __release_z3fold_page(zhdr, true); | ||
291 | } | ||
292 | |||
293 | static void free_pages_work(struct work_struct *w) | ||
294 | { | ||
295 | struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); | ||
296 | |||
297 | spin_lock(&pool->stale_lock); | ||
298 | while (!list_empty(&pool->stale)) { | ||
299 | struct z3fold_header *zhdr = list_first_entry(&pool->stale, | ||
300 | struct z3fold_header, buddy); | ||
301 | struct page *page = virt_to_page(zhdr); | ||
302 | |||
303 | list_del(&zhdr->buddy); | ||
304 | if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) | ||
305 | continue; | ||
306 | clear_bit(NEEDS_COMPACTING, &page->private); | ||
307 | spin_unlock(&pool->stale_lock); | ||
308 | cancel_work_sync(&zhdr->work); | ||
309 | free_z3fold_page(page); | ||
310 | cond_resched(); | ||
311 | spin_lock(&pool->stale_lock); | ||
312 | } | ||
313 | spin_unlock(&pool->stale_lock); | ||
314 | } | ||
315 | |||
231 | /* | 316 | /* |
232 | * Returns the number of free chunks in a z3fold page. | 317 | * Returns the number of free chunks in a z3fold page. |
233 | * NB: can't be used with HEADLESS pages. | 318 | * NB: can't be used with HEADLESS pages. |
@@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr) | |||
252 | return nfree; | 337 | return nfree; |
253 | } | 338 | } |
254 | 339 | ||
255 | /***************** | ||
256 | * API Functions | ||
257 | *****************/ | ||
258 | /** | ||
259 | * z3fold_create_pool() - create a new z3fold pool | ||
260 | * @gfp: gfp flags when allocating the z3fold pool structure | ||
261 | * @ops: user-defined operations for the z3fold pool | ||
262 | * | ||
263 | * Return: pointer to the new z3fold pool or NULL if the metadata allocation | ||
264 | * failed. | ||
265 | */ | ||
266 | static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, | ||
267 | const struct z3fold_ops *ops) | ||
268 | { | ||
269 | struct z3fold_pool *pool; | ||
270 | int i; | ||
271 | |||
272 | pool = kzalloc(sizeof(struct z3fold_pool), gfp); | ||
273 | if (!pool) | ||
274 | return NULL; | ||
275 | spin_lock_init(&pool->lock); | ||
276 | for_each_unbuddied_list(i, 0) | ||
277 | INIT_LIST_HEAD(&pool->unbuddied[i]); | ||
278 | INIT_LIST_HEAD(&pool->lru); | ||
279 | atomic64_set(&pool->pages_nr, 0); | ||
280 | pool->ops = ops; | ||
281 | return pool; | ||
282 | } | ||
283 | |||
284 | /** | ||
285 | * z3fold_destroy_pool() - destroys an existing z3fold pool | ||
286 | * @pool: the z3fold pool to be destroyed | ||
287 | * | ||
288 | * The pool should be emptied before this function is called. | ||
289 | */ | ||
290 | static void z3fold_destroy_pool(struct z3fold_pool *pool) | ||
291 | { | ||
292 | kfree(pool); | ||
293 | } | ||
294 | |||
295 | static inline void *mchunk_memmove(struct z3fold_header *zhdr, | 340 | static inline void *mchunk_memmove(struct z3fold_header *zhdr, |
296 | unsigned short dst_chunk) | 341 | unsigned short dst_chunk) |
297 | { | 342 | { |
@@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) | |||
347 | return 0; | 392 | return 0; |
348 | } | 393 | } |
349 | 394 | ||
395 | static void do_compact_page(struct z3fold_header *zhdr, bool locked) | ||
396 | { | ||
397 | struct z3fold_pool *pool = zhdr->pool; | ||
398 | struct page *page; | ||
399 | struct list_head *unbuddied; | ||
400 | int fchunks; | ||
401 | |||
402 | page = virt_to_page(zhdr); | ||
403 | if (locked) | ||
404 | WARN_ON(z3fold_page_trylock(zhdr)); | ||
405 | else | ||
406 | z3fold_page_lock(zhdr); | ||
407 | if (test_bit(PAGE_STALE, &page->private) || | ||
408 | !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) { | ||
409 | z3fold_page_unlock(zhdr); | ||
410 | return; | ||
411 | } | ||
412 | spin_lock(&pool->lock); | ||
413 | list_del_init(&zhdr->buddy); | ||
414 | spin_unlock(&pool->lock); | ||
415 | |||
416 | z3fold_compact_page(zhdr); | ||
417 | unbuddied = get_cpu_ptr(pool->unbuddied); | ||
418 | fchunks = num_free_chunks(zhdr); | ||
419 | if (fchunks < NCHUNKS && | ||
420 | (!zhdr->first_chunks || !zhdr->middle_chunks || | ||
421 | !zhdr->last_chunks)) { | ||
422 | /* the page's not completely free and it's unbuddied */ | ||
423 | spin_lock(&pool->lock); | ||
424 | list_add(&zhdr->buddy, &unbuddied[fchunks]); | ||
425 | spin_unlock(&pool->lock); | ||
426 | zhdr->cpu = smp_processor_id(); | ||
427 | } | ||
428 | put_cpu_ptr(pool->unbuddied); | ||
429 | z3fold_page_unlock(zhdr); | ||
430 | } | ||
431 | |||
432 | static void compact_page_work(struct work_struct *w) | ||
433 | { | ||
434 | struct z3fold_header *zhdr = container_of(w, struct z3fold_header, | ||
435 | work); | ||
436 | |||
437 | do_compact_page(zhdr, false); | ||
438 | } | ||
439 | |||
440 | |||
441 | /* | ||
442 | * API Functions | ||
443 | */ | ||
444 | |||
445 | /** | ||
446 | * z3fold_create_pool() - create a new z3fold pool | ||
447 | * @name: pool name | ||
448 | * @gfp: gfp flags when allocating the z3fold pool structure | ||
449 | * @ops: user-defined operations for the z3fold pool | ||
450 | * | ||
451 | * Return: pointer to the new z3fold pool or NULL if the metadata allocation | ||
452 | * failed. | ||
453 | */ | ||
454 | static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, | ||
455 | const struct z3fold_ops *ops) | ||
456 | { | ||
457 | struct z3fold_pool *pool = NULL; | ||
458 | int i, cpu; | ||
459 | |||
460 | pool = kzalloc(sizeof(struct z3fold_pool), gfp); | ||
461 | if (!pool) | ||
462 | goto out; | ||
463 | spin_lock_init(&pool->lock); | ||
464 | spin_lock_init(&pool->stale_lock); | ||
465 | pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); | ||
466 | for_each_possible_cpu(cpu) { | ||
467 | struct list_head *unbuddied = | ||
468 | per_cpu_ptr(pool->unbuddied, cpu); | ||
469 | for_each_unbuddied_list(i, 0) | ||
470 | INIT_LIST_HEAD(&unbuddied[i]); | ||
471 | } | ||
472 | INIT_LIST_HEAD(&pool->lru); | ||
473 | INIT_LIST_HEAD(&pool->stale); | ||
474 | atomic64_set(&pool->pages_nr, 0); | ||
475 | pool->name = name; | ||
476 | pool->compact_wq = create_singlethread_workqueue(pool->name); | ||
477 | if (!pool->compact_wq) | ||
478 | goto out; | ||
479 | pool->release_wq = create_singlethread_workqueue(pool->name); | ||
480 | if (!pool->release_wq) | ||
481 | goto out_wq; | ||
482 | INIT_WORK(&pool->work, free_pages_work); | ||
483 | pool->ops = ops; | ||
484 | return pool; | ||
485 | |||
486 | out_wq: | ||
487 | destroy_workqueue(pool->compact_wq); | ||
488 | out: | ||
489 | kfree(pool); | ||
490 | return NULL; | ||
491 | } | ||
492 | |||
493 | /** | ||
494 | * z3fold_destroy_pool() - destroys an existing z3fold pool | ||
495 | * @pool: the z3fold pool to be destroyed | ||
496 | * | ||
497 | * The pool should be emptied before this function is called. | ||
498 | */ | ||
499 | static void z3fold_destroy_pool(struct z3fold_pool *pool) | ||
500 | { | ||
501 | destroy_workqueue(pool->release_wq); | ||
502 | destroy_workqueue(pool->compact_wq); | ||
503 | kfree(pool); | ||
504 | } | ||
505 | |||
350 | /** | 506 | /** |
351 | * z3fold_alloc() - allocates a region of a given size | 507 | * z3fold_alloc() - allocates a region of a given size |
352 | * @pool: z3fold pool from which to allocate | 508 | * @pool: z3fold pool from which to allocate |
@@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, | |||
371 | { | 527 | { |
372 | int chunks = 0, i, freechunks; | 528 | int chunks = 0, i, freechunks; |
373 | struct z3fold_header *zhdr = NULL; | 529 | struct z3fold_header *zhdr = NULL; |
530 | struct page *page = NULL; | ||
374 | enum buddy bud; | 531 | enum buddy bud; |
375 | struct page *page; | 532 | bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; |
376 | 533 | ||
377 | if (!size || (gfp & __GFP_HIGHMEM)) | 534 | if (!size || (gfp & __GFP_HIGHMEM)) |
378 | return -EINVAL; | 535 | return -EINVAL; |
@@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, | |||
383 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) | 540 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) |
384 | bud = HEADLESS; | 541 | bud = HEADLESS; |
385 | else { | 542 | else { |
543 | struct list_head *unbuddied; | ||
386 | chunks = size_to_chunks(size); | 544 | chunks = size_to_chunks(size); |
387 | 545 | ||
546 | lookup: | ||
388 | /* First, try to find an unbuddied z3fold page. */ | 547 | /* First, try to find an unbuddied z3fold page. */ |
389 | zhdr = NULL; | 548 | unbuddied = get_cpu_ptr(pool->unbuddied); |
390 | for_each_unbuddied_list(i, chunks) { | 549 | for_each_unbuddied_list(i, chunks) { |
391 | spin_lock(&pool->lock); | 550 | struct list_head *l = &unbuddied[i]; |
392 | zhdr = list_first_entry_or_null(&pool->unbuddied[i], | 551 | |
552 | zhdr = list_first_entry_or_null(READ_ONCE(l), | ||
393 | struct z3fold_header, buddy); | 553 | struct z3fold_header, buddy); |
394 | if (!zhdr || !z3fold_page_trylock(zhdr)) { | 554 | |
395 | spin_unlock(&pool->lock); | 555 | if (!zhdr) |
396 | continue; | 556 | continue; |
557 | |||
558 | /* Re-check under lock. */ | ||
559 | spin_lock(&pool->lock); | ||
560 | l = &unbuddied[i]; | ||
561 | if (unlikely(zhdr != list_first_entry(READ_ONCE(l), | ||
562 | struct z3fold_header, buddy)) || | ||
563 | !z3fold_page_trylock(zhdr)) { | ||
564 | spin_unlock(&pool->lock); | ||
565 | put_cpu_ptr(pool->unbuddied); | ||
566 | goto lookup; | ||
397 | } | 567 | } |
398 | kref_get(&zhdr->refcount); | ||
399 | list_del_init(&zhdr->buddy); | 568 | list_del_init(&zhdr->buddy); |
569 | zhdr->cpu = -1; | ||
400 | spin_unlock(&pool->lock); | 570 | spin_unlock(&pool->lock); |
401 | 571 | ||
402 | page = virt_to_page(zhdr); | 572 | page = virt_to_page(zhdr); |
573 | if (test_bit(NEEDS_COMPACTING, &page->private)) { | ||
574 | z3fold_page_unlock(zhdr); | ||
575 | zhdr = NULL; | ||
576 | put_cpu_ptr(pool->unbuddied); | ||
577 | if (can_sleep) | ||
578 | cond_resched(); | ||
579 | goto lookup; | ||
580 | } | ||
581 | |||
582 | /* | ||
583 | * this page could not be removed from its unbuddied | ||
584 | * list while pool lock was held, and then we've taken | ||
585 | * page lock so kref_put could not be called before | ||
586 | * we got here, so it's safe to just call kref_get() | ||
587 | */ | ||
588 | kref_get(&zhdr->refcount); | ||
589 | break; | ||
590 | } | ||
591 | put_cpu_ptr(pool->unbuddied); | ||
592 | |||
593 | if (zhdr) { | ||
403 | if (zhdr->first_chunks == 0) { | 594 | if (zhdr->first_chunks == 0) { |
404 | if (zhdr->middle_chunks != 0 && | 595 | if (zhdr->middle_chunks != 0 && |
405 | chunks >= zhdr->start_middle) | 596 | chunks >= zhdr->start_middle) |
@@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, | |||
411 | else if (zhdr->middle_chunks == 0) | 602 | else if (zhdr->middle_chunks == 0) |
412 | bud = MIDDLE; | 603 | bud = MIDDLE; |
413 | else { | 604 | else { |
414 | z3fold_page_unlock(zhdr); | ||
415 | spin_lock(&pool->lock); | ||
416 | if (kref_put(&zhdr->refcount, | 605 | if (kref_put(&zhdr->refcount, |
417 | release_z3fold_page)) | 606 | release_z3fold_page_locked)) |
418 | atomic64_dec(&pool->pages_nr); | 607 | atomic64_dec(&pool->pages_nr); |
419 | spin_unlock(&pool->lock); | 608 | else |
609 | z3fold_page_unlock(zhdr); | ||
420 | pr_err("No free chunks in unbuddied\n"); | 610 | pr_err("No free chunks in unbuddied\n"); |
421 | WARN_ON(1); | 611 | WARN_ON(1); |
422 | continue; | 612 | goto lookup; |
423 | } | 613 | } |
424 | goto found; | 614 | goto found; |
425 | } | 615 | } |
426 | bud = FIRST; | 616 | bud = FIRST; |
427 | } | 617 | } |
428 | 618 | ||
429 | /* Couldn't find unbuddied z3fold page, create new one */ | 619 | spin_lock(&pool->stale_lock); |
430 | page = alloc_page(gfp); | 620 | zhdr = list_first_entry_or_null(&pool->stale, |
621 | struct z3fold_header, buddy); | ||
622 | /* | ||
623 | * Before allocating a page, let's see if we can take one from the | ||
624 | * stale pages list. cancel_work_sync() can sleep so we must make | ||
625 | * sure it won't be called in case we're in atomic context. | ||
626 | */ | ||
627 | if (zhdr && (can_sleep || !work_pending(&zhdr->work) || | ||
628 | !unlikely(work_busy(&zhdr->work)))) { | ||
629 | list_del(&zhdr->buddy); | ||
630 | clear_bit(NEEDS_COMPACTING, &page->private); | ||
631 | spin_unlock(&pool->stale_lock); | ||
632 | if (can_sleep) | ||
633 | cancel_work_sync(&zhdr->work); | ||
634 | page = virt_to_page(zhdr); | ||
635 | } else { | ||
636 | spin_unlock(&pool->stale_lock); | ||
637 | page = alloc_page(gfp); | ||
638 | } | ||
639 | |||
431 | if (!page) | 640 | if (!page) |
432 | return -ENOMEM; | 641 | return -ENOMEM; |
433 | 642 | ||
434 | atomic64_inc(&pool->pages_nr); | 643 | atomic64_inc(&pool->pages_nr); |
435 | zhdr = init_z3fold_page(page); | 644 | zhdr = init_z3fold_page(page, pool); |
436 | 645 | ||
437 | if (bud == HEADLESS) { | 646 | if (bud == HEADLESS) { |
438 | set_bit(PAGE_HEADLESS, &page->private); | 647 | set_bit(PAGE_HEADLESS, &page->private); |
439 | spin_lock(&pool->lock); | ||
440 | goto headless; | 648 | goto headless; |
441 | } | 649 | } |
442 | z3fold_page_lock(zhdr); | 650 | z3fold_page_lock(zhdr); |
@@ -451,15 +659,21 @@ found: | |||
451 | zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; | 659 | zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; |
452 | } | 660 | } |
453 | 661 | ||
454 | spin_lock(&pool->lock); | ||
455 | if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || | 662 | if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || |
456 | zhdr->middle_chunks == 0) { | 663 | zhdr->middle_chunks == 0) { |
664 | struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); | ||
665 | |||
457 | /* Add to unbuddied list */ | 666 | /* Add to unbuddied list */ |
458 | freechunks = num_free_chunks(zhdr); | 667 | freechunks = num_free_chunks(zhdr); |
459 | list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); | 668 | spin_lock(&pool->lock); |
669 | list_add(&zhdr->buddy, &unbuddied[freechunks]); | ||
670 | spin_unlock(&pool->lock); | ||
671 | zhdr->cpu = smp_processor_id(); | ||
672 | put_cpu_ptr(pool->unbuddied); | ||
460 | } | 673 | } |
461 | 674 | ||
462 | headless: | 675 | headless: |
676 | spin_lock(&pool->lock); | ||
463 | /* Add/move z3fold page to beginning of LRU */ | 677 | /* Add/move z3fold page to beginning of LRU */ |
464 | if (!list_empty(&page->lru)) | 678 | if (!list_empty(&page->lru)) |
465 | list_del(&page->lru); | 679 | list_del(&page->lru); |
@@ -487,7 +701,6 @@ headless: | |||
487 | static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) | 701 | static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) |
488 | { | 702 | { |
489 | struct z3fold_header *zhdr; | 703 | struct z3fold_header *zhdr; |
490 | int freechunks; | ||
491 | struct page *page; | 704 | struct page *page; |
492 | enum buddy bud; | 705 | enum buddy bud; |
493 | 706 | ||
@@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) | |||
526 | spin_unlock(&pool->lock); | 739 | spin_unlock(&pool->lock); |
527 | free_z3fold_page(page); | 740 | free_z3fold_page(page); |
528 | atomic64_dec(&pool->pages_nr); | 741 | atomic64_dec(&pool->pages_nr); |
529 | } else { | 742 | return; |
530 | if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || | 743 | } |
531 | zhdr->last_chunks != 0) { | 744 | |
532 | z3fold_compact_page(zhdr); | 745 | if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { |
533 | /* Add to the unbuddied list */ | 746 | atomic64_dec(&pool->pages_nr); |
534 | spin_lock(&pool->lock); | 747 | return; |
535 | if (!list_empty(&zhdr->buddy)) | 748 | } |
536 | list_del(&zhdr->buddy); | 749 | if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { |
537 | freechunks = num_free_chunks(zhdr); | ||
538 | list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); | ||
539 | spin_unlock(&pool->lock); | ||
540 | } | ||
541 | z3fold_page_unlock(zhdr); | 750 | z3fold_page_unlock(zhdr); |
751 | return; | ||
752 | } | ||
753 | if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { | ||
542 | spin_lock(&pool->lock); | 754 | spin_lock(&pool->lock); |
543 | if (kref_put(&zhdr->refcount, release_z3fold_page)) | 755 | list_del_init(&zhdr->buddy); |
544 | atomic64_dec(&pool->pages_nr); | ||
545 | spin_unlock(&pool->lock); | 756 | spin_unlock(&pool->lock); |
757 | zhdr->cpu = -1; | ||
758 | do_compact_page(zhdr, true); | ||
759 | return; | ||
546 | } | 760 | } |
547 | 761 | queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); | |
762 | z3fold_page_unlock(zhdr); | ||
548 | } | 763 | } |
549 | 764 | ||
550 | /** | 765 | /** |
@@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) | |||
585 | */ | 800 | */ |
586 | static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) | 801 | static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) |
587 | { | 802 | { |
588 | int i, ret = 0, freechunks; | 803 | int i, ret = 0; |
589 | struct z3fold_header *zhdr; | 804 | struct z3fold_header *zhdr = NULL; |
590 | struct page *page; | 805 | struct page *page = NULL; |
806 | struct list_head *pos; | ||
591 | unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; | 807 | unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; |
592 | 808 | ||
593 | spin_lock(&pool->lock); | 809 | spin_lock(&pool->lock); |
@@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) | |||
600 | spin_unlock(&pool->lock); | 816 | spin_unlock(&pool->lock); |
601 | return -EINVAL; | 817 | return -EINVAL; |
602 | } | 818 | } |
603 | page = list_last_entry(&pool->lru, struct page, lru); | 819 | list_for_each_prev(pos, &pool->lru) { |
820 | page = list_entry(pos, struct page, lru); | ||
821 | if (test_bit(PAGE_HEADLESS, &page->private)) | ||
822 | /* candidate found */ | ||
823 | break; | ||
824 | |||
825 | zhdr = page_address(page); | ||
826 | if (!z3fold_page_trylock(zhdr)) | ||
827 | continue; /* can't evict at this point */ | ||
828 | kref_get(&zhdr->refcount); | ||
829 | list_del_init(&zhdr->buddy); | ||
830 | zhdr->cpu = -1; | ||
831 | } | ||
832 | |||
604 | list_del_init(&page->lru); | 833 | list_del_init(&page->lru); |
834 | spin_unlock(&pool->lock); | ||
605 | 835 | ||
606 | zhdr = page_address(page); | ||
607 | if (!test_bit(PAGE_HEADLESS, &page->private)) { | 836 | if (!test_bit(PAGE_HEADLESS, &page->private)) { |
608 | if (!list_empty(&zhdr->buddy)) | ||
609 | list_del_init(&zhdr->buddy); | ||
610 | kref_get(&zhdr->refcount); | ||
611 | spin_unlock(&pool->lock); | ||
612 | z3fold_page_lock(zhdr); | ||
613 | /* | 837 | /* |
614 | 	 * We need encode the handles before unlocking, since | 838 | 	 * We need encode the handles before unlocking, since |
615 | 	 * we can race with free that will set | 839 | 	 * we can race with free that will set |
@@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) | |||
624 | middle_handle = encode_handle(zhdr, MIDDLE); | 848 | middle_handle = encode_handle(zhdr, MIDDLE); |
625 | if (zhdr->last_chunks) | 849 | if (zhdr->last_chunks) |
626 | last_handle = encode_handle(zhdr, LAST); | 850 | last_handle = encode_handle(zhdr, LAST); |
851 | /* | ||
852 | * it's safe to unlock here because we hold a | ||
853 | * reference to this page | ||
854 | */ | ||
627 | z3fold_page_unlock(zhdr); | 855 | z3fold_page_unlock(zhdr); |
628 | } else { | 856 | } else { |
629 | first_handle = encode_handle(zhdr, HEADLESS); | 857 | first_handle = encode_handle(zhdr, HEADLESS); |
630 | last_handle = middle_handle = 0; | 858 | last_handle = middle_handle = 0; |
631 | spin_unlock(&pool->lock); | ||
632 | } | 859 | } |
633 | 860 | ||
634 | /* Issue the eviction callback(s) */ | 861 | /* Issue the eviction callback(s) */ |
@@ -652,31 +879,12 @@ next: | |||
652 | if (ret == 0) { | 879 | if (ret == 0) { |
653 | free_z3fold_page(page); | 880 | free_z3fold_page(page); |
654 | return 0; | 881 | return 0; |
655 | } else { | ||
656 | spin_lock(&pool->lock); | ||
657 | } | ||
658 | } else { | ||
659 | z3fold_page_lock(zhdr); | ||
660 | if ((zhdr->first_chunks || zhdr->last_chunks || | ||
661 | zhdr->middle_chunks) && | ||
662 | !(zhdr->first_chunks && zhdr->last_chunks && | ||
663 | zhdr->middle_chunks)) { | ||
664 | z3fold_compact_page(zhdr); | ||
665 | /* add to unbuddied list */ | ||
666 | spin_lock(&pool->lock); | ||
667 | freechunks = num_free_chunks(zhdr); | ||
668 | list_add(&zhdr->buddy, | ||
669 | &pool->unbuddied[freechunks]); | ||
670 | spin_unlock(&pool->lock); | ||
671 | } | ||
672 | z3fold_page_unlock(zhdr); | ||
673 | spin_lock(&pool->lock); | ||
674 | if (kref_put(&zhdr->refcount, release_z3fold_page)) { | ||
675 | spin_unlock(&pool->lock); | ||
676 | atomic64_dec(&pool->pages_nr); | ||
677 | return 0; | ||
678 | } | 882 | } |
883 | } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { | ||
884 | atomic64_dec(&pool->pages_nr); | ||
885 | return 0; | ||
679 | } | 886 | } |
887 | spin_lock(&pool->lock); | ||
680 | 888 | ||
681 | /* | 889 | /* |
682 | * Add to the beginning of LRU. | 890 | * Add to the beginning of LRU. |
@@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp, | |||
795 | { | 1003 | { |
796 | struct z3fold_pool *pool; | 1004 | struct z3fold_pool *pool; |
797 | 1005 | ||
798 | pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL); | 1006 | pool = z3fold_create_pool(name, gfp, |
1007 | zpool_ops ? &z3fold_zpool_ops : NULL); | ||
799 | if (pool) { | 1008 | if (pool) { |
800 | pool->zpool = zpool; | 1009 | pool->zpool = zpool; |
801 | pool->zpool_ops = zpool_ops; | 1010 | pool->zpool_ops = zpool_ops; |
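
The z3fold rework above keeps one array of "unbuddied" lists per CPU, indexed by the number of free chunks in a page, and allocation walks the buckets upward from the first size that fits (for_each_unbuddied_list()). A much-simplified, single-process sketch of that bucket lookup; the chunk geometry (64-byte chunks, 4 KiB pages) and the two fake CPUs are assumptions for illustration only:

#include <stdio.h>

#define CHUNK_SHIFT 6                      /* assumed: 64-byte chunks */
#define CHUNK_SIZE  (1 << CHUNK_SHIFT)
#define NCHUNKS     (4096 / CHUNK_SIZE)    /* assumed 4 KiB pages */

/* round a request up to whole chunks, as size_to_chunks() does */
static int size_to_chunks(size_t size)
{
    return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

/*
 * Toy per-CPU "unbuddied" buckets: nonzero means this CPU has a page
 * whose largest free region is that many chunks.  The real code keeps
 * arrays of struct list_head allocated with __alloc_percpu().
 */
static int unbuddied[2][NCHUNKS];          /* two fake CPUs */

static int find_page(int cpu, size_t size)
{
    int chunks = size_to_chunks(size);

    /* first fit, starting at the bucket that is just big enough */
    for (int i = chunks; i < NCHUNKS; i++)
        if (unbuddied[cpu][i])
            return i;                      /* bucket holding a usable page */
    return -1;                             /* fall back to a fresh page */
}

int main(void)
{
    unbuddied[0][10] = 1;
    printf("bucket for 200-byte request on cpu0: %d\n", find_page(0, 200));
    printf("bucket for 200-byte request on cpu1: %d\n", find_page(1, 200));
    return 0;
}
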
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 308acb9d814b..62457eb82330 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, | |||
1983 | 1983 | ||
1984 | spin_lock(&class->lock); | 1984 | spin_lock(&class->lock); |
1985 | if (!get_zspage_inuse(zspage)) { | 1985 | if (!get_zspage_inuse(zspage)) { |
1986 | ret = -EBUSY; | 1986 | /* |
1987 | 		goto unlock_class; | 1987 | 		/* Set "offset" to end of the page so that every loop |
1988 | * skips unnecessary object scanning. | ||
1989 | */ | ||
1990 | offset = PAGE_SIZE; | ||
1988 | } | 1991 | } |
1989 | 1992 | ||
1990 | pos = offset; | 1993 | pos = offset; |
@@ -2052,7 +2055,6 @@ unpin_objects: | |||
2052 | } | 2055 | } |
2053 | } | 2056 | } |
2054 | kunmap_atomic(s_addr); | 2057 | kunmap_atomic(s_addr); |
2055 | unlock_class: | ||
2056 | spin_unlock(&class->lock); | 2058 | spin_unlock(&class->lock); |
2057 | migrate_write_unlock(zspage); | 2059 | migrate_write_unlock(zspage); |
2058 | 2060 | ||
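
The zsmalloc hunk above swaps an early goto for a sentinel: when the zspage has no objects in use, offset is set to PAGE_SIZE so every later scan loop falls through and the shared unlock path at the end still runs. A toy illustration of that pattern; the object layout and sizes are invented:

#include <stdio.h>

#define PAGE_SIZE 4096                   /* assumed base page size */

/* toy stand-in: pretend object slots live every 128 bytes */
static int scan_objects(unsigned int offset)
{
    int found = 0;

    for (unsigned int pos = offset; pos < PAGE_SIZE; pos += 128)
        found++;                         /* visit one object slot */
    return found;
}

static void migrate(int page_in_use)
{
    unsigned int offset = 0;

    /* the class lock would be taken here */
    if (!page_in_use)
        offset = PAGE_SIZE;              /* sentinel: scan loops see nothing */

    printf("slots visited: %d\n", scan_objects(offset));

    /* shared cleanup always runs: unlock class, unlock zspage, ... */
}

int main(void)
{
    migrate(1);                          /* 32 slots visited */
    migrate(0);                          /* 0 slots, no early return needed */
    return 0;
}
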
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 48397feb08fb..b920d186ad4a 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c | |||
@@ -261,7 +261,17 @@ static enum export export_no(const char *s) | |||
261 | return export_unknown; | 261 | return export_unknown; |
262 | } | 262 | } |
263 | 263 | ||
264 | static const char *sec_name(struct elf_info *elf, int secindex); | 264 | static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr) |
265 | { | ||
266 | return (void *)elf->hdr + | ||
267 | elf->sechdrs[elf->secindex_strings].sh_offset + | ||
268 | sechdr->sh_name; | ||
269 | } | ||
270 | |||
271 | static const char *sec_name(struct elf_info *elf, int secindex) | ||
272 | { | ||
273 | return sech_name(elf, &elf->sechdrs[secindex]); | ||
274 | } | ||
265 | 275 | ||
266 | #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) | 276 | #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) |
267 | 277 | ||
@@ -775,21 +785,6 @@ static const char *sym_name(struct elf_info *elf, Elf_Sym *sym) | |||
775 | return "(unknown)"; | 785 | return "(unknown)"; |
776 | } | 786 | } |
777 | 787 | ||
778 | static const char *sec_name(struct elf_info *elf, int secindex) | ||
779 | { | ||
780 | Elf_Shdr *sechdrs = elf->sechdrs; | ||
781 | return (void *)elf->hdr + | ||
782 | elf->sechdrs[elf->secindex_strings].sh_offset + | ||
783 | sechdrs[secindex].sh_name; | ||
784 | } | ||
785 | |||
786 | static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr) | ||
787 | { | ||
788 | return (void *)elf->hdr + | ||
789 | elf->sechdrs[elf->secindex_strings].sh_offset + | ||
790 | sechdr->sh_name; | ||
791 | } | ||
792 | |||
793 | /* The pattern is an array of simple patterns. | 788 | /* The pattern is an array of simple patterns. |
794 | * "foo" will match an exact string equal to "foo" | 789 | * "foo" will match an exact string equal to "foo" |
795 | * "*foo" will match a string that ends with "foo" | 790 | * "*foo" will match a string that ends with "foo" |
diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile index ad8a0897e47f..bc9d02d615da 100644 --- a/tools/testing/selftests/memfd/Makefile +++ b/tools/testing/selftests/memfd/Makefile | |||
@@ -3,7 +3,7 @@ CFLAGS += -I../../../../include/uapi/ | |||
3 | CFLAGS += -I../../../../include/ | 3 | CFLAGS += -I../../../../include/ |
4 | CFLAGS += -I../../../../usr/include/ | 4 | CFLAGS += -I../../../../usr/include/ |
5 | 5 | ||
6 | TEST_PROGS := run_fuse_test.sh | 6 | TEST_PROGS := run_tests.sh |
7 | TEST_GEN_FILES := memfd_test fuse_mnt fuse_test | 7 | TEST_GEN_FILES := memfd_test fuse_mnt fuse_test |
8 | 8 | ||
9 | fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) | 9 | fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) |
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 26546892cd54..f94c6d1fb46f 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c | |||
@@ -18,12 +18,48 @@ | |||
18 | #include <sys/wait.h> | 18 | #include <sys/wait.h> |
19 | #include <unistd.h> | 19 | #include <unistd.h> |
20 | 20 | ||
21 | #define MEMFD_STR "memfd:" | ||
22 | #define SHARED_FT_STR "(shared file-table)" | ||
23 | |||
21 | #define MFD_DEF_SIZE 8192 | 24 | #define MFD_DEF_SIZE 8192 |
22 | #define STACK_SIZE 65536 | 25 | #define STACK_SIZE 65536 |
23 | 26 | ||
27 | /* | ||
28 | * Default is not to test hugetlbfs | ||
29 | */ | ||
30 | static int hugetlbfs_test; | ||
31 | static size_t mfd_def_size = MFD_DEF_SIZE; | ||
32 | |||
33 | /* | ||
34 | * Copied from mlock2-tests.c | ||
35 | */ | ||
36 | static unsigned long default_huge_page_size(void) | ||
37 | { | ||
38 | unsigned long hps = 0; | ||
39 | char *line = NULL; | ||
40 | size_t linelen = 0; | ||
41 | FILE *f = fopen("/proc/meminfo", "r"); | ||
42 | |||
43 | if (!f) | ||
44 | return 0; | ||
45 | while (getline(&line, &linelen, f) > 0) { | ||
46 | if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { | ||
47 | hps <<= 10; | ||
48 | break; | ||
49 | } | ||
50 | } | ||
51 | |||
52 | free(line); | ||
53 | fclose(f); | ||
54 | return hps; | ||
55 | } | ||
56 | |||
24 | static int sys_memfd_create(const char *name, | 57 | static int sys_memfd_create(const char *name, |
25 | unsigned int flags) | 58 | unsigned int flags) |
26 | { | 59 | { |
60 | if (hugetlbfs_test) | ||
61 | flags |= MFD_HUGETLB; | ||
62 | |||
27 | return syscall(__NR_memfd_create, name, flags); | 63 | return syscall(__NR_memfd_create, name, flags); |
28 | } | 64 | } |
29 | 65 | ||
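
With MFD_HUGETLB, memfd_create() backs the descriptor with hugetlbfs; the selftest simply ORs the flag in when the hugetlbfs variant is requested. A minimal standalone caller, assuming a kernel that already understands the flag (older kernels fail with EINVAL); the raw syscall is used because a libc wrapper may not exist yet:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef MFD_CLOEXEC
#define MFD_CLOEXEC  0x0001U
#endif
#ifndef MFD_HUGETLB
#define MFD_HUGETLB  0x0004U
#endif

int main(void)
{
    int fd = syscall(__NR_memfd_create, "hugetlb-test",
                     MFD_CLOEXEC | MFD_HUGETLB);

    if (fd < 0) {
        perror("memfd_create(MFD_HUGETLB)");  /* EINVAL on older kernels */
        return 1;
    }
    printf("got hugetlb-backed memfd %d\n", fd);
    close(fd);
    return 0;
}
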
@@ -150,7 +186,7 @@ static void *mfd_assert_mmap_shared(int fd) | |||
150 | void *p; | 186 | void *p; |
151 | 187 | ||
152 | p = mmap(NULL, | 188 | p = mmap(NULL, |
153 | MFD_DEF_SIZE, | 189 | mfd_def_size, |
154 | PROT_READ | PROT_WRITE, | 190 | PROT_READ | PROT_WRITE, |
155 | MAP_SHARED, | 191 | MAP_SHARED, |
156 | fd, | 192 | fd, |
@@ -168,7 +204,7 @@ static void *mfd_assert_mmap_private(int fd) | |||
168 | void *p; | 204 | void *p; |
169 | 205 | ||
170 | p = mmap(NULL, | 206 | p = mmap(NULL, |
171 | MFD_DEF_SIZE, | 207 | mfd_def_size, |
172 | PROT_READ, | 208 | PROT_READ, |
173 | MAP_PRIVATE, | 209 | MAP_PRIVATE, |
174 | fd, | 210 | fd, |
@@ -223,7 +259,7 @@ static void mfd_assert_read(int fd) | |||
223 | 259 | ||
224 | /* verify PROT_READ *is* allowed */ | 260 | /* verify PROT_READ *is* allowed */ |
225 | p = mmap(NULL, | 261 | p = mmap(NULL, |
226 | MFD_DEF_SIZE, | 262 | mfd_def_size, |
227 | PROT_READ, | 263 | PROT_READ, |
228 | MAP_PRIVATE, | 264 | MAP_PRIVATE, |
229 | fd, | 265 | fd, |
@@ -232,11 +268,11 @@ static void mfd_assert_read(int fd) | |||
232 | printf("mmap() failed: %m\n"); | 268 | printf("mmap() failed: %m\n"); |
233 | abort(); | 269 | abort(); |
234 | } | 270 | } |
235 | munmap(p, MFD_DEF_SIZE); | 271 | munmap(p, mfd_def_size); |
236 | 272 | ||
237 | /* verify MAP_PRIVATE is *always* allowed (even writable) */ | 273 | /* verify MAP_PRIVATE is *always* allowed (even writable) */ |
238 | p = mmap(NULL, | 274 | p = mmap(NULL, |
239 | MFD_DEF_SIZE, | 275 | mfd_def_size, |
240 | PROT_READ | PROT_WRITE, | 276 | PROT_READ | PROT_WRITE, |
241 | MAP_PRIVATE, | 277 | MAP_PRIVATE, |
242 | fd, | 278 | fd, |
@@ -245,7 +281,7 @@ static void mfd_assert_read(int fd) | |||
245 | printf("mmap() failed: %m\n"); | 281 | printf("mmap() failed: %m\n"); |
246 | abort(); | 282 | abort(); |
247 | } | 283 | } |
248 | munmap(p, MFD_DEF_SIZE); | 284 | munmap(p, mfd_def_size); |
249 | } | 285 | } |
250 | 286 | ||
251 | static void mfd_assert_write(int fd) | 287 | static void mfd_assert_write(int fd) |
@@ -254,16 +290,22 @@ static void mfd_assert_write(int fd) | |||
254 | void *p; | 290 | void *p; |
255 | int r; | 291 | int r; |
256 | 292 | ||
257 | /* verify write() succeeds */ | 293 | /* |
258 | 	l = write(fd, "\0\0\0\0", 4); | 294 | 	 * hugetlbfs does not support write, but we want to |
259 | if (l != 4) { | 295 | * verify everything else here. |
260 | printf("write() failed: %m\n"); | 296 | */ |
261 | abort(); | 297 | if (!hugetlbfs_test) { |
298 | /* verify write() succeeds */ | ||
299 | l = write(fd, "\0\0\0\0", 4); | ||
300 | if (l != 4) { | ||
301 | printf("write() failed: %m\n"); | ||
302 | abort(); | ||
303 | } | ||
262 | } | 304 | } |
263 | 305 | ||
264 | /* verify PROT_READ | PROT_WRITE is allowed */ | 306 | /* verify PROT_READ | PROT_WRITE is allowed */ |
265 | p = mmap(NULL, | 307 | p = mmap(NULL, |
266 | MFD_DEF_SIZE, | 308 | mfd_def_size, |
267 | PROT_READ | PROT_WRITE, | 309 | PROT_READ | PROT_WRITE, |
268 | MAP_SHARED, | 310 | MAP_SHARED, |
269 | fd, | 311 | fd, |
@@ -273,11 +315,11 @@ static void mfd_assert_write(int fd) | |||
273 | abort(); | 315 | abort(); |
274 | } | 316 | } |
275 | *(char *)p = 0; | 317 | *(char *)p = 0; |
276 | munmap(p, MFD_DEF_SIZE); | 318 | munmap(p, mfd_def_size); |
277 | 319 | ||
278 | /* verify PROT_WRITE is allowed */ | 320 | /* verify PROT_WRITE is allowed */ |
279 | p = mmap(NULL, | 321 | p = mmap(NULL, |
280 | MFD_DEF_SIZE, | 322 | mfd_def_size, |
281 | PROT_WRITE, | 323 | PROT_WRITE, |
282 | MAP_SHARED, | 324 | MAP_SHARED, |
283 | fd, | 325 | fd, |
@@ -287,12 +329,12 @@ static void mfd_assert_write(int fd) | |||
287 | abort(); | 329 | abort(); |
288 | } | 330 | } |
289 | *(char *)p = 0; | 331 | *(char *)p = 0; |
290 | munmap(p, MFD_DEF_SIZE); | 332 | munmap(p, mfd_def_size); |
291 | 333 | ||
292 | /* verify PROT_READ with MAP_SHARED is allowed and a following | 334 | /* verify PROT_READ with MAP_SHARED is allowed and a following |
293 | * mprotect(PROT_WRITE) allows writing */ | 335 | * mprotect(PROT_WRITE) allows writing */ |
294 | p = mmap(NULL, | 336 | p = mmap(NULL, |
295 | MFD_DEF_SIZE, | 337 | mfd_def_size, |
296 | PROT_READ, | 338 | PROT_READ, |
297 | MAP_SHARED, | 339 | MAP_SHARED, |
298 | fd, | 340 | fd, |
@@ -302,20 +344,20 @@ static void mfd_assert_write(int fd) | |||
302 | abort(); | 344 | abort(); |
303 | } | 345 | } |
304 | 346 | ||
305 | r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); | 347 | r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE); |
306 | if (r < 0) { | 348 | if (r < 0) { |
307 | printf("mprotect() failed: %m\n"); | 349 | printf("mprotect() failed: %m\n"); |
308 | abort(); | 350 | abort(); |
309 | } | 351 | } |
310 | 352 | ||
311 | *(char *)p = 0; | 353 | *(char *)p = 0; |
312 | munmap(p, MFD_DEF_SIZE); | 354 | munmap(p, mfd_def_size); |
313 | 355 | ||
314 | /* verify PUNCH_HOLE works */ | 356 | /* verify PUNCH_HOLE works */ |
315 | r = fallocate(fd, | 357 | r = fallocate(fd, |
316 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | 358 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, |
317 | 0, | 359 | 0, |
318 | MFD_DEF_SIZE); | 360 | mfd_def_size); |
319 | if (r < 0) { | 361 | if (r < 0) { |
320 | printf("fallocate(PUNCH_HOLE) failed: %m\n"); | 362 | printf("fallocate(PUNCH_HOLE) failed: %m\n"); |
321 | abort(); | 363 | abort(); |
@@ -337,7 +379,7 @@ static void mfd_fail_write(int fd) | |||
337 | 379 | ||
338 | /* verify PROT_READ | PROT_WRITE is not allowed */ | 380 | /* verify PROT_READ | PROT_WRITE is not allowed */ |
339 | p = mmap(NULL, | 381 | p = mmap(NULL, |
340 | MFD_DEF_SIZE, | 382 | mfd_def_size, |
341 | PROT_READ | PROT_WRITE, | 383 | PROT_READ | PROT_WRITE, |
342 | MAP_SHARED, | 384 | MAP_SHARED, |
343 | fd, | 385 | fd, |
@@ -349,7 +391,7 @@ static void mfd_fail_write(int fd) | |||
349 | 391 | ||
350 | /* verify PROT_WRITE is not allowed */ | 392 | /* verify PROT_WRITE is not allowed */ |
351 | p = mmap(NULL, | 393 | p = mmap(NULL, |
352 | MFD_DEF_SIZE, | 394 | mfd_def_size, |
353 | PROT_WRITE, | 395 | PROT_WRITE, |
354 | MAP_SHARED, | 396 | MAP_SHARED, |
355 | fd, | 397 | fd, |
@@ -362,13 +404,13 @@ static void mfd_fail_write(int fd) | |||
362 | /* Verify PROT_READ with MAP_SHARED with a following mprotect is not | 404 | /* Verify PROT_READ with MAP_SHARED with a following mprotect is not |
363 | * allowed. Note that for r/w the kernel already prevents the mmap. */ | 405 | * allowed. Note that for r/w the kernel already prevents the mmap. */ |
364 | p = mmap(NULL, | 406 | p = mmap(NULL, |
365 | MFD_DEF_SIZE, | 407 | mfd_def_size, |
366 | PROT_READ, | 408 | PROT_READ, |
367 | MAP_SHARED, | 409 | MAP_SHARED, |
368 | fd, | 410 | fd, |
369 | 0); | 411 | 0); |
370 | if (p != MAP_FAILED) { | 412 | if (p != MAP_FAILED) { |
371 | r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); | 413 | r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE); |
372 | if (r >= 0) { | 414 | if (r >= 0) { |
373 | printf("mmap()+mprotect() didn't fail as expected\n"); | 415 | printf("mmap()+mprotect() didn't fail as expected\n"); |
374 | abort(); | 416 | abort(); |
@@ -379,7 +421,7 @@ static void mfd_fail_write(int fd) | |||
379 | r = fallocate(fd, | 421 | r = fallocate(fd, |
380 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | 422 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, |
381 | 0, | 423 | 0, |
382 | MFD_DEF_SIZE); | 424 | mfd_def_size); |
383 | if (r >= 0) { | 425 | if (r >= 0) { |
384 | printf("fallocate(PUNCH_HOLE) didn't fail as expected\n"); | 426 | printf("fallocate(PUNCH_HOLE) didn't fail as expected\n"); |
385 | abort(); | 427 | abort(); |
@@ -390,13 +432,13 @@ static void mfd_assert_shrink(int fd) | |||
390 | { | 432 | { |
391 | int r, fd2; | 433 | int r, fd2; |
392 | 434 | ||
393 | r = ftruncate(fd, MFD_DEF_SIZE / 2); | 435 | r = ftruncate(fd, mfd_def_size / 2); |
394 | if (r < 0) { | 436 | if (r < 0) { |
395 | printf("ftruncate(SHRINK) failed: %m\n"); | 437 | printf("ftruncate(SHRINK) failed: %m\n"); |
396 | abort(); | 438 | abort(); |
397 | } | 439 | } |
398 | 440 | ||
399 | mfd_assert_size(fd, MFD_DEF_SIZE / 2); | 441 | mfd_assert_size(fd, mfd_def_size / 2); |
400 | 442 | ||
401 | fd2 = mfd_assert_open(fd, | 443 | fd2 = mfd_assert_open(fd, |
402 | O_RDWR | O_CREAT | O_TRUNC, | 444 | O_RDWR | O_CREAT | O_TRUNC, |
@@ -410,7 +452,7 @@ static void mfd_fail_shrink(int fd) | |||
410 | { | 452 | { |
411 | int r; | 453 | int r; |
412 | 454 | ||
413 | r = ftruncate(fd, MFD_DEF_SIZE / 2); | 455 | r = ftruncate(fd, mfd_def_size / 2); |
414 | if (r >= 0) { | 456 | if (r >= 0) { |
415 | printf("ftruncate(SHRINK) didn't fail as expected\n"); | 457 | printf("ftruncate(SHRINK) didn't fail as expected\n"); |
416 | abort(); | 458 | abort(); |
@@ -425,31 +467,31 @@ static void mfd_assert_grow(int fd) | |||
425 | { | 467 | { |
426 | int r; | 468 | int r; |
427 | 469 | ||
428 | r = ftruncate(fd, MFD_DEF_SIZE * 2); | 470 | r = ftruncate(fd, mfd_def_size * 2); |
429 | if (r < 0) { | 471 | if (r < 0) { |
430 | printf("ftruncate(GROW) failed: %m\n"); | 472 | printf("ftruncate(GROW) failed: %m\n"); |
431 | abort(); | 473 | abort(); |
432 | } | 474 | } |
433 | 475 | ||
434 | mfd_assert_size(fd, MFD_DEF_SIZE * 2); | 476 | mfd_assert_size(fd, mfd_def_size * 2); |
435 | 477 | ||
436 | r = fallocate(fd, | 478 | r = fallocate(fd, |
437 | 0, | 479 | 0, |
438 | 0, | 480 | 0, |
439 | MFD_DEF_SIZE * 4); | 481 | mfd_def_size * 4); |
440 | if (r < 0) { | 482 | if (r < 0) { |
441 | printf("fallocate(ALLOC) failed: %m\n"); | 483 | printf("fallocate(ALLOC) failed: %m\n"); |
442 | abort(); | 484 | abort(); |
443 | } | 485 | } |
444 | 486 | ||
445 | mfd_assert_size(fd, MFD_DEF_SIZE * 4); | 487 | mfd_assert_size(fd, mfd_def_size * 4); |
446 | } | 488 | } |
447 | 489 | ||
448 | static void mfd_fail_grow(int fd) | 490 | static void mfd_fail_grow(int fd) |
449 | { | 491 | { |
450 | int r; | 492 | int r; |
451 | 493 | ||
452 | r = ftruncate(fd, MFD_DEF_SIZE * 2); | 494 | r = ftruncate(fd, mfd_def_size * 2); |
453 | if (r >= 0) { | 495 | if (r >= 0) { |
454 | printf("ftruncate(GROW) didn't fail as expected\n"); | 496 | printf("ftruncate(GROW) didn't fail as expected\n"); |
455 | abort(); | 497 | abort(); |
@@ -458,7 +500,7 @@ static void mfd_fail_grow(int fd) | |||
458 | r = fallocate(fd, | 500 | r = fallocate(fd, |
459 | 0, | 501 | 0, |
460 | 0, | 502 | 0, |
461 | MFD_DEF_SIZE * 4); | 503 | mfd_def_size * 4); |
462 | if (r >= 0) { | 504 | if (r >= 0) { |
463 | printf("fallocate(ALLOC) didn't fail as expected\n"); | 505 | printf("fallocate(ALLOC) didn't fail as expected\n"); |
464 | abort(); | 506 | abort(); |
@@ -467,25 +509,37 @@ static void mfd_fail_grow(int fd) | |||
467 | 509 | ||
468 | static void mfd_assert_grow_write(int fd) | 510 | static void mfd_assert_grow_write(int fd) |
469 | { | 511 | { |
470 | static char buf[MFD_DEF_SIZE * 8]; | 512 | static char *buf; |
471 | ssize_t l; | 513 | ssize_t l; |
472 | 514 | ||
473 | l = pwrite(fd, buf, sizeof(buf), 0); | 515 | buf = malloc(mfd_def_size * 8); |
474 | if (l != sizeof(buf)) { | 516 | if (!buf) { |
517 | printf("malloc(%d) failed: %m\n", mfd_def_size * 8); | ||
518 | abort(); | ||
519 | } | ||
520 | |||
521 | l = pwrite(fd, buf, mfd_def_size * 8, 0); | ||
522 | if (l != (mfd_def_size * 8)) { | ||
475 | printf("pwrite() failed: %m\n"); | 523 | printf("pwrite() failed: %m\n"); |
476 | abort(); | 524 | abort(); |
477 | } | 525 | } |
478 | 526 | ||
479 | mfd_assert_size(fd, MFD_DEF_SIZE * 8); | 527 | mfd_assert_size(fd, mfd_def_size * 8); |
480 | } | 528 | } |
481 | 529 | ||
482 | static void mfd_fail_grow_write(int fd) | 530 | static void mfd_fail_grow_write(int fd) |
483 | { | 531 | { |
484 | static char buf[MFD_DEF_SIZE * 8]; | 532 | static char *buf; |
485 | ssize_t l; | 533 | ssize_t l; |
486 | 534 | ||
487 | l = pwrite(fd, buf, sizeof(buf), 0); | 535 | buf = malloc(mfd_def_size * 8); |
488 | if (l == sizeof(buf)) { | 536 | if (!buf) { |
537 | printf("malloc(%d) failed: %m\n", mfd_def_size * 8); | ||
538 | abort(); | ||
539 | } | ||
540 | |||
541 | l = pwrite(fd, buf, mfd_def_size * 8, 0); | ||
542 | if (l == (mfd_def_size * 8)) { | ||
489 | printf("pwrite() didn't fail as expected\n"); | 543 | printf("pwrite() didn't fail as expected\n"); |
490 | abort(); | 544 | abort(); |
491 | } | 545 | } |
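
The two helpers above switch from a static on-stack buffer to a malloc()ed one so the write size can follow the runtime mfd_def_size. As a standalone illustration (not part of the patch) of the seal semantics they exercise, the sketch below shows a growing pwrite() on an ordinary memfd being accepted before F_SEAL_GROW is added and rejected with EPERM afterwards; the memfd name, the 4096-byte size and a libc that exposes memfd_create() are assumptions.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
    	char buf[4096];
    	int fd = memfd_create("grow_demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

    	if (fd < 0 || ftruncate(fd, sizeof(buf)) < 0)
    		return 1;
    	memset(buf, 0xab, sizeof(buf));

    	/* write past EOF: the file silently grows to 8192 bytes */
    	if (pwrite(fd, buf, sizeof(buf), sizeof(buf)) != (ssize_t)sizeof(buf))
    		return 1;

    	/* after F_SEAL_GROW, the same kind of write must fail with EPERM */
    	if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW) < 0)
    		return 1;
    	if (pwrite(fd, buf, sizeof(buf), 2 * sizeof(buf)) >= 0)
    		return 1;
    	printf("growing pwrite rejected: %s\n", strerror(errno));
    	close(fd);
    	return 0;
    }
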
@@ -543,6 +597,8 @@ static void test_create(void) | |||
543 | char buf[2048]; | 597 | char buf[2048]; |
544 | int fd; | 598 | int fd; |
545 | 599 | ||
600 | printf("%s CREATE\n", MEMFD_STR); | ||
601 | |||
546 | /* test NULL name */ | 602 | /* test NULL name */ |
547 | mfd_fail_new(NULL, 0); | 603 | mfd_fail_new(NULL, 0); |
548 | 604 | ||
@@ -570,13 +626,18 @@ static void test_create(void) | |||
570 | fd = mfd_assert_new("", 0, MFD_CLOEXEC); | 626 | fd = mfd_assert_new("", 0, MFD_CLOEXEC); |
571 | close(fd); | 627 | close(fd); |
572 | 628 | ||
573 | /* verify MFD_ALLOW_SEALING is allowed */ | 629 | if (!hugetlbfs_test) { |
574 | fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); | 630 | /* verify MFD_ALLOW_SEALING is allowed */ |
575 | close(fd); | 631 | fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); |
576 | 632 | close(fd); | |
577 | /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ | 633 | |
578 | fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); | 634 | /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ |
579 | close(fd); | 635 | fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); |
636 | close(fd); | ||
637 | } else { | ||
638 | /* sealing is not supported on hugetlbfs */ | ||
639 | mfd_fail_new("", MFD_ALLOW_SEALING); | ||
640 | } | ||
580 | } | 641 | } |
581 | 642 | ||
582 | /* | 643 | /* |
@@ -587,8 +648,14 @@ static void test_basic(void) | |||
587 | { | 648 | { |
588 | int fd; | 649 | int fd; |
589 | 650 | ||
651 | /* hugetlbfs does not contain sealing support */ | ||
652 | if (hugetlbfs_test) | ||
653 | return; | ||
654 | |||
655 | printf("%s BASIC\n", MEMFD_STR); | ||
656 | |||
590 | fd = mfd_assert_new("kern_memfd_basic", | 657 | fd = mfd_assert_new("kern_memfd_basic", |
591 | MFD_DEF_SIZE, | 658 | mfd_def_size, |
592 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 659 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
593 | 660 | ||
594 | /* add basic seals */ | 661 | /* add basic seals */ |
@@ -619,7 +686,7 @@ static void test_basic(void) | |||
619 | 686 | ||
620 | /* verify sealing does not work without MFD_ALLOW_SEALING */ | 687 | /* verify sealing does not work without MFD_ALLOW_SEALING */ |
621 | fd = mfd_assert_new("kern_memfd_basic", | 688 | fd = mfd_assert_new("kern_memfd_basic", |
622 | MFD_DEF_SIZE, | 689 | mfd_def_size, |
623 | MFD_CLOEXEC); | 690 | MFD_CLOEXEC); |
624 | mfd_assert_has_seals(fd, F_SEAL_SEAL); | 691 | mfd_assert_has_seals(fd, F_SEAL_SEAL); |
625 | mfd_fail_add_seals(fd, F_SEAL_SHRINK | | 692 | mfd_fail_add_seals(fd, F_SEAL_SHRINK | |
@@ -630,6 +697,28 @@ static void test_basic(void) | |||
630 | } | 697 | } |
631 | 698 | ||
632 | /* | 699 | /* |
700 | * hugetlbfs doesn't support seals or write, so just verify grow and shrink | ||
701 | * on a hugetlbfs file created via memfd_create. | ||
702 | */ | ||
703 | static void test_hugetlbfs_grow_shrink(void) | ||
704 | { | ||
705 | int fd; | ||
706 | |||
707 | printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR); | ||
708 | |||
709 | fd = mfd_assert_new("kern_memfd_seal_write", | ||
710 | mfd_def_size, | ||
711 | MFD_CLOEXEC); | ||
712 | |||
713 | mfd_assert_read(fd); | ||
714 | mfd_assert_write(fd); | ||
715 | mfd_assert_shrink(fd); | ||
716 | mfd_assert_grow(fd); | ||
717 | |||
718 | close(fd); | ||
719 | } | ||
720 | |||
721 | /* | ||
633 | * Test SEAL_WRITE | 722 | * Test SEAL_WRITE |
634 | * Test whether SEAL_WRITE actually prevents modifications. | 723 | * Test whether SEAL_WRITE actually prevents modifications. |
635 | */ | 724 | */ |
@@ -637,8 +726,17 @@ static void test_seal_write(void) | |||
637 | { | 726 | { |
638 | int fd; | 727 | int fd; |
639 | 728 | ||
729 | /* | ||
730 | * hugetlbfs does not contain sealing or write support. Just test | ||
731 | * basic grow and shrink via test_hugetlbfs_grow_shrink. | ||
732 | */ | ||
733 | if (hugetlbfs_test) | ||
734 | return test_hugetlbfs_grow_shrink(); | ||
735 | |||
736 | printf("%s SEAL-WRITE\n", MEMFD_STR); | ||
737 | |||
640 | fd = mfd_assert_new("kern_memfd_seal_write", | 738 | fd = mfd_assert_new("kern_memfd_seal_write", |
641 | MFD_DEF_SIZE, | 739 | mfd_def_size, |
642 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 740 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
643 | mfd_assert_has_seals(fd, 0); | 741 | mfd_assert_has_seals(fd, 0); |
644 | mfd_assert_add_seals(fd, F_SEAL_WRITE); | 742 | mfd_assert_add_seals(fd, F_SEAL_WRITE); |
@@ -661,8 +759,14 @@ static void test_seal_shrink(void) | |||
661 | { | 759 | { |
662 | int fd; | 760 | int fd; |
663 | 761 | ||
762 | /* hugetlbfs does not contain sealing support */ | ||
763 | if (hugetlbfs_test) | ||
764 | return; | ||
765 | |||
766 | printf("%s SEAL-SHRINK\n", MEMFD_STR); | ||
767 | |||
664 | fd = mfd_assert_new("kern_memfd_seal_shrink", | 768 | fd = mfd_assert_new("kern_memfd_seal_shrink", |
665 | MFD_DEF_SIZE, | 769 | mfd_def_size, |
666 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 770 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
667 | mfd_assert_has_seals(fd, 0); | 771 | mfd_assert_has_seals(fd, 0); |
668 | mfd_assert_add_seals(fd, F_SEAL_SHRINK); | 772 | mfd_assert_add_seals(fd, F_SEAL_SHRINK); |
@@ -685,8 +789,14 @@ static void test_seal_grow(void) | |||
685 | { | 789 | { |
686 | int fd; | 790 | int fd; |
687 | 791 | ||
792 | /* hugetlbfs does not contain sealing support */ | ||
793 | if (hugetlbfs_test) | ||
794 | return; | ||
795 | |||
796 | printf("%s SEAL-GROW\n", MEMFD_STR); | ||
797 | |||
688 | fd = mfd_assert_new("kern_memfd_seal_grow", | 798 | fd = mfd_assert_new("kern_memfd_seal_grow", |
689 | MFD_DEF_SIZE, | 799 | mfd_def_size, |
690 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 800 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
691 | mfd_assert_has_seals(fd, 0); | 801 | mfd_assert_has_seals(fd, 0); |
692 | mfd_assert_add_seals(fd, F_SEAL_GROW); | 802 | mfd_assert_add_seals(fd, F_SEAL_GROW); |
@@ -709,8 +819,14 @@ static void test_seal_resize(void) | |||
709 | { | 819 | { |
710 | int fd; | 820 | int fd; |
711 | 821 | ||
822 | /* hugetlbfs does not contain sealing support */ | ||
823 | if (hugetlbfs_test) | ||
824 | return; | ||
825 | |||
826 | printf("%s SEAL-RESIZE\n", MEMFD_STR); | ||
827 | |||
712 | fd = mfd_assert_new("kern_memfd_seal_resize", | 828 | fd = mfd_assert_new("kern_memfd_seal_resize", |
713 | MFD_DEF_SIZE, | 829 | mfd_def_size, |
714 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 830 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
715 | mfd_assert_has_seals(fd, 0); | 831 | mfd_assert_has_seals(fd, 0); |
716 | mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); | 832 | mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); |
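
For reference, a minimal sketch (again outside the patch) of the resize behaviour that test_seal_resize() checks: once F_SEAL_SHRINK | F_SEAL_GROW are applied, ftruncate() is expected to fail with EPERM in both directions. Sizes and the memfd name are arbitrary, and a libc exposing memfd_create() is assumed.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
    	int fd = memfd_create("resize_demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

    	if (fd < 0 || ftruncate(fd, 8192) < 0)
    		return 1;
    	if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW) < 0)
    		return 1;

    	if (ftruncate(fd, 4096) == 0 || errno != EPERM)
    		return 1;		/* shrinking must be refused */
    	if (ftruncate(fd, 16384) == 0 || errno != EPERM)
    		return 1;		/* growing must be refused */

    	printf("resize correctly blocked by seals\n");
    	close(fd);
    	return 0;
    }
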
@@ -726,15 +842,52 @@ static void test_seal_resize(void) | |||
726 | } | 842 | } |
727 | 843 | ||
728 | /* | 844 | /* |
845 | * hugetlbfs does not support seals. Basic test to dup the memfd-created | ||
846 | * fd and perform some basic operations on it. | ||
847 | */ | ||
848 | static void hugetlbfs_dup(char *b_suffix) | ||
849 | { | ||
850 | int fd, fd2; | ||
851 | |||
852 | printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix); | ||
853 | |||
854 | fd = mfd_assert_new("kern_memfd_share_dup", | ||
855 | mfd_def_size, | ||
856 | MFD_CLOEXEC); | ||
857 | |||
858 | fd2 = mfd_assert_dup(fd); | ||
859 | |||
860 | mfd_assert_read(fd); | ||
861 | mfd_assert_write(fd); | ||
862 | |||
863 | mfd_assert_shrink(fd2); | ||
864 | mfd_assert_grow(fd2); | ||
865 | |||
866 | close(fd2); | ||
867 | close(fd); | ||
868 | } | ||
869 | |||
870 | /* | ||
729 | * Test sharing via dup() | 871 | * Test sharing via dup() |
730 | * Test that seals are shared between dupped FDs and they're all equal. | 872 | * Test that seals are shared between dupped FDs and they're all equal. |
731 | */ | 873 | */ |
732 | static void test_share_dup(void) | 874 | static void test_share_dup(char *banner, char *b_suffix) |
733 | { | 875 | { |
734 | int fd, fd2; | 876 | int fd, fd2; |
735 | 877 | ||
878 | /* | ||
879 | * hugetlbfs does not contain sealing support. Perform some | ||
880 | * basic testing on a dup'ed fd instead via hugetlbfs_dup. | ||
881 | */ | ||
882 | if (hugetlbfs_test) { | ||
883 | hugetlbfs_dup(b_suffix); | ||
884 | return; | ||
885 | } | ||
886 | |||
887 | printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); | ||
888 | |||
736 | fd = mfd_assert_new("kern_memfd_share_dup", | 889 | fd = mfd_assert_new("kern_memfd_share_dup", |
737 | MFD_DEF_SIZE, | 890 | mfd_def_size, |
738 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 891 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
739 | mfd_assert_has_seals(fd, 0); | 892 | mfd_assert_has_seals(fd, 0); |
740 | 893 | ||
@@ -768,13 +921,19 @@ static void test_share_dup(void) | |||
768 | * Test sealing with active mmap()s | 921 | * Test sealing with active mmap()s |
769 | * Modifying seals is only allowed if no other mmap() refs exist. | 922 | * Modifying seals is only allowed if no other mmap() refs exist. |
770 | */ | 923 | */ |
771 | static void test_share_mmap(void) | 924 | static void test_share_mmap(char *banner, char *b_suffix) |
772 | { | 925 | { |
773 | int fd; | 926 | int fd; |
774 | void *p; | 927 | void *p; |
775 | 928 | ||
929 | /* hugetlbfs does not contain sealing support */ | ||
930 | if (hugetlbfs_test) | ||
931 | return; | ||
932 | |||
933 | printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); | ||
934 | |||
776 | fd = mfd_assert_new("kern_memfd_share_mmap", | 935 | fd = mfd_assert_new("kern_memfd_share_mmap", |
777 | MFD_DEF_SIZE, | 936 | mfd_def_size, |
778 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 937 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
779 | mfd_assert_has_seals(fd, 0); | 938 | mfd_assert_has_seals(fd, 0); |
780 | 939 | ||
@@ -784,14 +943,40 @@ static void test_share_mmap(void) | |||
784 | mfd_assert_has_seals(fd, 0); | 943 | mfd_assert_has_seals(fd, 0); |
785 | mfd_assert_add_seals(fd, F_SEAL_SHRINK); | 944 | mfd_assert_add_seals(fd, F_SEAL_SHRINK); |
786 | mfd_assert_has_seals(fd, F_SEAL_SHRINK); | 945 | mfd_assert_has_seals(fd, F_SEAL_SHRINK); |
787 | munmap(p, MFD_DEF_SIZE); | 946 | munmap(p, mfd_def_size); |
788 | 947 | ||
789 | /* readable ref allows sealing */ | 948 | /* readable ref allows sealing */ |
790 | p = mfd_assert_mmap_private(fd); | 949 | p = mfd_assert_mmap_private(fd); |
791 | mfd_assert_add_seals(fd, F_SEAL_WRITE); | 950 | mfd_assert_add_seals(fd, F_SEAL_WRITE); |
792 | mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); | 951 | mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); |
793 | munmap(p, MFD_DEF_SIZE); | 952 | munmap(p, mfd_def_size); |
953 | |||
954 | close(fd); | ||
955 | } | ||
956 | |||
957 | /* | ||
958 | * Basic test to make sure we can open the hugetlbfs fd via /proc and | ||
959 | * perform some simple operations on it. | ||
960 | */ | ||
961 | static void hugetlbfs_proc_open(char *b_suffix) | ||
962 | { | ||
963 | int fd, fd2; | ||
964 | |||
965 | printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix); | ||
794 | 966 | ||
967 | fd = mfd_assert_new("kern_memfd_share_open", | ||
968 | mfd_def_size, | ||
969 | MFD_CLOEXEC); | ||
970 | |||
971 | fd2 = mfd_assert_open(fd, O_RDWR, 0); | ||
972 | |||
973 | mfd_assert_read(fd); | ||
974 | mfd_assert_write(fd); | ||
975 | |||
976 | mfd_assert_shrink(fd2); | ||
977 | mfd_assert_grow(fd2); | ||
978 | |||
979 | close(fd2); | ||
795 | close(fd); | 980 | close(fd); |
796 | } | 981 | } |
797 | 982 | ||
@@ -801,12 +986,23 @@ static void test_share_mmap(void) | |||
801 | * This is *not* like dup(), but like a real separate open(). Make sure the | 986 | * This is *not* like dup(), but like a real separate open(). Make sure the |
802 | * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR. | 987 | * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR. |
803 | */ | 988 | */ |
804 | static void test_share_open(void) | 989 | static void test_share_open(char *banner, char *b_suffix) |
805 | { | 990 | { |
806 | int fd, fd2; | 991 | int fd, fd2; |
807 | 992 | ||
993 | /* | ||
994 | * hugetlbfs does not contain sealing support. So test the basic | ||
995 | * functionality of using a /proc fd via hugetlbfs_proc_open. | ||
996 | */ | ||
997 | if (hugetlbfs_test) { | ||
998 | hugetlbfs_proc_open(b_suffix); | ||
999 | return; | ||
1000 | } | ||
1001 | |||
1002 | printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); | ||
1003 | |||
808 | fd = mfd_assert_new("kern_memfd_share_open", | 1004 | fd = mfd_assert_new("kern_memfd_share_open", |
809 | MFD_DEF_SIZE, | 1005 | mfd_def_size, |
810 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 1006 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
811 | mfd_assert_has_seals(fd, 0); | 1007 | mfd_assert_has_seals(fd, 0); |
812 | 1008 | ||
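
hugetlbfs_proc_open() relies on the /proc re-open trick: opening /proc/self/fd/<n> yields a brand new open file description on the same inode, unlike dup(). Below is a hedged sketch of that pattern on a plain memfd (mfd_assert_open() presumably does something similar; this helper program is illustrative only).

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
    	char path[64];
    	int fd, fd2;

    	fd = memfd_create("proc_open_demo", MFD_CLOEXEC);
    	if (fd < 0 || ftruncate(fd, 4096) < 0)
    		return 1;

    	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
    	fd2 = open(path, O_RDONLY);	/* new open file description, not a dup */
    	if (fd2 < 0)
    		return 1;

    	/* the two descriptors keep independent status flags and offsets */
    	printf("fd flags %#x, fd2 flags %#x\n",
    	       fcntl(fd, F_GETFL), fcntl(fd2, F_GETFL));
    	close(fd2);
    	close(fd);
    	return 0;
    }
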
@@ -841,13 +1037,19 @@ static void test_share_open(void) | |||
841 | * Test sharing via fork() | 1037 | * Test sharing via fork() |
842 | * Test whether seal-modifications work as expected with forked children. | 1038 | * Test whether seal-modifications work as expected with forked children. |
843 | */ | 1039 | */ |
844 | static void test_share_fork(void) | 1040 | static void test_share_fork(char *banner, char *b_suffix) |
845 | { | 1041 | { |
846 | int fd; | 1042 | int fd; |
847 | pid_t pid; | 1043 | pid_t pid; |
848 | 1044 | ||
1045 | /* hugetlbfs does not contain sealing support */ | ||
1046 | if (hugetlbfs_test) | ||
1047 | return; | ||
1048 | |||
1049 | printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); | ||
1050 | |||
849 | fd = mfd_assert_new("kern_memfd_share_fork", | 1051 | fd = mfd_assert_new("kern_memfd_share_fork", |
850 | MFD_DEF_SIZE, | 1052 | mfd_def_size, |
851 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | 1053 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
852 | mfd_assert_has_seals(fd, 0); | 1054 | mfd_assert_has_seals(fd, 0); |
853 | 1055 | ||
@@ -870,40 +1072,40 @@ int main(int argc, char **argv) | |||
870 | { | 1072 | { |
871 | pid_t pid; | 1073 | pid_t pid; |
872 | 1074 | ||
873 | printf("memfd: CREATE\n"); | 1075 | if (argc == 2) { |
1076 | if (!strcmp(argv[1], "hugetlbfs")) { | ||
1077 | unsigned long hpage_size = default_huge_page_size(); | ||
1078 | |||
1079 | if (!hpage_size) { | ||
1080 | printf("Unable to determine huge page size\n"); | ||
1081 | abort(); | ||
1082 | } | ||
1083 | |||
1084 | hugetlbfs_test = 1; | ||
1085 | mfd_def_size = hpage_size * 2; | ||
1086 | } | ||
1087 | } | ||
1088 | |||
874 | test_create(); | 1089 | test_create(); |
875 | printf("memfd: BASIC\n"); | ||
876 | test_basic(); | 1090 | test_basic(); |
877 | 1091 | ||
878 | printf("memfd: SEAL-WRITE\n"); | ||
879 | test_seal_write(); | 1092 | test_seal_write(); |
880 | printf("memfd: SEAL-SHRINK\n"); | ||
881 | test_seal_shrink(); | 1093 | test_seal_shrink(); |
882 | printf("memfd: SEAL-GROW\n"); | ||
883 | test_seal_grow(); | 1094 | test_seal_grow(); |
884 | printf("memfd: SEAL-RESIZE\n"); | ||
885 | test_seal_resize(); | 1095 | test_seal_resize(); |
886 | 1096 | ||
887 | printf("memfd: SHARE-DUP\n"); | 1097 | test_share_dup("SHARE-DUP", ""); |
888 | test_share_dup(); | 1098 | test_share_mmap("SHARE-MMAP", ""); |
889 | printf("memfd: SHARE-MMAP\n"); | 1099 | test_share_open("SHARE-OPEN", ""); |
890 | test_share_mmap(); | 1100 | test_share_fork("SHARE-FORK", ""); |
891 | printf("memfd: SHARE-OPEN\n"); | ||
892 | test_share_open(); | ||
893 | printf("memfd: SHARE-FORK\n"); | ||
894 | test_share_fork(); | ||
895 | 1101 | ||
896 | /* Run test-suite in a multi-threaded environment with a shared | 1102 | /* Run test-suite in a multi-threaded environment with a shared |
897 | * file-table. */ | 1103 | * file-table. */ |
898 | pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM); | 1104 | pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM); |
899 | printf("memfd: SHARE-DUP (shared file-table)\n"); | 1105 | test_share_dup("SHARE-DUP", SHARED_FT_STR); |
900 | test_share_dup(); | 1106 | test_share_mmap("SHARE-MMAP", SHARED_FT_STR); |
901 | printf("memfd: SHARE-MMAP (shared file-table)\n"); | 1107 | test_share_open("SHARE-OPEN", SHARED_FT_STR); |
902 | test_share_mmap(); | 1108 | test_share_fork("SHARE-FORK", SHARED_FT_STR); |
903 | printf("memfd: SHARE-OPEN (shared file-table)\n"); | ||
904 | test_share_open(); | ||
905 | printf("memfd: SHARE-FORK (shared file-table)\n"); | ||
906 | test_share_fork(); | ||
907 | join_idle_thread(pid); | 1109 | join_idle_thread(pid); |
908 | 1110 | ||
909 | printf("memfd: DONE\n"); | 1111 | printf("memfd: DONE\n"); |
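
The hugetlbfs mode of the test is selected by passing "hugetlbfs" on the command line, which sizes every operation to two default huge pages. The kernel-side support added elsewhere in this series is reached through an MFD_HUGETLB flag to memfd_create(); the following is a rough, hedged sketch of using it directly (the 2 MB page size, the fallback flag values and the syscall wrapper are assumptions, not part of the patch).

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef MFD_CLOEXEC
    #define MFD_CLOEXEC	0x0001U
    #endif
    #ifndef MFD_HUGETLB
    #define MFD_HUGETLB	0x0004U		/* assumed uapi value from this series */
    #endif

    #define HPAGE_SZ	(2UL * 1024 * 1024)	/* assumes 2 MB default huge pages */

    int main(void)
    {
    	int fd = syscall(__NR_memfd_create, "hugetlb_demo",
    			 MFD_CLOEXEC | MFD_HUGETLB);
    	char *p;

    	if (fd < 0) {
    		perror("memfd_create(MFD_HUGETLB)");
    		return 1;	/* old kernel, or no free huge pages */
    	}
    	if (ftruncate(fd, HPAGE_SZ) < 0)
    		return 1;

    	/* hugetlbfs has no write() path; touch the pages through mmap() */
    	p = mmap(NULL, HPAGE_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    	if (p == MAP_FAILED)
    		return 1;
    	memset(p, 0, HPAGE_SZ);
    	munmap(p, HPAGE_SZ);
    	close(fd);
    	return 0;
    }
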
diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh new file mode 100644 index 000000000000..daabb350697c --- /dev/null +++ b/tools/testing/selftests/memfd/run_tests.sh | |||
@@ -0,0 +1,69 @@ | |||
1 | #!/bin/bash | ||
2 | # please run as root | ||
3 | |||
4 | # | ||
5 | # Normal tests requiring no special resources | ||
6 | # | ||
7 | ./run_fuse_test.sh | ||
8 | ./memfd_test | ||
9 | |||
10 | # | ||
11 | # To test memfd_create with hugetlbfs, at least hpages_test huge pages | ||
12 | # must be free. Attempt to allocate enough pages for the test. | ||
13 | # | ||
14 | hpages_test=8 | ||
15 | |||
16 | # | ||
17 | # Get count of free huge pages from /proc/meminfo | ||
18 | # | ||
19 | while read name size unit; do | ||
20 | if [ "$name" = "HugePages_Free:" ]; then | ||
21 | freepgs=$size | ||
22 | fi | ||
23 | done < /proc/meminfo | ||
24 | |||
25 | # | ||
26 | # If not enough free huge pages for test, attempt to increase | ||
27 | # | ||
28 | if [ -n "$freepgs" ] && [ $freepgs -lt $hpages_test ]; then | ||
29 | nr_hugepgs=`cat /proc/sys/vm/nr_hugepages` | ||
30 | hpages_needed=`expr $hpages_test - $freepgs` | ||
31 | |||
32 | echo 3 > /proc/sys/vm/drop_caches | ||
33 | echo $(( $hpages_needed + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages | ||
34 | if [ $? -ne 0 ]; then | ||
35 | echo "Please run this test as root" | ||
36 | exit 1 | ||
37 | fi | ||
38 | while read name size unit; do | ||
39 | if [ "$name" = "HugePages_Free:" ]; then | ||
40 | freepgs=$size | ||
41 | fi | ||
42 | done < /proc/meminfo | ||
43 | fi | ||
44 | |||
45 | # | ||
46 | # If still not enough huge pages available, exit. But, give back any huge | ||
47 | # pages potentially allocated above. | ||
48 | # | ||
49 | if [ $freepgs -lt $hpages_test ]; then | ||
50 | # nr_hugepgs non-zero only if we attempted to increase | ||
51 | if [ -n "$nr_hugepgs" ]; then | ||
52 | echo $nr_hugepgs > /proc/sys/vm/nr_hugepages | ||
53 | fi | ||
54 | printf "Not enough huge pages available (%d < %d)\n" \ | ||
55 | $freepgs $hpages_test | ||
56 | exit 1 | ||
57 | fi | ||
58 | |||
59 | # | ||
60 | # Run the hugetlbfs test | ||
61 | # | ||
62 | ./memfd_test hugetlbfs | ||
63 | |||
64 | # | ||
65 | # Give back any huge pages allocated for the test | ||
66 | # | ||
67 | if [ -n "$nr_hugepgs" ]; then | ||
68 | echo $nr_hugepgs > /proc/sys/vm/nr_hugepages | ||
69 | fi | ||
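
Both the script above and the test's default_huge_page_size() helper depend on /proc/meminfo reporting huge page state in kB. A hedged sketch of what such a helper might look like in C follows (the function name is illustrative; only the "Hugepagesize:" line format is assumed).

    #include <stdio.h>

    /* Scan /proc/meminfo for "Hugepagesize:" (reported in kB); 0 on failure. */
    static unsigned long default_huge_page_size_sketch(void)
    {
    	unsigned long hps = 0;
    	char line[128];
    	FILE *f = fopen("/proc/meminfo", "r");

    	if (!f)
    		return 0;
    	while (fgets(line, sizeof(line), f)) {
    		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
    			hps <<= 10;	/* kB -> bytes */
    			break;
    		}
    	}
    	fclose(f);
    	return hps;
    }

    int main(void)
    {
    	printf("default huge page size: %lu bytes\n",
    	       default_huge_page_size_sketch());
    	return 0;
    }
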
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 1eae79ae5b4e..a2c53a3d223d 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c | |||
@@ -66,6 +66,8 @@ | |||
66 | #include <sys/wait.h> | 66 | #include <sys/wait.h> |
67 | #include <pthread.h> | 67 | #include <pthread.h> |
68 | #include <linux/userfaultfd.h> | 68 | #include <linux/userfaultfd.h> |
69 | #include <setjmp.h> | ||
70 | #include <stdbool.h> | ||
69 | 71 | ||
70 | #ifdef __NR_userfaultfd | 72 | #ifdef __NR_userfaultfd |
71 | 73 | ||
@@ -82,11 +84,17 @@ static int bounces; | |||
82 | #define TEST_SHMEM 3 | 84 | #define TEST_SHMEM 3 |
83 | static int test_type; | 85 | static int test_type; |
84 | 86 | ||
87 | /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ | ||
88 | #define ALARM_INTERVAL_SECS 10 | ||
89 | static volatile bool test_uffdio_copy_eexist = true; | ||
90 | static volatile bool test_uffdio_zeropage_eexist = true; | ||
91 | |||
92 | static bool map_shared; | ||
85 | static int huge_fd; | 93 | static int huge_fd; |
86 | static char *huge_fd_off0; | 94 | static char *huge_fd_off0; |
87 | static unsigned long long *count_verify; | 95 | static unsigned long long *count_verify; |
88 | static int uffd, uffd_flags, finished, *pipefd; | 96 | static int uffd, uffd_flags, finished, *pipefd; |
89 | static char *area_src, *area_dst; | 97 | static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; |
90 | static char *zeropage; | 98 | static char *zeropage; |
91 | pthread_attr_t attr; | 99 | pthread_attr_t attr; |
92 | 100 | ||
@@ -125,6 +133,9 @@ static void anon_allocate_area(void **alloc_area) | |||
125 | } | 133 | } |
126 | } | 134 | } |
127 | 135 | ||
136 | static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) | ||
137 | { | ||
138 | } | ||
128 | 139 | ||
129 | /* HugeTLB memory */ | 140 | /* HugeTLB memory */ |
130 | static int hugetlb_release_pages(char *rel_area) | 141 | static int hugetlb_release_pages(char *rel_area) |
@@ -145,17 +156,51 @@ static int hugetlb_release_pages(char *rel_area) | |||
145 | 156 | ||
146 | static void hugetlb_allocate_area(void **alloc_area) | 157 | static void hugetlb_allocate_area(void **alloc_area) |
147 | { | 158 | { |
159 | void *area_alias = NULL; | ||
160 | char **alloc_area_alias; | ||
148 | *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, | 161 | *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, |
149 | MAP_PRIVATE | MAP_HUGETLB, huge_fd, | 162 | (map_shared ? MAP_SHARED : MAP_PRIVATE) | |
150 | *alloc_area == area_src ? 0 : | 163 | MAP_HUGETLB, |
151 | nr_pages * page_size); | 164 | huge_fd, *alloc_area == area_src ? 0 : |
165 | nr_pages * page_size); | ||
152 | if (*alloc_area == MAP_FAILED) { | 166 | if (*alloc_area == MAP_FAILED) { |
153 | fprintf(stderr, "mmap of hugetlbfs file failed\n"); | 167 | fprintf(stderr, "mmap of hugetlbfs file failed\n"); |
154 | *alloc_area = NULL; | 168 | *alloc_area = NULL; |
155 | } | 169 | } |
156 | 170 | ||
157 | if (*alloc_area == area_src) | 171 | if (map_shared) { |
172 | area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, | ||
173 | MAP_SHARED | MAP_HUGETLB, | ||
174 | huge_fd, *alloc_area == area_src ? 0 : | ||
175 | nr_pages * page_size); | ||
176 | if (area_alias == MAP_FAILED) { | ||
177 | if (munmap(*alloc_area, nr_pages * page_size) < 0) | ||
178 | perror("hugetlb munmap"), exit(1); | ||
179 | *alloc_area = NULL; | ||
180 | return; | ||
181 | } | ||
182 | } | ||
183 | if (*alloc_area == area_src) { | ||
158 | huge_fd_off0 = *alloc_area; | 184 | huge_fd_off0 = *alloc_area; |
185 | alloc_area_alias = &area_src_alias; | ||
186 | } else { | ||
187 | alloc_area_alias = &area_dst_alias; | ||
188 | } | ||
189 | if (area_alias) | ||
190 | *alloc_area_alias = area_alias; | ||
191 | } | ||
192 | |||
193 | static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) | ||
194 | { | ||
195 | if (!map_shared) | ||
196 | return; | ||
197 | /* | ||
198 | * We can't zap just the pagetable with hugetlbfs because | ||
199 | * MADV_DONTNEED won't work. So exercise -EEXIST on an alias | ||
200 | * mapping where the pagetables are not established initially, | ||
201 | * this way we'll exercise the -EEXIST at the fs level. | ||
202 | */ | ||
203 | *start = (unsigned long) area_dst_alias + offset; | ||
159 | } | 204 | } |
160 | 205 | ||
161 | /* Shared memory */ | 206 | /* Shared memory */ |
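
The aliasing set up by hugetlb_allocate_area() works because two MAP_SHARED mappings of the same file range reference the same pages, so the alias can reach memory whose page tables were never populated through the primary mapping. A small side illustration of that property using an ordinary temporary file (path and sizes are arbitrary, not taken from the patch):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
    	long psz = sysconf(_SC_PAGESIZE);
    	int fd = open("/tmp/alias_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
    	char *a, *b;

    	if (fd < 0 || ftruncate(fd, psz) < 0)
    		return 1;
    	a = mmap(NULL, psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    	b = mmap(NULL, psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    	if (a == MAP_FAILED || b == MAP_FAILED)
    		return 1;

    	strcpy(a, "written through the first alias");
    	printf("read through the second alias: %s\n", b);	/* same page */

    	munmap(a, psz);
    	munmap(b, psz);
    	close(fd);
    	unlink("/tmp/alias_demo");
    	return 0;
    }
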
@@ -185,6 +230,7 @@ struct uffd_test_ops { | |||
185 | unsigned long expected_ioctls; | 230 | unsigned long expected_ioctls; |
186 | void (*allocate_area)(void **alloc_area); | 231 | void (*allocate_area)(void **alloc_area); |
187 | int (*release_pages)(char *rel_area); | 232 | int (*release_pages)(char *rel_area); |
233 | void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); | ||
188 | }; | 234 | }; |
189 | 235 | ||
190 | #define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ | 236 | #define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ |
@@ -195,18 +241,21 @@ static struct uffd_test_ops anon_uffd_test_ops = { | |||
195 | .expected_ioctls = ANON_EXPECTED_IOCTLS, | 241 | .expected_ioctls = ANON_EXPECTED_IOCTLS, |
196 | .allocate_area = anon_allocate_area, | 242 | .allocate_area = anon_allocate_area, |
197 | .release_pages = anon_release_pages, | 243 | .release_pages = anon_release_pages, |
244 | .alias_mapping = noop_alias_mapping, | ||
198 | }; | 245 | }; |
199 | 246 | ||
200 | static struct uffd_test_ops shmem_uffd_test_ops = { | 247 | static struct uffd_test_ops shmem_uffd_test_ops = { |
201 | .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, | 248 | .expected_ioctls = ANON_EXPECTED_IOCTLS, |
202 | .allocate_area = shmem_allocate_area, | 249 | .allocate_area = shmem_allocate_area, |
203 | .release_pages = shmem_release_pages, | 250 | .release_pages = shmem_release_pages, |
251 | .alias_mapping = noop_alias_mapping, | ||
204 | }; | 252 | }; |
205 | 253 | ||
206 | static struct uffd_test_ops hugetlb_uffd_test_ops = { | 254 | static struct uffd_test_ops hugetlb_uffd_test_ops = { |
207 | .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, | 255 | .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, |
208 | .allocate_area = hugetlb_allocate_area, | 256 | .allocate_area = hugetlb_allocate_area, |
209 | .release_pages = hugetlb_release_pages, | 257 | .release_pages = hugetlb_release_pages, |
258 | .alias_mapping = hugetlb_alias_mapping, | ||
210 | }; | 259 | }; |
211 | 260 | ||
212 | static struct uffd_test_ops *uffd_test_ops; | 261 | static struct uffd_test_ops *uffd_test_ops; |
@@ -331,6 +380,23 @@ static void *locking_thread(void *arg) | |||
331 | return NULL; | 380 | return NULL; |
332 | } | 381 | } |
333 | 382 | ||
383 | static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, | ||
384 | unsigned long offset) | ||
385 | { | ||
386 | uffd_test_ops->alias_mapping(&uffdio_copy->dst, | ||
387 | uffdio_copy->len, | ||
388 | offset); | ||
389 | if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { | ||
390 | /* real retval in ufdio_copy.copy */ | ||
391 | if (uffdio_copy->copy != -EEXIST) | ||
392 | fprintf(stderr, "UFFDIO_COPY retry error %Ld\n", | ||
393 | uffdio_copy->copy), exit(1); | ||
394 | } else { | ||
395 | fprintf(stderr, "UFFDIO_COPY retry unexpected %Ld\n", | ||
396 | uffdio_copy->copy), exit(1); | ||
397 | } | ||
398 | } | ||
399 | |||
334 | static int copy_page(int ufd, unsigned long offset) | 400 | static int copy_page(int ufd, unsigned long offset) |
335 | { | 401 | { |
336 | struct uffdio_copy uffdio_copy; | 402 | struct uffdio_copy uffdio_copy; |
@@ -351,8 +417,13 @@ static int copy_page(int ufd, unsigned long offset) | |||
351 | } else if (uffdio_copy.copy != page_size) { | 417 | } else if (uffdio_copy.copy != page_size) { |
352 | fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", | 418 | fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", |
353 | uffdio_copy.copy), exit(1); | 419 | uffdio_copy.copy), exit(1); |
354 | } else | 420 | } else { |
421 | if (test_uffdio_copy_eexist) { | ||
422 | test_uffdio_copy_eexist = false; | ||
423 | retry_copy_page(ufd, &uffdio_copy, offset); | ||
424 | } | ||
355 | return 1; | 425 | return 1; |
426 | } | ||
356 | return 0; | 427 | return 0; |
357 | } | 428 | } |
358 | 429 | ||
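
retry_copy_page() deliberately re-issues UFFDIO_COPY on an already-resolved range and expects the kernel to report -EEXIST in uffdio_copy.copy. A standalone sketch of that convention is shown below; it is a fragment meant to live inside a larger userfaultfd program (uffd is assumed to be an already-registered userfaultfd and the addresses page aligned; the helper name is made up).

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    /* Returns 1 if this call populated the page, 0 if it was already there. */
    static int resolve_fault(int uffd, unsigned long dst, unsigned long src,
    			 unsigned long len)
    {
    	struct uffdio_copy copy;

    	memset(&copy, 0, sizeof(copy));
    	copy.dst = dst;
    	copy.src = src;
    	copy.len = len;
    	copy.mode = 0;

    	if (ioctl(uffd, UFFDIO_COPY, &copy) == 0)
    		return 1;		/* page installed by this call */
    	if (copy.copy == -EEXIST)
    		return 0;		/* already mapped: benign race */
    	fprintf(stderr, "UFFDIO_COPY failed: %lld\n", (long long)copy.copy);
    	return -1;
    }
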
@@ -408,6 +479,7 @@ static void *uffd_poll_thread(void *arg) | |||
408 | userfaults++; | 479 | userfaults++; |
409 | break; | 480 | break; |
410 | case UFFD_EVENT_FORK: | 481 | case UFFD_EVENT_FORK: |
482 | close(uffd); | ||
411 | uffd = msg.arg.fork.ufd; | 483 | uffd = msg.arg.fork.ufd; |
412 | pollfd[0].fd = uffd; | 484 | pollfd[0].fd = uffd; |
413 | break; | 485 | break; |
@@ -572,6 +644,17 @@ static int userfaultfd_open(int features) | |||
572 | return 0; | 644 | return 0; |
573 | } | 645 | } |
574 | 646 | ||
647 | sigjmp_buf jbuf, *sigbuf; | ||
648 | |||
649 | static void sighndl(int sig, siginfo_t *siginfo, void *ptr) | ||
650 | { | ||
651 | if (sig == SIGBUS) { | ||
652 | if (sigbuf) | ||
653 | siglongjmp(*sigbuf, 1); | ||
654 | abort(); | ||
655 | } | ||
656 | } | ||
657 | |||
575 | /* | 658 | /* |
576 | * For non-cooperative userfaultfd test we fork() a process that will | 659 | * For non-cooperative userfaultfd test we fork() a process that will |
577 | * generate pagefaults, will mremap the area monitored by the | 660 | * generate pagefaults, will mremap the area monitored by the |
@@ -585,19 +668,59 @@ static int userfaultfd_open(int features) | |||
585 | * The release of the pages currently generates event for shmem and | 668 | * The release of the pages currently generates event for shmem and |
586 | * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked | 669 | * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked |
587 | * for hugetlb. | 670 | * for hugetlb. |
671 | * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: we register | ||
672 | * the monitored area, generate pagefaults and test that the signal is | ||
673 | * delivered. Use UFFDIO_COPY to allocate the missing page and retry. For | ||
674 | * signal_test = 2 (robustness use case) we release the monitored area, fork | ||
675 | * a process that will generate pagefaults and verify the signal is generated. | ||
676 | * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal | ||
677 | * feature. Using monitor thread, verify no userfault events are generated. | ||
588 | */ | 678 | */ |
589 | static int faulting_process(void) | 679 | static int faulting_process(int signal_test) |
590 | { | 680 | { |
591 | unsigned long nr; | 681 | unsigned long nr; |
592 | unsigned long long count; | 682 | unsigned long long count; |
593 | unsigned long split_nr_pages; | 683 | unsigned long split_nr_pages; |
684 | unsigned long lastnr; | ||
685 | struct sigaction act; | ||
686 | unsigned long signalled = 0; | ||
594 | 687 | ||
595 | if (test_type != TEST_HUGETLB) | 688 | if (test_type != TEST_HUGETLB) |
596 | split_nr_pages = (nr_pages + 1) / 2; | 689 | split_nr_pages = (nr_pages + 1) / 2; |
597 | else | 690 | else |
598 | split_nr_pages = nr_pages; | 691 | split_nr_pages = nr_pages; |
599 | 692 | ||
693 | if (signal_test) { | ||
694 | sigbuf = &jbuf; | ||
695 | memset(&act, 0, sizeof(act)); | ||
696 | act.sa_sigaction = sighndl; | ||
697 | act.sa_flags = SA_SIGINFO; | ||
698 | if (sigaction(SIGBUS, &act, 0)) { | ||
699 | perror("sigaction"); | ||
700 | return 1; | ||
701 | } | ||
702 | lastnr = (unsigned long)-1; | ||
703 | } | ||
704 | |||
600 | for (nr = 0; nr < split_nr_pages; nr++) { | 705 | for (nr = 0; nr < split_nr_pages; nr++) { |
706 | if (signal_test) { | ||
707 | if (sigsetjmp(*sigbuf, 1) != 0) { | ||
708 | if (nr == lastnr) { | ||
709 | fprintf(stderr, "Signal repeated\n"); | ||
710 | return 1; | ||
711 | } | ||
712 | |||
713 | lastnr = nr; | ||
714 | if (signal_test == 1) { | ||
715 | if (copy_page(uffd, nr * page_size)) | ||
716 | signalled++; | ||
717 | } else { | ||
718 | signalled++; | ||
719 | continue; | ||
720 | } | ||
721 | } | ||
722 | } | ||
723 | |||
601 | count = *area_count(area_dst, nr); | 724 | count = *area_count(area_dst, nr); |
602 | if (count != count_verify[nr]) { | 725 | if (count != count_verify[nr]) { |
603 | fprintf(stderr, | 726 | fprintf(stderr, |
@@ -607,6 +730,9 @@ static int faulting_process(void) | |||
607 | } | 730 | } |
608 | } | 731 | } |
609 | 732 | ||
733 | if (signal_test) | ||
734 | return signalled != split_nr_pages; | ||
735 | |||
610 | if (test_type == TEST_HUGETLB) | 736 | if (test_type == TEST_HUGETLB) |
611 | return 0; | 737 | return 0; |
612 | 738 | ||
@@ -636,6 +762,23 @@ static int faulting_process(void) | |||
636 | return 0; | 762 | return 0; |
637 | } | 763 | } |
638 | 764 | ||
765 | static void retry_uffdio_zeropage(int ufd, | ||
766 | struct uffdio_zeropage *uffdio_zeropage, | ||
767 | unsigned long offset) | ||
768 | { | ||
769 | uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, | ||
770 | uffdio_zeropage->range.len, | ||
771 | offset); | ||
772 | if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { | ||
773 | if (uffdio_zeropage->zeropage != -EEXIST) | ||
774 | fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n", | ||
775 | uffdio_zeropage->zeropage), exit(1); | ||
776 | } else { | ||
777 | fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n", | ||
778 | uffdio_zeropage->zeropage), exit(1); | ||
779 | } | ||
780 | } | ||
781 | |||
639 | static int uffdio_zeropage(int ufd, unsigned long offset) | 782 | static int uffdio_zeropage(int ufd, unsigned long offset) |
640 | { | 783 | { |
641 | struct uffdio_zeropage uffdio_zeropage; | 784 | struct uffdio_zeropage uffdio_zeropage; |
@@ -670,8 +813,14 @@ static int uffdio_zeropage(int ufd, unsigned long offset) | |||
670 | if (uffdio_zeropage.zeropage != page_size) { | 813 | if (uffdio_zeropage.zeropage != page_size) { |
671 | fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", | 814 | fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", |
672 | uffdio_zeropage.zeropage), exit(1); | 815 | uffdio_zeropage.zeropage), exit(1); |
673 | } else | 816 | } else { |
817 | if (test_uffdio_zeropage_eexist) { | ||
818 | test_uffdio_zeropage_eexist = false; | ||
819 | retry_uffdio_zeropage(ufd, &uffdio_zeropage, | ||
820 | offset); | ||
821 | } | ||
674 | return 1; | 822 | return 1; |
823 | } | ||
675 | } else { | 824 | } else { |
676 | fprintf(stderr, | 825 | fprintf(stderr, |
677 | "UFFDIO_ZEROPAGE succeeded %Ld\n", | 826 | "UFFDIO_ZEROPAGE succeeded %Ld\n", |
@@ -761,7 +910,7 @@ static int userfaultfd_events_test(void) | |||
761 | perror("fork"), exit(1); | 910 | perror("fork"), exit(1); |
762 | 911 | ||
763 | if (!pid) | 912 | if (!pid) |
764 | return faulting_process(); | 913 | return faulting_process(0); |
765 | 914 | ||
766 | waitpid(pid, &err, 0); | 915 | waitpid(pid, &err, 0); |
767 | if (err) | 916 | if (err) |
@@ -778,6 +927,72 @@ static int userfaultfd_events_test(void) | |||
778 | return userfaults != nr_pages; | 927 | return userfaults != nr_pages; |
779 | } | 928 | } |
780 | 929 | ||
930 | static int userfaultfd_sig_test(void) | ||
931 | { | ||
932 | struct uffdio_register uffdio_register; | ||
933 | unsigned long expected_ioctls; | ||
934 | unsigned long userfaults; | ||
935 | pthread_t uffd_mon; | ||
936 | int err, features; | ||
937 | pid_t pid; | ||
938 | char c; | ||
939 | |||
940 | printf("testing signal delivery: "); | ||
941 | fflush(stdout); | ||
942 | |||
943 | if (uffd_test_ops->release_pages(area_dst)) | ||
944 | return 1; | ||
945 | |||
946 | features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; | ||
947 | if (userfaultfd_open(features) < 0) | ||
948 | return 1; | ||
949 | fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); | ||
950 | |||
951 | uffdio_register.range.start = (unsigned long) area_dst; | ||
952 | uffdio_register.range.len = nr_pages * page_size; | ||
953 | uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; | ||
954 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) | ||
955 | fprintf(stderr, "register failure\n"), exit(1); | ||
956 | |||
957 | expected_ioctls = uffd_test_ops->expected_ioctls; | ||
958 | if ((uffdio_register.ioctls & expected_ioctls) != | ||
959 | expected_ioctls) | ||
960 | fprintf(stderr, | ||
961 | "unexpected missing ioctl for anon memory\n"), | ||
962 | exit(1); | ||
963 | |||
964 | if (faulting_process(1)) | ||
965 | fprintf(stderr, "faulting process failed\n"), exit(1); | ||
966 | |||
967 | if (uffd_test_ops->release_pages(area_dst)) | ||
968 | return 1; | ||
969 | |||
970 | if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL)) | ||
971 | perror("uffd_poll_thread create"), exit(1); | ||
972 | |||
973 | pid = fork(); | ||
974 | if (pid < 0) | ||
975 | perror("fork"), exit(1); | ||
976 | |||
977 | if (!pid) | ||
978 | exit(faulting_process(2)); | ||
979 | |||
980 | waitpid(pid, &err, 0); | ||
981 | if (err) | ||
982 | fprintf(stderr, "faulting process failed\n"), exit(1); | ||
983 | |||
984 | if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) | ||
985 | perror("pipe write"), exit(1); | ||
986 | if (pthread_join(uffd_mon, (void **)&userfaults)) | ||
987 | return 1; | ||
988 | |||
989 | printf("done.\n"); | ||
990 | if (userfaults) | ||
991 | fprintf(stderr, "Signal test failed, userfaults: %ld\n", | ||
992 | userfaults); | ||
993 | close(uffd); | ||
994 | return userfaults != 0; | ||
995 | } | ||
781 | static int userfaultfd_stress(void) | 996 | static int userfaultfd_stress(void) |
782 | { | 997 | { |
783 | void *area; | 998 | void *area; |
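
userfaultfd_sig_test() above exercises UFFD_FEATURE_SIGBUS, under which a fault on a missing page in a registered range raises SIGBUS in the faulting thread instead of queueing a userfault event. A condensed, hedged sketch of that path follows (error handling trimmed; a kernel and uapi headers new enough to provide the feature are assumed).

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static sigjmp_buf jbuf;

    static void on_sigbus(int sig)
    {
    	(void)sig;
    	siglongjmp(jbuf, 1);
    }

    int main(void)
    {
    	long psz = sysconf(_SC_PAGESIZE);
    	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    	struct uffdio_api api = { .api = UFFD_API, .features = UFFD_FEATURE_SIGBUS };
    	struct uffdio_register reg;
    	char *area;

    	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
    		return 1;		/* kernel without the SIGBUS feature */

    	area = mmap(NULL, psz, PROT_READ | PROT_WRITE,
    		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    	if (area == MAP_FAILED)
    		return 1;
    	reg.range.start = (unsigned long)area;
    	reg.range.len = psz;
    	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
    	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
    		return 1;

    	signal(SIGBUS, on_sigbus);
    	if (sigsetjmp(jbuf, 1) == 0) {
    		area[0] = 1;	/* missing page: expect SIGBUS, not an event */
    		printf("no SIGBUS was delivered\n");
    		return 1;
    	}
    	printf("SIGBUS delivered as expected\n");
    	return 0;
    }
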
@@ -879,6 +1094,15 @@ static int userfaultfd_stress(void) | |||
879 | return 1; | 1094 | return 1; |
880 | } | 1095 | } |
881 | 1096 | ||
1097 | if (area_dst_alias) { | ||
1098 | uffdio_register.range.start = (unsigned long) | ||
1099 | area_dst_alias; | ||
1100 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { | ||
1101 | fprintf(stderr, "register failure alias\n"); | ||
1102 | return 1; | ||
1103 | } | ||
1104 | } | ||
1105 | |||
882 | /* | 1106 | /* |
883 | * The madvise done previously isn't enough: some | 1107 | * The madvise done previously isn't enough: some |
884 | * uffd_thread could have read userfaults (one of | 1108 | * uffd_thread could have read userfaults (one of |
@@ -912,9 +1136,17 @@ static int userfaultfd_stress(void) | |||
912 | 1136 | ||
913 | /* unregister */ | 1137 | /* unregister */ |
914 | if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { | 1138 | if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { |
915 | fprintf(stderr, "register failure\n"); | 1139 | fprintf(stderr, "unregister failure\n"); |
916 | return 1; | 1140 | return 1; |
917 | } | 1141 | } |
1142 | if (area_dst_alias) { | ||
1143 | uffdio_register.range.start = (unsigned long) area_dst; | ||
1144 | if (ioctl(uffd, UFFDIO_UNREGISTER, | ||
1145 | &uffdio_register.range)) { | ||
1146 | fprintf(stderr, "unregister failure alias\n"); | ||
1147 | return 1; | ||
1148 | } | ||
1149 | } | ||
918 | 1150 | ||
919 | /* verification */ | 1151 | /* verification */ |
920 | if (bounces & BOUNCE_VERIFY) { | 1152 | if (bounces & BOUNCE_VERIFY) { |
@@ -936,6 +1168,10 @@ static int userfaultfd_stress(void) | |||
936 | area_src = area_dst; | 1168 | area_src = area_dst; |
937 | area_dst = tmp_area; | 1169 | area_dst = tmp_area; |
938 | 1170 | ||
1171 | tmp_area = area_src_alias; | ||
1172 | area_src_alias = area_dst_alias; | ||
1173 | area_dst_alias = tmp_area; | ||
1174 | |||
939 | printf("userfaults:"); | 1175 | printf("userfaults:"); |
940 | for (cpu = 0; cpu < nr_cpus; cpu++) | 1176 | for (cpu = 0; cpu < nr_cpus; cpu++) |
941 | printf(" %lu", userfaults[cpu]); | 1177 | printf(" %lu", userfaults[cpu]); |
@@ -946,7 +1182,8 @@ static int userfaultfd_stress(void) | |||
946 | return err; | 1182 | return err; |
947 | 1183 | ||
948 | close(uffd); | 1184 | close(uffd); |
949 | return userfaultfd_zeropage_test() || userfaultfd_events_test(); | 1185 | return userfaultfd_zeropage_test() || userfaultfd_sig_test() |
1186 | || userfaultfd_events_test(); | ||
950 | } | 1187 | } |
951 | 1188 | ||
952 | /* | 1189 | /* |
@@ -981,7 +1218,12 @@ static void set_test_type(const char *type) | |||
981 | } else if (!strcmp(type, "hugetlb")) { | 1218 | } else if (!strcmp(type, "hugetlb")) { |
982 | test_type = TEST_HUGETLB; | 1219 | test_type = TEST_HUGETLB; |
983 | uffd_test_ops = &hugetlb_uffd_test_ops; | 1220 | uffd_test_ops = &hugetlb_uffd_test_ops; |
1221 | } else if (!strcmp(type, "hugetlb_shared")) { | ||
1222 | map_shared = true; | ||
1223 | test_type = TEST_HUGETLB; | ||
1224 | uffd_test_ops = &hugetlb_uffd_test_ops; | ||
984 | } else if (!strcmp(type, "shmem")) { | 1225 | } else if (!strcmp(type, "shmem")) { |
1226 | map_shared = true; | ||
985 | test_type = TEST_SHMEM; | 1227 | test_type = TEST_SHMEM; |
986 | uffd_test_ops = &shmem_uffd_test_ops; | 1228 | uffd_test_ops = &shmem_uffd_test_ops; |
987 | } else { | 1229 | } else { |
@@ -1001,12 +1243,25 @@ static void set_test_type(const char *type) | |||
1001 | fprintf(stderr, "Impossible to run this test\n"), exit(2); | 1243 | fprintf(stderr, "Impossible to run this test\n"), exit(2); |
1002 | } | 1244 | } |
1003 | 1245 | ||
1246 | static void sigalrm(int sig) | ||
1247 | { | ||
1248 | if (sig != SIGALRM) | ||
1249 | abort(); | ||
1250 | test_uffdio_copy_eexist = true; | ||
1251 | test_uffdio_zeropage_eexist = true; | ||
1252 | alarm(ALARM_INTERVAL_SECS); | ||
1253 | } | ||
1254 | |||
1004 | int main(int argc, char **argv) | 1255 | int main(int argc, char **argv) |
1005 | { | 1256 | { |
1006 | if (argc < 4) | 1257 | if (argc < 4) |
1007 | fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), | 1258 | fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), |
1008 | exit(1); | 1259 | exit(1); |
1009 | 1260 | ||
1261 | if (signal(SIGALRM, sigalrm) == SIG_ERR) | ||
1262 | fprintf(stderr, "failed to arm SIGALRM"), exit(1); | ||
1263 | alarm(ALARM_INTERVAL_SECS); | ||
1264 | |||
1010 | set_test_type(argv[1]); | 1265 | set_test_type(argv[1]); |
1011 | 1266 | ||
1012 | nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); | 1267 | nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); |